In [3]:
# libraries for EDA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#To display the plots
%matplotlib inline

# scikit learn library for machine learning algorithms, data preprocessing, and evaluation
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder
#from sklearn.model_selection import train_test_split, GridSearchCV, HalvingGridSearchCV, RandomizedSearchCV, cross_validate, cross_val_score
from sklearn.experimental import enable_halving_search_cv
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, confusion_matrix, log_loss, recall_score, accuracy_score, precision_score, f1_score, roc_auc_score
from sklearn.metrics import make_scorer, average_precision_score, PrecisionRecallDisplay, precision_recall_curve, roc_curve
from sklearn.metrics import ConfusionMatrixDisplay

from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PowerTransformer
from sklearn.pipeline import Pipeline

import scikitplot as skplt

# yellowbrick library for visualizing the model performance
from yellowbrick.classifier import ConfusionMatrix, PrecisionRecallCurve
from yellowbrick.classifier import PrecisionRecallCurve
from sklearn.pipeline import Pipeline

# to get rid of the warnings
import warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

plt.rcParams["figure.figsize"] = (10,6)
sns.set_style("whitegrid", {'axes.grid' : True})

# To display maximum columns
pd.set_option('display.max_columns', 50)

# To display maximum rows
pd.set_option('display.max_rows', 50)

# To set float format
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [4]:
def eval_metric(model, X, y, test):   
    y_pred = model.predict(test)
    y_train_pred = model.predict(X)
    
    print("Test_Set")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print()
    print("Train_Set")
    print(confusion_matrix(y_train, y_train_pred))
    print(classification_report(y_train, y_train_pred))

In [5]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test_x.csv')

df = train.copy()

In [None]:
from sklearn.ensemble import RandomForestClassifier

X = train.drop(columns=["Obek İsmi"])
y = train["Obek İsmi"]
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, y)
clf.predict(test)


In [6]:
from sklearn.model_selection import train_test_split

X = df.drop("Öbek İsmi", axis=1)
y = df["Öbek İsmi"]

X_test=test
y_test=test

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [11]:
cat = X.select_dtypes("object").columns
cat 

Index(['Cinsiyet', 'Yaş Grubu', 'Medeni Durum', 'Eğitim Düzeyi',
       'İstihdam Durumu', 'Yaşadığı Şehir', 'En Çok İlgilendiği Ürün Grubu',
       'Eğitime Devam Etme Durumu'],
      dtype='object')

In [12]:
ord_enc = OrdinalEncoder(handle_unknown='use_encoded_value', 
                         unknown_value=-1)

column_trans = make_column_transformer((ord_enc, cat), 
                                       remainder='passthrough')

In [13]:
import lightgbm as lgb

operations_lightgbm = [("OrdinalEncoder", column_trans), 
                       ("lightgbm_model", lgb.LGBMClassifier(random_state=42))]

lightgbm_model = Pipeline(steps=operations_lightgbm)

lightgbm_model.fit(X, y)

In [14]:
lightgbm_model.predict(test)

array(['obek_3', 'obek_3', 'obek_2', ..., 'obek_7', 'obek_5', 'obek_6'],
      dtype=object)

In [9]:
eval_metric(lightgbm_model, X, y, X_test, y_test)

Test_Set
[[131   1   3   0   0   0   1   2]
 [  0 104   2   0   0   0   1   2]
 [  0   0 135   0   1   3   0   0]
 [  1   0   2 137   0   1   2   1]
 [  1   1   1   1 134   2   0   1]
 [  1   1   0   0   0 133   2   1]
 [  1   1   1   0   0   1 135   0]
 [  0   0   2   2   0   0   2 138]]
              precision    recall  f1-score   support

      obek_1       0.97      0.95      0.96       138
      obek_2       0.96      0.95      0.96       109
      obek_3       0.92      0.97      0.95       139
      obek_4       0.98      0.95      0.96       144
      obek_5       0.99      0.95      0.97       141
      obek_6       0.95      0.96      0.96       138
      obek_7       0.94      0.97      0.96       139
      obek_8       0.95      0.96      0.96       144

    accuracy                           0.96      1092
   macro avg       0.96      0.96      0.96      1092
weighted avg       0.96      0.96      0.96      1092


Train_Set
[[554   0   0   0   0   0   0   0]
 [  0 438   0

In [10]:
from sklearn.metrics import accuracy_score

y_pred = lightgbm_model.predict(X_test)
accuracy=accuracy_score(y_pred, y_test)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

LightGBM Model accuracy score: 0.9588


In [32]:
param_grid = {
    'lightgbm_model__boosting_type': ['gbdt', 'dart', 'goss'],  # Gradient boosting türü
    'lightgbm_model__num_leaves': [30, 50, 70],  # Karar düğümlerinin sayısı
    'lightgbm_model__learning_rate': [0.01, 0.1, 0.2],  # Öğrenme oranı
    'lightgbm_model__n_estimators': [50, 100, 150],  # Tahmin edici (estimator) sayısı
    'lightgbm_model__subsample_for_bin': [20000, 30000, 40000],  # Bölmeler için örnek sayısı
    'lightgbm_model__min_child_samples': [20, 30, 50],  # Her yaprakta minimum örnek sayısı
    'lightgbm_model__reg_alpha': [0, 0.1, 0.5],  # L1 düzenlemesi (alpha)
    'lightgbm_model__reg_lambda': [0, 0.1, 0.5],  # L2 düzenlemesi (lambda)
    'lightgbm_model__colsample_bytree': [0.6, 0.8, 1.0],  # Ağaç oluştururken sütun örnekleme oranı
    'lightgbm_model__is_unbalance': [True, False],  # Dengesiz veri kümesi için kullanılır mı?
    'lightgbm_model__scale_pos_weight': [1, 2, 4]  # Pozitif sınıf ağırlığı
}

operations_lightgbm = [("OrdinalEncoder", column_trans), 
                       ("lightgbm_model", lgb.LGBMClassifier(random_state=42))]

model = Pipeline(steps=operations_lightgbm)

lgbm_grid_model = HalvingGridSearchCV(model,
                               param_grid,
                               scoring="accuracy", 
                               cv=5,
                               # n_iter=50,
                               factor=3,
                               n_jobs=-1,
                               verbose=2,
                               return_train_score=True)

In [33]:
%%time

lgbm_grid_model.fit(X_train, y_train)

n_iterations: 4
n_required_iterations: 11
n_possible_iterations: 4
min_resources_: 80
max_resources_: 4368
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 118098
n_resources: 80
Fitting 5 folds for each of 118098 candidates, totalling 590490 fits
----------
iter: 1
n_candidates: 39366
n_resources: 240
Fitting 5 folds for each of 39366 candidates, totalling 196830 fits
----------
iter: 2
n_candidates: 13122
n_resources: 720
Fitting 5 folds for each of 13122 candidates, totalling 65610 fits
----------
iter: 3
n_candidates: 4374
n_resources: 2160
Fitting 5 folds for each of 4374 candidates, totalling 21870 fits
Wall time: 2h 11min 51s


In [34]:
eval_metric(lgbm_grid_model, X_train, y_train, X_test, y_test)

Test_Set
[[131   1   3   0   0   0   1   2]
 [  0 104   2   0   0   0   1   2]
 [  0   0 135   0   1   3   0   0]
 [  1   0   2 137   0   1   2   1]
 [  1   1   1   1 134   2   0   1]
 [  1   1   0   0   0 133   2   1]
 [  1   1   1   0   0   1 135   0]
 [  0   0   2   2   0   0   2 138]]
              precision    recall  f1-score   support

      obek_1       0.97      0.95      0.96       138
      obek_2       0.96      0.95      0.96       109
      obek_3       0.92      0.97      0.95       139
      obek_4       0.98      0.95      0.96       144
      obek_5       0.99      0.95      0.97       141
      obek_6       0.95      0.96      0.96       138
      obek_7       0.94      0.97      0.96       139
      obek_8       0.95      0.96      0.96       144

    accuracy                           0.96      1092
   macro avg       0.96      0.96      0.96      1092
weighted avg       0.96      0.96      0.96      1092


Train_Set
[[534   1   4   6   3   3   2   1]
 [  3 430   1

In [35]:
y_pred = lgbm_grid_model.predict(X_test)
accuracy = accuracy_score(y_pred, y_test)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

LightGBM Model accuracy score: 0.9588


In [39]:
lgbm_grid_model.best_params_

{'lightgbm_model__boosting_type': 'dart',
 'lightgbm_model__colsample_bytree': 0.6,
 'lightgbm_model__is_unbalance': True,
 'lightgbm_model__learning_rate': 0.1,
 'lightgbm_model__min_child_samples': 30,
 'lightgbm_model__n_estimators': 100,
 'lightgbm_model__num_leaves': 70,
 'lightgbm_model__reg_alpha': 0,
 'lightgbm_model__reg_lambda': 0.5,
 'lightgbm_model__scale_pos_weight': 4,
 'lightgbm_model__subsample_for_bin': 30000}

In [15]:
predict = lgbm_grid_model.predict(test)

NameError: name 'lgbm_grid_model' is not defined

In [16]:
predict=lightgbm_model.predict(test)

In [17]:
data = {
    'id': range(2340),
    'Öbek İsmi': predict
}
sub_df = pd.DataFrame(data)
sub_df.set_index('id', inplace=True)
sub_df

Unnamed: 0_level_0,Öbek İsmi
id,Unnamed: 1_level_1
0,obek_3
1,obek_3
2,obek_2
3,obek_6
4,obek_1
...,...
2335,obek_8
2336,obek_2
2337,obek_7
2338,obek_5


In [None]:
from sklearn.metrics import accuracy_score

y_pred = clf.predict(test)
accuracy = accuracy_score(true_labels, y_pred)


In [37]:
data = {
    'id': range(2340),
    'Öbek İsmi': predict
}
sub_df = pd.DataFrame(data)
sub_df.set_index('id', inplace=True)
sub_df

Unnamed: 0_level_0,Öbek İsmi
id,Unnamed: 1_level_1
0,obek_3
1,obek_3
2,obek_2
3,obek_6
4,obek_1
...,...
2335,obek_8
2336,obek_2
2337,obek_7
2338,obek_5


In [38]:
sub_df.to_csv('submission_lgbm.csv')