In [2]:
# libraries for EDA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#To display the plots
%matplotlib inline

# scikit learn library for machine learning algorithms, data preprocessing, and evaluation
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, confusion_matrix, log_loss, recall_score, accuracy_score, precision_score, f1_score, roc_auc_score
from sklearn.metrics import make_scorer, average_precision_score, PrecisionRecallDisplay, precision_recall_curve, roc_curve
from sklearn.metrics import ConfusionMatrixDisplay

from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PowerTransformer
from sklearn.pipeline import Pipeline

import scikitplot as skplt

# yellowbrick library for visualizing the model performance
from yellowbrick.classifier import ConfusionMatrix, PrecisionRecallCurve
from yellowbrick.classifier import PrecisionRecallCurve
from sklearn.pipeline import Pipeline

# to get rid of the warnings
import warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

plt.rcParams["figure.figsize"] = (10,6)
sns.set_style("whitegrid", {'axes.grid' : True})

# To display maximum columns
pd.set_option('display.max_columns', 50)

# To display maximum rows
pd.set_option('display.max_rows', 50)

# To set float format
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [3]:
def eval_metric(model, X_train, y_train, X_test, y_test):   
    y_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)
    
    print("Test_Set")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print()
    print("Train_Set")
    print(confusion_matrix(y_train, y_train_pred))
    print(classification_report(y_train, y_train_pred))

In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test_x.csv')

In [3]:
train["Cinsiyet"] = train["Cinsiyet"].replace(["Erkek"], 1)
train["Cinsiyet"] = train["Cinsiyet"].replace(["Kadın"], 0)

In [4]:
train = pd.get_dummies(train, columns=['Yaş Grubu',
                                       'Medeni Durum',
                                       'Eğitim Düzeyi',
                                       'Yaşadığı Şehir', 
                                       'İstihdam Durumu',
                                       'Eğitime Devam Etme Durumu', 
                                       'En Çok İlgilendiği Ürün Grubu'], dtype=int)

In [8]:
test["Cinsiyet"] =test["Cinsiyet"].replace(["Erkek"],1)
test["Cinsiyet"] =test["Cinsiyet"].replace(["Kadın"],0)
test = pd.get_dummies(test, columns=['Yaş Grubu',
                                     'Medeni Durum',
                                     'Eğitim Düzeyi',
                                     'Yaşadığı Şehir', 
                                     'İstihdam Durumu',
                                     'Eğitime Devam Etme Durumu', 
                                     'En Çok İlgilendiği Ürün Grubu'], dtype=int)

In [10]:
X_train = train.drop(['Öbek İsmi'], axis=1)
y_train = train['Öbek İsmi']

xgb_model = XGBClassifier(random_state=42)

y_train_xgb = y_train.map({'obek_1':0, 'obek_2':1, 'obek_3':2,
                           'obek_4':3, 'obek_5':4, 'obek_6':5,
                           'obek_7':6, 'obek_8':7}) # sıralama classification_report ile aynı olacak.

xgb_model.fit(X_train, y_train_xgb)

In [11]:
predict = xgb_model.predict(test)

In [12]:
predict

array([2, 2, 1, ..., 6, 4, 5], dtype=int64)

# ------------------------------

In [16]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test_x.csv')

df = train.copy()

In [17]:
from sklearn.model_selection import train_test_split

X = df.drop("Öbek İsmi", axis=1)
y = df["Öbek İsmi"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [17]:
from flaml import AutoML

automl = AutoML()
settings = {
    'time_budget': 240,
    'metric': 'accuracy',
    'task': 'classification',
    'seed': 42
}

automl.fit(X_train, y_train, **settings)

[flaml.automl.logger: 08-23 21:11:22] {1693} INFO - task = classification
[flaml.automl.logger: 08-23 21:11:22] {1700} INFO - Data split method: stratified
[flaml.automl.logger: 08-23 21:11:22] {1703} INFO - Evaluation method: cv
[flaml.automl.logger: 08-23 21:11:22] {1801} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 08-23 21:11:22] {1911} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 08-23 21:11:22] {2221} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 08-23 21:11:22] {2347} INFO - Estimated sufficient time budget=1800s. Estimated necessary time budget=44s.
[flaml.automl.logger: 08-23 21:11:22] {2394} INFO -  at 0.2s,	estimator lgbm's best error=0.0602,	best estimator lgbm's best error=0.0602
[flaml.automl.logger: 08-23 21:11:22] {2221} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 08-23 21:11:22] {2394} INFO -  at 0.4s,	estimator lgbm's

In [18]:
# Predict
print(automl.predict_proba(X_train))
# Print the best model
print(automl.model.estimator)

[[3.84720525e-03 1.58843930e-02 1.77434235e-02 ... 9.27205857e-01
  1.74749380e-02 1.65406812e-03]
 [1.10375292e-02 6.21260458e-03 4.90752675e-03 ... 9.68316296e-03
  3.70239459e-02 6.24548940e-03]
 [3.51838830e-02 1.06558899e-02 8.97563120e-01 ... 8.71580466e-03
  2.38842115e-02 3.16115924e-03]
 ...
 [2.18393444e-02 1.97329248e-02 3.77539597e-02 ... 3.12872505e-02
  8.57421017e-01 3.27162258e-03]
 [9.46562420e-03 9.57313430e-01 9.19319353e-04 ... 6.11695000e-03
  3.96958080e-03 9.63771442e-03]
 [2.31475395e-03 1.36531118e-02 4.40067878e-03 ... 4.76707287e-03
  8.22442836e-03 9.41853861e-01]]
ExtraTreesClassifier(criterion='entropy', max_features=0.45450215891045476,
                     max_leaf_nodes=54, n_estimators=42, n_jobs=-1,
                     random_state=12032022)


In [19]:
'''retrieve best config and best learner'''
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

Best ML leaner: extra_tree
Best hyperparmeter config: {'n_estimators': 42, 'max_features': 0.45450215891045476, 'max_leaves': 54, 'criterion': 'entropy'}
Best accuracy on validation data: 0.9538
Training duration of best run: 0.09202 s


In [20]:
automl.model.estimator

In [21]:
'''compute predictions of testing dataset''' 
y_pred = automl.predict(X_test)
print('Predicted labels', y_pred)
print('True labels', y_test)
y_pred_proba = automl.predict_proba(X_test)[:,1]

Predicted labels ['obek_1' 'obek_1' 'obek_4' ... 'obek_5' 'obek_3' 'obek_3']
True labels 1389    obek_1
4374    obek_1
5369    obek_4
4639    obek_6
2624    obek_4
         ...  
5113    obek_5
4852    obek_5
427     obek_5
4468    obek_3
5416    obek_3
Name: Öbek İsmi, Length: 1092, dtype: object


In [22]:
''' compute different metric values on testing dataset'''
from flaml.ml import sklearn_metric_loss_score
print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))
print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test))
print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test))

accuracy = 0.9587912087912088


ValueError: multi_class must be in ('ovo', 'ovr')

In [25]:
eval_metric(automl, X_train, y_train, X_test, y_test)

Test_Set
[[131   1   3   0   0   0   1   2]
 [  0 104   2   0   0   0   1   2]
 [  0   0 135   0   1   3   0   0]
 [  1   0   2 137   0   1   2   1]
 [  1   1   1   1 134   2   0   1]
 [  1   1   0   0   0 133   2   1]
 [  1   1   1   0   0   1 135   0]
 [  0   0   2   2   0   0   2 138]]
              precision    recall  f1-score   support

      obek_1       0.97      0.95      0.96       138
      obek_2       0.96      0.95      0.96       109
      obek_3       0.92      0.97      0.95       139
      obek_4       0.98      0.95      0.96       144
      obek_5       0.99      0.95      0.97       141
      obek_6       0.95      0.96      0.96       138
      obek_7       0.94      0.97      0.96       139
      obek_8       0.95      0.96      0.96       144

    accuracy                           0.96      1092
   macro avg       0.96      0.96      0.96      1092
weighted avg       0.96      0.96      0.96      1092


Train_Set
[[527   1   5   8   4   3   5   1]
 [  3 412   2

# ------------------------------

In [53]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test_x.csv')

df = train.copy()

In [54]:
from sklearn.model_selection import train_test_split

X = df.drop(["Öbek İsmi", 'Yıllık Ortalama Sepete Atılan Ürün Adedi', 'Eğitime Devam Etme Durumu'], axis=1)
y = df["Öbek İsmi"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [57]:
ord_enc = OrdinalEncoder(handle_unknown='use_encoded_value', 
                         unknown_value=-1)

column_trans = make_column_transformer((ord_enc, cat), 
                                       remainder='passthrough')

In [58]:
operations_xgb = [("OrdinalEncoder", column_trans), 
                  ("XGB_model", XGBClassifier(random_state=42))]

xgb_model = Pipeline(steps=operations_xgb)

y_train_xgb = y_train.map({'obek_1':0, 'obek_2':1, 'obek_3':2,
                           'obek_4':3, 'obek_5':4, 'obek_6':5,
                           'obek_7':6, 'obek_8':7}) 

y_test_xgb = y_test.map({'obek_1':0, 'obek_2':1, 'obek_3':2,
                         'obek_4':3, 'obek_5':4, 'obek_6':5,
                         'obek_7':6, 'obek_8':7})

xgb_model.fit(X_train, y_train_xgb)

In [59]:
eval_metric(xgb_model, X_train, y_train_xgb, X_test, y_test_xgb)

Test_Set
[[132   1   2   0   0   0   1   2]
 [  1  99   2   1   2   1   1   2]
 [  0   0 135   0   1   3   0   0]
 [  1   0   2 137   0   1   2   1]
 [  2   4   1   1 130   2   0   1]
 [  1   1   0   0   0 133   2   1]
 [  1   1   1   0   0   1 135   0]
 [  0   0   2   2   0   0   2 138]]
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       138
           1       0.93      0.91      0.92       109
           2       0.93      0.97      0.95       139
           3       0.97      0.95      0.96       144
           4       0.98      0.92      0.95       141
           5       0.94      0.96      0.95       138
           6       0.94      0.97      0.96       139
           7       0.95      0.96      0.96       144

    accuracy                           0.95      1092
   macro avg       0.95      0.95      0.95      1092
weighted avg       0.95      0.95      0.95      1092


Train_Set
[[554   0   0   0   0   0   0   0]
 [  0 438   0

### GridSearchCV

In [60]:
param_grid = {
              "XGB_model__n_estimators":[20, 50, 100],
              'XGB_model__n_max_depth':[3, 5, 7],
              "XGB_model__learning_rate": [0.03, 0.05, 0.1, 0.3]
}

operations_xgb = [("OrdinalEncoder", column_trans), 
                  ("XGB_model", XGBClassifier(random_state=42))]

model = Pipeline(steps=operations_xgb)

xgb_grid_model = GridSearchCV(model,
                              param_grid,
                              scoring="accuracy", 
                              cv=5,
                              n_jobs=-1,
                              return_train_score=True)

xgb_grid_model.fit(X_train, y_train_xgb)

Parameters: { "n_max_depth" } are not used.



In [61]:
eval_metric(xgb_grid_model, X_train, y_train_xgb, X_test, y_test_xgb)

Test_Set
[[132   1   2   0   0   0   1   2]
 [  2  99   2   0   2   1   1   2]
 [  0   0 135   0   1   3   0   0]
 [  1   0   2 136   0   2   2   1]
 [  2   4   1   1 130   2   0   1]
 [  1   1   0   0   0 133   2   1]
 [  1   1   1   0   0   1 135   0]
 [  0   0   2   2   0   0   2 138]]
              precision    recall  f1-score   support

           0       0.95      0.96      0.95       138
           1       0.93      0.91      0.92       109
           2       0.93      0.97      0.95       139
           3       0.98      0.94      0.96       144
           4       0.98      0.92      0.95       141
           5       0.94      0.96      0.95       138
           6       0.94      0.97      0.96       139
           7       0.95      0.96      0.96       144

    accuracy                           0.95      1092
   macro avg       0.95      0.95      0.95      1092
weighted avg       0.95      0.95      0.95      1092


Train_Set
[[553   0   0   0   0   0   1   0]
 [  0 438   0

# ------------------------------

In [18]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import make_classification

In [19]:
operations_etc = [("OrdinalEncoder", column_trans), 
                  ("ETC_model", ExtraTreesClassifier(random_state=42))]

In [20]:
model = Pipeline(steps=operations_etc)

In [21]:
model.fit(X_train, y_train)

In [22]:
eval_metric(model, X_train, y_train, X_test, y_test)

Test_Set
[[132   1   2   0   0   0   1   2]
 [  0 104   2   0   0   0   1   2]
 [  0   0 135   0   1   3   0   0]
 [  1   0   2 137   0   1   2   1]
 [  1   1   1   1 134   2   0   1]
 [  1   1   0   0   0 133   2   1]
 [  1   1   1   0   0   1 135   0]
 [  0   0   2   2   0   0   2 138]]
              precision    recall  f1-score   support

      obek_1       0.97      0.96      0.96       138
      obek_2       0.96      0.95      0.96       109
      obek_3       0.93      0.97      0.95       139
      obek_4       0.98      0.95      0.96       144
      obek_5       0.99      0.95      0.97       141
      obek_6       0.95      0.96      0.96       138
      obek_7       0.94      0.97      0.96       139
      obek_8       0.95      0.96      0.96       144

    accuracy                           0.96      1092
   macro avg       0.96      0.96      0.96      1092
weighted avg       0.96      0.96      0.96      1092


Train_Set
[[554   0   0   0   0   0   0   0]
 [  0 438   0

### GridSearchCV

In [25]:
param_grid = {
    'ETC_model__n_estimators': [50, 100, 200],                  # Ağaç sayısı
    'ETC_model__criterion': ['gini', 'entropy'],                 # Bölme kriteri
    'ETC_model__max_depth': [None, 10, 20, 30],                  # Maksimum derinlik
    'ETC_model__min_samples_split': [2, 5, 10],                  # Bir düğümü bölmek için gereken minimum örnek sayısı
    'ETC_model__min_samples_leaf': [1, 2, 4],                    # Bir yaprak düğümünde olması gereken minimum örnek sayısı
    'ETC_model__max_features': ['auto', 'sqrt', 'log2'],         # Her ağaç için kullanılacak maksimum özellik sayısı
    'ETC_model__bootstrap': [True, False],                       # Örnekleme yapılıp yapılmayacağı
    'ETC_model__class_weight': [None, 'balanced', 'balanced_subsample']  # Sınıf ağırlıkları
}

operations_etc = [("OrdinalEncoder", column_trans), 
                  ("ETC_model", ExtraTreesClassifier(random_state=42))]

model = Pipeline(steps=operations_etc)

etc_grid_model = GridSearchCV(model,
                               param_grid,
                               scoring="accuracy", 
                               cv=5,
                               n_jobs=-1,
                               return_train_score=True)

etc_grid_model.fit(X_train, y_train)

In [26]:
eval_metric(etc_grid_model, X_train, y_train, X_test, y_test)

Test_Set
[[132   1   2   0   0   0   1   2]
 [  0 104   2   0   0   0   1   2]
 [  0   0 135   0   1   3   0   0]
 [  1   0   2 137   0   1   2   1]
 [  1   1   1   1 134   2   0   1]
 [  1   1   0   0   0 133   2   1]
 [  1   1   1   0   0   1 135   0]
 [  0   0   2   2   0   0   2 138]]
              precision    recall  f1-score   support

      obek_1       0.97      0.96      0.96       138
      obek_2       0.96      0.95      0.96       109
      obek_3       0.93      0.97      0.95       139
      obek_4       0.98      0.95      0.96       144
      obek_5       0.99      0.95      0.97       141
      obek_6       0.95      0.96      0.96       138
      obek_7       0.94      0.97      0.96       139
      obek_8       0.95      0.96      0.96       144

    accuracy                           0.96      1092
   macro avg       0.96      0.96      0.96      1092
weighted avg       0.96      0.96      0.96      1092


Train_Set
[[527   1   5   8   4   3   5   1]
 [  3 413   2

# ------------------------------

In [None]:
predict = model.predict(test)

In [9]:
data = {
    'id': range(2340),
    'Öbek İsmi': predict
}
sub_df = pd.DataFrame(data)
sub_df.set_index('id', inplace=True)
sub_df

Unnamed: 0_level_0,Öbek İsmi
id,Unnamed: 1_level_1
0,2
1,2
2,1
3,5
4,0
...,...
2335,7
2336,1
2337,6
2338,4


In [10]:
sub_df['Öbek İsmi'] = sub_df['Öbek İsmi'].map({0: 'obek_1', 1:'obek_2', 2:'obek_3', 3:'obek_4', 4:'obek_5', 5:'obek_6', 6:'obek_7', 7:'obek_8'})

In [11]:
sub_df

Unnamed: 0_level_0,Öbek İsmi
id,Unnamed: 1_level_1
0,obek_3
1,obek_3
2,obek_2
3,obek_6
4,obek_1
...,...
2335,obek_8
2336,obek_2
2337,obek_7
2338,obek_5


In [12]:
sub_df.to_csv('submission2.csv')