In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, f1_score, make_scorer
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, RepeatedStratifiedKFold, cross_validate, \
    cross_val_score

from sklearn.pipeline import make_pipeline, Pipeline

from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek

from sklearn import set_config

In [2]:
set_config(display="diagram")

In [3]:
columns = ['MOSTYPE' , 'MAANTHUI' , 'MGEMOMV' , 'MGEMLEEF' , 'MOSHOOFD' , 'MGODRK' , 'MGODPR' , 'MGODOV' , 'MGODGE' , 'MRELGE' , 'MRELSA' , 'MRELOV' , 'MFALLEEN' , 'MFGEKIND' , 'MFWEKIND' , 'MOPLHOOG' , 'MOPLMIDD' , 'MOPLLAAG' , 'MBERHOOG' , 'MBERZELF' , 'MBERBOER' , 'MBERMIDD' , 'MBERARBG' , 'MBERARBO' , 'MSKA' , 'MSKB1' , 'MSKB2' , 'MSKC' , 'MSKD' , 'MHHUUR' , 'MHKOOP' , 'MAUT1' , 'MAUT2' , 'MAUT0' , 'MZFONDS' , 'MZPART' , 'MINKM30' , 'MINK3045' , 'MINK4575' , 'MINK7512' , 'MINK123M' , 'MINKGEM' , 'MKOOPKLA' , 'PWAPART' , 'PWABEDR' , 'PWALAND' , 'PPERSAUT' , 'PBESAUT' , 'PMOTSCO' , 'PVRAAUT' , 'PAANHANG' , 'PTRACTOR' , 'PWERKT' , 'PBROM' , 'PLEVEN' , 'PPERSONG' , 'PGEZONG' , 'PWAOREG' , 'PBRAND' , 'PZEILPL' , 'PPLEZIER' , 'PFIETS' , 'PINBOED' , 'PBYSTAND' , 'AWAPART' , 'AWABEDR' , 'AWALAND' , 'APERSAUT' , 'ABESAUT' , 'AMOTSCO' , 'AVRAAUT' , 'AAANHANG' , 'ATRACTOR' , 'AWERKT' , 'ABROM' , 'ALEVEN' , 'APERSONG' , 'AGEZONG:' , 'AWAOREG' , 'ABRAND' , 'AZEILPL' , 'APLEZIER' , 'AFIETS' , 'AINBOED' , 'ABYSTAND' , 'CARAVAN']

df = pd.read_csv('data/ticdata2000.txt', names= columns, sep='\t')
# test_data = pd.read_csv('/content/drive/MyDrive/Egyetem, oktatás/AI DM/Competition2/ticeval2000.txt', names= columns[:-1], sep='\t')

In [4]:
print(f'df len: {len(df)}\nn_cols: {len(df.columns)}')

df len: 5822
n_cols: 86


In [5]:
# df.hist(figsize=(20, 15))

In [6]:
[print(f'{col}: {sorted(pd.unique(df[col]))}') for col in df.columns]

MOSTYPE: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41]
MAANTHUI: [1, 2, 3, 4, 5, 6, 7, 8, 10]
MGEMOMV: [1, 2, 3, 4, 5]
MGEMLEEF: [1, 2, 3, 4, 5, 6]
MOSHOOFD: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
MGODRK: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
MGODPR: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
MGODOV: [0, 1, 2, 3, 4, 5]
MGODGE: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
MRELGE: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
MRELSA: [0, 1, 2, 3, 4, 5, 6, 7]
MRELOV: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
MFALLEEN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
MFGEKIND: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
MFWEKIND: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
MOPLHOOG: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
MOPLMIDD: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
MOPLLAAG: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
MBERHOOG: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
MBERZELF: [0, 1, 2, 3, 4, 5]
MBERBOER: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
MBERMIDD: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
MBERARBG: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
MBERARBO: 

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [7]:
# feature engineering
perc_cols = [col for col in df. columns if (col[0] == 'M') and (col not in ['MOSTYPE', 'MAANTHUI', 'MGEMOMV', 'MGEMLEEF', 'MOSHOOFD'])]

In [8]:
for col in perc_cols:
    df[f'ALL_{col}'] = (df[col] == 9) * 1
    df[f'NONE_{col}'] = (df[col] == 0) * 1

In [9]:
insu_cols = [col for col in df. columns if (col[0] != 'M') if len(pd.unique(df[col])) > 3]

In [10]:
for col in insu_cols:
    df[f'HASALOT_{col}'] = (df[col] > 2) * 1

  df[f'HASALOT_{col}'] = (df[col] > 2) * 1
  df[f'HASALOT_{col}'] = (df[col] > 2) * 1
  df[f'HASALOT_{col}'] = (df[col] > 2) * 1
  df[f'HASALOT_{col}'] = (df[col] > 2) * 1
  df[f'HASALOT_{col}'] = (df[col] > 2) * 1


In [11]:
onehot = pd.get_dummies(df[['MOSTYPE', 'MOSHOOFD']]).astype('object')

In [12]:
df = pd.concat([df.drop(['MOSTYPE', 'MOSHOOFD'], axis=1), onehot], axis=1)

In [13]:
df

Unnamed: 0,MAANTHUI,MGEMOMV,MGEMLEEF,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,MRELSA,MRELOV,...,HASALOT_AMOTSCO,HASALOT_AVRAAUT,HASALOT_AAANHANG,HASALOT_ATRACTOR,HASALOT_AWERKT,HASALOT_ALEVEN,HASALOT_ABRAND,HASALOT_AFIETS,MOSTYPE,MOSHOOFD
0,1,3,2,0,5,1,3,7,0,2,...,0,0,0,0,0,0,0,0,33,8
1,1,2,2,1,4,1,4,6,2,2,...,0,0,0,0,0,0,0,0,37,8
2,1,2,2,0,4,2,4,3,2,4,...,0,0,0,0,0,0,0,0,37,8
3,1,3,3,2,3,2,4,5,2,2,...,0,0,0,0,0,0,0,0,9,3
4,1,4,2,1,4,1,4,7,1,2,...,0,0,0,0,0,0,0,0,40,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5817,1,1,2,0,6,1,2,1,2,6,...,0,0,0,0,0,0,0,0,36,8
5818,1,4,4,1,4,1,4,6,0,3,...,0,0,0,0,0,0,0,0,35,8
5819,1,3,4,0,6,0,3,5,1,4,...,0,0,0,0,0,0,0,0,33,8
5820,1,3,2,0,7,0,2,7,2,0,...,0,0,0,0,0,0,0,0,34,8


## MODELING

In [14]:
y_train = df['CARAVAN'].values
X_train = df.drop(['CARAVAN'], axis=1).values

In [15]:
X_train.shape, y_train.shape

((5822, 189), (5822,))

In [16]:
pipe = imbPipeline([
    ('sampler', RandomUnderSampler()),
    ('rf', RandomForestClassifier())
])

In [17]:
pipe.get_params()

{'memory': None,
 'steps': [('sampler', RandomUnderSampler()),
  ('rf', RandomForestClassifier())],
 'verbose': False,
 'sampler': RandomUnderSampler(),
 'rf': RandomForestClassifier(),
 'sampler__random_state': None,
 'sampler__replacement': False,
 'sampler__sampling_strategy': 'auto',
 'rf__bootstrap': True,
 'rf__ccp_alpha': 0.0,
 'rf__class_weight': None,
 'rf__criterion': 'gini',
 'rf__max_depth': None,
 'rf__max_features': 'sqrt',
 'rf__max_leaf_nodes': None,
 'rf__max_samples': None,
 'rf__min_impurity_decrease': 0.0,
 'rf__min_samples_leaf': 1,
 'rf__min_samples_split': 2,
 'rf__min_weight_fraction_leaf': 0.0,
 'rf__n_estimators': 100,
 'rf__n_jobs': None,
 'rf__oob_score': False,
 'rf__random_state': None,
 'rf__verbose': 0,
 'rf__warm_start': False}

In [18]:
pipe.fit(X_train, y_train)

In [19]:
# kf = RepeatedStratifiedKFold(n_splits=5, random_state=999)
# scorer = make_scorer(balanced_accuracy_score)
# scores = np.mean(cross_val_score(pipe, X_train, y_train, scoring=scorer, cv=kf))

In [20]:
# print(scores)

### CCP

In [21]:
params_ccp = {
    'sampler': [SMOTETomek(), RandomOverSampler()],
    'rf__ccp_alpha': list(np.arange(0, .1, .002)),
    'rf__class_weight':['balanced']
}

In [22]:
rsf = RepeatedStratifiedKFold(random_state=42, n_repeats=5, n_splits=5)
clf_ccp = GridSearchCV(pipe, params_ccp, scoring='balanced_accuracy', verbose=2, cv=rsf, n_jobs=-1)

In [None]:
clf_ccp.fit(X_train, y_train)

Fitting 25 folds for each of 100 candidates, totalling 2500 fits


### ALL

In [None]:
params = {
    'sampler': [RandomOverSampler(), SMOTETomek()],
    'rf__max_features': list(np.arange(100, 150, 10)) + [len(df.columns)],
    'rf__max_leaf_nodes': list(np.arange(50, 210, 20)),
    'rf__n_estimators': [50, 75, 100, 150, 200, 400],
    'rf__class_weight':['balanced']
}

In [None]:
rsf = RepeatedStratifiedKFold(random_state=42, n_repeats=5, n_splits=5)
clf = GridSearchCV(pipe, params, scoring='balanced_accuracy', verbose=2, cv=rsf, n_jobs=-1)

In [None]:
clf.fit(X_train, y_train)

In [None]:
result_df = pd.DataFrame.from_dict(clf.cv_results_, orient='columns')
print(result_df.columns)

In [None]:
sns.relplot(data=result_df,
            kind='line',
            x='param_rf__max_depth',
            y='mean_test_score',
            hue='param_sampler',
            col='param_rf__min_samples_leaf')
plt.show()

In [None]:
clf.best_params_

In [None]:
pipe_best = imbPipeline([
    ('sampler', RandomOverSampler()),
    ('rf', RandomForestClassifier(min_samples_split=5,
                                  min_samples_leaf=4,
                                  max_depth=10))
])



In [None]:
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=999)
scorer = make_scorer(balanced_accuracy_score)
scores = np.mean(cross_val_score(pipe_best, X_train, y_train, scoring=scorer, cv=kf))

In [None]:
print(scores)