In [7]:
# Base location of the project; set this to your own path.
# It is joined to the data paths by plain string concatenation, so a
# non-empty value needs to end with a path separator.
root = ""

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

In [9]:
# Lookup from the integer class number to its readable target label.
# Numbers 0-4 are the five interval labels with the 'constant' suffix,
# numbers 5-9 are the same intervals with the 'intermittent' suffix.
target_dic = dict(enumerate(
    interval + pattern
    for pattern in ('constant', 'intermittent')
    for interval in ('(0, 20]', '(20, 40]', '(40, 60]', '(60, 80]', '(80, 100]')
))

In [10]:
# Load the training split. The 'Unnamed: 0' column is discarded
# (presumably a leftover row index from when the CSV was written - TODO confirm).
df_train = pd.read_csv(root + 'data/02_processed/train.csv', sep=';')
df_train = df_train.drop(columns='Unnamed: 0')

# Separate frame holding the training data without the target and the
# columns '4' and '66'. Built with a non-inplace drop, so df_train is untouched.
df2 = df_train.drop(columns=['target', '4', '66'])

# Load the validation and test splits, indexed by user_id.
df_validation = pd.read_csv(root + 'data/02_processed/validation.csv', sep=';').set_index('user_id')
df_test = pd.read_csv(root + 'data/02_processed/test.csv', sep=';').set_index('user_id')

# Split the labels off the feature frames. pop() returns the column and
# removes it from the frame in one step. Labels are kept as raw values
# (no one-hot encoding).
target_train = df_train.pop('target')
target_test = df_test.pop('target')

# Independent copies of the feature frames used for modelling.
train = df_train.copy()
test = df_test.copy()

In [11]:
# Split the feature columns by dtype: float64 columns are collected as
# numerical, object columns as categorical. Columns of any other dtype
# (e.g. integers) end up in neither list.
# DataFrame.items() is used because iteritems() was removed in pandas 2.0.
categorical_columns = []
numerical_columns = []
for name, values in train.items():
    if values.dtype == np.float64:
        numerical_columns.append(name)
    elif values.dtype == object:
        categorical_columns.append(name)

# Display the numerical column names as the cell output.
numerical_columns

['66', '67', '4']

In [12]:
# Encoder that maps each categorical value to an integer code.
categorical_encoder = OrdinalEncoder()

# The ColumnTransformer encodes the categorical columns with
# categorical_encoder and passes the numerical columns through unchanged.
# Only the columns named in these two lists are handed to the transformer steps.
preprocessing = ColumnTransformer(
    [('cat', categorical_encoder, categorical_columns),
     ('num', 'passthrough', numerical_columns)])

# Macro-averaged F1 is the metric the search optimises.
scorer = make_scorer(f1_score, average='macro')

# Classifier; random_state is fixed so repeated runs are comparable.
GBM = GradientBoostingClassifier(random_state=0)

# Hyper-parameter grid that GridSearchCV evaluates exhaustively
# (every combination is tried), scored with the macro F1 scorer above.
# Seven parameters with two values each -> 128 candidates.
param_grid = [
    {'learning_rate': [0.15, 0.2],
     'n_estimators': [100, 200],
     'min_samples_split': [10, 20],
     'min_samples_leaf': [6, 8],
     'max_depth': [4, 6],
     'max_features': ['sqrt'],
     'max_leaf_nodes': [6, 8],
     'subsample': [0.75, 1],
    }
]

# Pipeline that chains the preprocessing with the grid search.
# NOTE(review): the preprocessing step is fitted once on the full training
# data before cross-validation splits it - confirm this is intended.
model = Pipeline([
    ('preprocess', preprocessing),
    # Pass the estimator defined above (GBM); the previous lowercase name
    # was never defined in this notebook and failed on a fresh kernel.
    ('gridsearch', GridSearchCV(GBM, param_grid, cv=5, scoring=scorer,
                                n_jobs=4, verbose=3, return_train_score=True))
])

model.fit(train, target_train)

# Display the best parameter combination found by the search.
model.named_steps['gridsearch'].best_params_

Fitting 5 folds for each of 128 candidates, totalling 640 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   51.6s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:  4.3min
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed: 10.9min
[Parallel(n_jobs=4)]: Done 504 tasks      | elapsed: 23.5min
[Parallel(n_jobs=4)]: Done 640 out of 640 | elapsed: 28.5min finished


{'learning_rate': 0.2,
 'max_depth': 6,
 'max_features': 'sqrt',
 'max_leaf_nodes': 8,
 'min_samples_leaf': 8,
 'min_samples_split': 20,
 'n_estimators': 200,
 'subsample': 1}