In [1]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from datetime import datetime

In [2]:
# Read the two feature tables (train first, then test) and the target column.
train, test = map(
    pd.read_csv,
    (
        'data/train_dropped_290_without_nans.csv',
        'data/test_dropped_290_without_nans.csv',
    ),
)
# Only the 'Culture' column of the label file is used as the target.
y_train = pd.read_csv('data/y_train.csv').loc[:, 'Culture']
print(train.shape, test.shape)

(2838, 293) (939, 293)


In [3]:
# Candidate values for each XGBoost hyper-parameter explored by the search.
params = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
)

In [4]:
# Base estimator handed to the hyper-parameter search.
# `n_jobs` is the documented scikit-learn-wrapper parameter for the thread
# count; it replaces the deprecated `nthread` alias with the same value.
# NOTE(review): the search cells also use n_jobs=-1, so threads may be
# oversubscribed during cross-validation - consider lowering one of the two.
xgb = XGBClassifier(learning_rate=0.01, n_estimators=800, n_jobs=-1)

In [6]:
%%time
folds = 5
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 0)

random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='f1_weighted', n_jobs=-1, cv=skf.split(train,y_train), verbose=3, random_state=42 )

# Here we go # timing starts from this point for "start_time" variable
random_search.fit(train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed: 12.1min remaining: 18.2min
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed: 16.5min remaining:  4.1min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 17.3min finished


Wall time: 20min 32s


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x000001DDDB685EC8>,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=0.01,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, mis...
                                           random_state=None, reg_alpha=None,
                                           reg_lambda=None,
                                           scale_pos_weight=None,
                                           subsample=None, tr

In [8]:
# Predict on the test features with the fitted search object.
preds = random_search.predict(test)

# Save the predictions as a single 'Culture' column, without the index.
submission = pd.Series(preds)
submission.to_csv(
    'preds/submission-random-grid-search-xgb-01.csv',
    header=['Culture'],
    index=False,
)

In [5]:
# Remove the feature columns labelled '0', '1' and '2' from both frames.
# Reassigning the result (rather than inplace=True) together with
# errors='ignore' makes the cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
dropped_columns = ['0', '1', '2']
train = train.drop(columns=dropped_columns, errors='ignore')
test = test.drop(columns=dropped_columns, errors='ignore')

In [6]:
%%time
folds = 5
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 42)

random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='f1_weighted', n_jobs=-1, cv=skf.split(train,y_train), verbose=3, random_state=42 )

# Here we go # timing starts from this point for "start_time" variable
random_search.fit(train,y_train)
preds = random_search.predict(test)
pd.Series(preds).to_csv('preds/submission-random-grid-search-xgb-290.csv', index=False, header=['Culture'])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed: 32.5min remaining: 10.2min
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed: 35.0min finished


Wall time: 36min 34s


In [7]:
# Display the parameter combination stored on the most recently fitted search.
random_search.best_params_

{'subsample': 0.8,
 'min_child_weight': 10,
 'max_depth': 4,
 'gamma': 1.5,
 'colsample_bytree': 0.6}

In [8]:
%%time
params = {
        'min_child_weight': [10],
        'gamma': [1.5],
        'subsample': [0.8],
        'colsample_bytree': [0.6],
        'max_depth': [6]
        }
xgb = XGBClassifier(learning_rate=0.01, n_estimators=1200, nthread=-1)
folds = 5
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 256)

random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='f1_weighted', n_jobs=-1, cv=skf.split(train,y_train), verbose=3, random_state=42 )

# Here we go # timing starts from this point for "start_time" variable
random_search.fit(train,y_train)
preds = random_search.predict(test)
pd.Series(preds).to_csv('preds/submission-random-grid-search-xgb-290.csv', index=False, header=['Culture'])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  9.0min remaining: 13.5min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  9.1min finished


Wall time: 11min 26s
