In [1]:
import re
from pprint import pprint
from datetime import datetime
from collections import Counter
import pandas as pd
import numpy as np

from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder
from category_encoders import TargetEncoder, CatBoostEncoder
from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import metrics
from sklearn.feature_selection import SelectFromModel

import xgboost as xgb

In [2]:
method = "target"
assert method in ["label", "target", "onehot", "catboost"]

features = pd.read_csv(f"../data/processed/{method}/train.csv")
test = pd.read_csv(f"../data/processed/{method}/test.csv")

targets = features["label"].values

features.drop(["label"], inplace=True, axis=1)
print(features.shape, test.shape)

(80176, 20) (34365, 20)


### Feature selection

In [3]:
feature_selection = False

if feature_selection:
    tree = ExtraTreesClassifier(n_estimators=50)
    tree = tree.fit(features, targets)

    selector = SelectFromModel(tree, prefit=True)
    selected_features = features.columns[selector.get_support()]

    features = features[selected_features]
    test = test[selected_features]

    print(features.shape, test.shape)

### Gridsearch with Random Forest model

In [6]:
parameter_grid_random_forest = {
    "class_weight": ["balanced", None],
    'max_depth': np.arange(3, 35, 2),
    'n_estimators': np.arange(100, 300, 25),
    'max_features': ['sqrt', 'auto', 'log2'],
    'min_samples_split': np.arange(5, 30, 2),
    'min_samples_leaf': np.arange(5, 30, 2),
    'bootstrap': [True, False],
    'max_samples': [0.7, 0.8, 0.9]
}

model = RandomForestClassifier(n_jobs=11)
cross_validation = StratifiedKFold(n_splits=5)

grid_search = RandomizedSearchCV(model, 
                                 n_jobs=10,
                                 scoring='f1_micro',
                                 param_distributions=parameter_grid_random_forest,
                                 cv=cross_validation,
                                 verbose=5,
                                 n_iter=20)

grid_search.fit(features, targets)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:  1.2min
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:  2.5min finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
                   error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_sp...
                     

In [9]:
print(f"best_score: {grid_search.best_score_}")

pprint(f"{grid_search.best_params_}")

best_score: 0.5889917478314889
("{'n_estimators': 125, 'min_samples_split': 13, 'min_samples_leaf': 5, "
 "'max_samples': 0.9, 'max_features': 'log2', 'max_depth': 29, 'class_weight': "
 "'balanced', 'bootstrap': True}")


In [10]:
grid_search.best_params_

{'n_estimators': 125,
 'min_samples_split': 13,
 'min_samples_leaf': 5,
 'max_samples': 0.9,
 'max_features': 'log2',
 'max_depth': 29,
 'class_weight': 'balanced',
 'bootstrap': True}

In [11]:
model = RandomForestClassifier(**grid_search.best_params_)

model.fit(features, targets)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=29, max_features='log2',
                       max_leaf_nodes=None, max_samples=0.9,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=13,
                       min_weight_fraction_leaf=0.0, n_estimators=125,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Submission 

In [12]:
submission = pd.DataFrame()
submission["Id"] = range(len(test))
submission["label"] = model.predict(test)
submission.to_csv(f"../data/submissions/rf_{method}.csv", index=False)

In [14]:
submission.label.value_counts()

0    10893
3     7022
2     6420
1     5503
7     3826
4      410
5      149
6      142
Name: label, dtype: int64

### Save proba

In [15]:
probas = pd.DataFrame()
probas["Id"] = range(len(test))

probas = pd.concat([probas, pd.DataFrame(model.predict_proba(test))], axis=1)

In [17]:
submission

Unnamed: 0,Id,label
0,0,1
1,1,0
2,2,3
3,3,2
4,4,0
...,...,...
34360,34360,7
34361,34361,2
34362,34362,3
34363,34363,0


In [24]:
probas[range(8)].sum(axis=1)

0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
34360    1.0
34361    1.0
34362    1.0
34363    1.0
34364    1.0
Length: 34365, dtype: float64

In [25]:
probas.to_csv("../data/submissions/rf_probas.csv", index=False)

# private leaderboard : 0.59899