In [1]:
import re
from pprint import pprint
from datetime import datetime
from collections import Counter
import pandas as pd
import numpy as np

from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder
from category_encoders import TargetEncoder, CatBoostEncoder
from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import metrics
from sklearn.feature_selection import SelectFromModel

import xgboost as xgb
from hyperopt import Trials, STATUS_OK, tpe, hp, fmin

In [2]:
method = "target"
assert method in ["label", "target", "onehot", "catboost"]

features = pd.read_csv(f"../data/processed/{method}/train.csv")
test = pd.read_csv(f"../data/processed/{method}/test.csv")

targets = features["label"].values

features.drop(["label"], inplace=True, axis=1)
print(features.shape, test.shape)

(80176, 20) (34365, 20)


### Feature selection

In [3]:
feature_selection = False

if feature_selection:
    tree = ExtraTreesClassifier(n_estimators=50)
    tree = tree.fit(features, targets)

    selector = SelectFromModel(tree, prefit=True)
    selected_features = features.columns[selector.get_support()]

    features = features[selected_features]
    test = test[selected_features]

    print(features.shape, test.shape)

In [4]:
def objective(space):
    classifier = RandomForestClassifier(class_weight=space["class_weight"],
                                        n_estimators=int(space['n_estimators']),
                                        max_depth=int(space['max_depth']),
                                        max_features=space["max_features"],
                                        min_samples_split=int(space["min_samples_split"]),
                                        min_samples_leaf=int(space["min_samples_leaf"]),
                                        bootstrap=space["bootstrap"],
                                        max_samples=space["max_samples"], 
                                        n_jobs=10)
    #classifier.fit(features, targets)    
    scores = cross_val_score(estimator=classifier,
                             n_jobs=2,
                             X=features, 
                             y=targets, 
                             cv=StratifiedKFold(n_splits=5),
                             scoring="f1_micro"
                            )
    mean_score = scores.mean()
    
    return {
        'loss': 1-mean_score, 
        'status': STATUS_OK 
    }

In [50]:
space = {
    'n_estimators': hp.quniform('n_estimators', 250, 350, 10),
    'class_weight': hp.choice("class_weight", ["balanced", None]),
    'max_depth' : hp.quniform('max_depth', 20, 40, 1),
    'max_features': hp.choice("max_features", ["sqrt", "auto", "log2"]),
    "min_samples_split": hp.quniform("min_samples_split", 2, 30, 2),
    'min_samples_leaf' : hp.quniform("min_samples_leaf", 2, 30, 2),
    'bootstrap': hp.choice("bootstrap", [True, False]),
    'max_samples': hp.quniform("max_samples", 0.5, 0.9, 0.1)
}

trials = Trials()
best = fmin(fn=objective, 
            verbose=5,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)

 42%|████▏     | 21/50 [05:37<08:23, 17.38s/trial, best loss: 0.40423566308515535]




100%|██████████| 50/50 [15:08<00:00, 18.17s/trial, best loss: 0.40423566308515535]


In [51]:
best["class_weight"] = ["balanced", None][best["class_weight"]]
best["bootstrap"] = [True, False][best["bootstrap"]]
best["max_features"] = ["sqrt", "auto", "log2"][best["max_features"]]
best["n_estimators"] = int(best["n_estimators"])
best["min_samples_split"] = int(best["min_samples_split"])
best["min_samples_leaf"] = int(best["min_samples_leaf"])
best["max_depth"] = int(best["max_depth"])

In [52]:
print(f"best_params :")
pprint(best)

best_params :
{'bootstrap': False,
 'class_weight': 'balanced',
 'max_depth': 21,
 'max_features': 'auto',
 'max_samples': 0.5,
 'min_samples_leaf': 6,
 'min_samples_split': 22,
 'n_estimators': 310}


### Cross val score

In [54]:
model = RandomForestClassifier(**best)
scores = cross_val_score(model, features, targets, scoring="f1_micro", verbose=2, n_jobs=5)
print(f"mean score: {np.mean(scores)}")

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.


mean score: 0.595664566854294


[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:   30.8s remaining:   46.1s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:   30.8s remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:   30.8s finished


### Fit best model 

In [55]:
model = RandomForestClassifier(**best)
model.fit(features, targets)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=21, max_features='auto',
                       max_leaf_nodes=None, max_samples=0.5,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=6, min_samples_split=22,
                       min_weight_fraction_leaf=0.0, n_estimators=310,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Submission 

In [56]:
submission = pd.DataFrame()
submission["Id"] = range(len(test))
submission["label"] = model.predict(test)
submission.to_csv(f"../data/submissions/rf_{method}.csv", index=False)

### Save proba

In [57]:
probas = pd.DataFrame()
probas["Id"] = range(len(test))

probas = pd.concat([probas, pd.DataFrame(model.predict_proba(test))], axis=1)

In [58]:
probas.to_csv(f"../data/submissions/rf_probas_{method}.csv", index=False)

# private leaderboard : 0.59928