#### Imports

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

#### Read Data and Preprocessing

In [25]:
train_data = pd.read_csv('train.csv')
train_data = train_data.drop(columns=['Unnamed: 0'])
train_data.head()

Unnamed: 0,S2_B2_jan,S2_B3_jan,S2_B4_jan,S2_B5_jan,S2_B6_jan,S2_B7_jan,S2_B8_jan,S2_B8A_jan,S2_B9_jan,S2_B11_jan,...,S2_B11_dec,S2_B12_dec,S1_VV_dec,S1_VH_dec,ERA5_temperature_2m_dec,ERA5_total_precipitation_dec,topo_elevation_dec,topo_slope_dec,NDVI_dec,LABELS
0,-14.271277,-21.134172,957.531174,1019.557045,1230.022834,1435.138891,1693.70227,1805.411109,1891.557355,2020.567257,...,2203.025469,795.862978,2747.777894,1688.997611,289.38222,0.000147,1886.384195,0.749163,0.307887,0.0
1,-12.341429,-14.744978,949.227883,895.845779,987.517322,1350.496916,1930.239804,2048.686546,2375.680146,2259.90329,...,2922.382762,1140.79618,2528.590979,1934.261859,266.57837,0.002876,69.876216,1.385904,0.097779,1.0
2,-12.317847,-17.026201,1087.616069,1086.401035,1185.755955,1529.674085,1858.981635,1904.667487,2082.954737,2115.713139,...,2548.726966,521.102434,1556.082048,741.212901,276.467873,0.001622,91.279743,2.125908,0.418492,1.0
3,-15.332478,-20.978203,2987.224569,3163.553102,3412.572434,3908.443371,4044.232162,4231.35377,3682.709329,4518.882323,...,2537.545246,1073.588565,2388.141112,1367.682434,276.101799,0.00372,767.817294,8.427714,0.336528,1.0
4,-11.739502,-11.374262,691.825857,782.111227,478.448214,976.753128,3189.134129,3813.893119,3861.824527,4174.591554,...,4276.302643,371.861994,2080.641023,692.730434,298.663246,0.001763,61.500923,1.203899,0.595404,1.0


In [26]:
train_data['LABELS'].dtype

dtype('float64')

In [27]:
X = train_data.drop(columns=['LABELS'])
y = train_data['LABELS']

In [28]:
#Normalization
scaler = StandardScaler()
X = scaler.fit_transform(X)

#### Grid Search with cross validation

In [29]:

estimator = [int(x) for x in range(100,901,100)]
max_features = ['auto', 'log2']
max_depth = [1, 5, 7, 10, 15, 20]
criteria = ['gini', 'entropy']
class_weight = [{0.0: 1.7, 1.0: 1}]
jobs = [-1]

param_grid = {
    'n_estimators': estimator,
    'max_features':max_features,
    'max_depth':max_depth,
    'class_weight':class_weight,
    'n_jobs':jobs,
    'criterion': criteria,
    'random_state':[42]
}
#grid search with above param grid and cross validation param as 3
search = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=-1, cv=3)

In [30]:
search.fit(X, y)

GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'class_weight': [{0.0: 1.7, 1.0: 1}],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 5, 7, 10, 15, 20],
                         'max_features': ['auto', 'log2'],
                         'n_estimators': [100, 200, 300, 400, 500, 600, 700,
                                          800, 900],
                         'n_jobs': [-1], 'random_state': [42]})

In [31]:
#Best Parameters
search.best_params_

{'class_weight': {0.0: 1.7, 1.0: 1},
 'criterion': 'entropy',
 'max_depth': 20,
 'max_features': 'auto',
 'n_estimators': 800,
 'n_jobs': -1,
 'random_state': 42}

In [53]:
model = RandomForestClassifier(n_estimators=800, n_jobs=-1, random_state=42, class_weight={0.0: 1.7, 1.0: 1}, criterion='entropy', max_depth=20, max_features='auto')

#### F1 Score and Crossvalidation of above obtained model 

In [33]:
#Fit model
model.fit(X,y)

RandomForestClassifier(class_weight={0.0: 1.7, 1.0: 1}, criterion='entropy',
                       max_depth=20, n_estimators=800, n_jobs=-1,
                       random_state=42)

In [54]:
#Extra imports for f1 checking
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, make_scorer

In [48]:
#split dataset
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [50]:
model.fit(X_train,y_train)
res_val = model.predict(X_test)

In [52]:
#F1 of the train test split
print(f1_score(y_test, res_val))

0.8891498356035696


In [55]:
#f1 based on cross validation using f1 based custom scorer
scor = make_scorer(f1_score, average='weighted')
cross_val_score(model, X, y, cv=10, scoring=scor)

array([0.83597362, 0.84349254, 0.8385305 , 0.84566413, 0.85468797,
       0.84868619, 0.83854123, 0.84525359, 0.84632737, 0.84395585])