#### Import and preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
seed = 17
np.random.seed(seed)
random.seed(seed)
# torch.manual_seed(17)

In [5]:
#Reading data and removing extra column being loaded
data = pd.read_csv('train.csv')
data = data.drop(columns=['Unnamed: 0'])
data.head()

Unnamed: 0,S2_B2_jan,S2_B3_jan,S2_B4_jan,S2_B5_jan,S2_B6_jan,S2_B7_jan,S2_B8_jan,S2_B8A_jan,S2_B9_jan,S2_B11_jan,...,S2_B11_dec,S2_B12_dec,S1_VV_dec,S1_VH_dec,ERA5_temperature_2m_dec,ERA5_total_precipitation_dec,topo_elevation_dec,topo_slope_dec,NDVI_dec,LABELS
0,-14.271277,-21.134172,957.531174,1019.557045,1230.022834,1435.138891,1693.70227,1805.411109,1891.557355,2020.567257,...,2203.025469,795.862978,2747.777894,1688.997611,289.38222,0.000147,1886.384195,0.749163,0.307887,0.0
1,-12.341429,-14.744978,949.227883,895.845779,987.517322,1350.496916,1930.239804,2048.686546,2375.680146,2259.90329,...,2922.382762,1140.79618,2528.590979,1934.261859,266.57837,0.002876,69.876216,1.385904,0.097779,1.0
2,-12.317847,-17.026201,1087.616069,1086.401035,1185.755955,1529.674085,1858.981635,1904.667487,2082.954737,2115.713139,...,2548.726966,521.102434,1556.082048,741.212901,276.467873,0.001622,91.279743,2.125908,0.418492,1.0
3,-15.332478,-20.978203,2987.224569,3163.553102,3412.572434,3908.443371,4044.232162,4231.35377,3682.709329,4518.882323,...,2537.545246,1073.588565,2388.141112,1367.682434,276.101799,0.00372,767.817294,8.427714,0.336528,1.0
4,-11.739502,-11.374262,691.825857,782.111227,478.448214,976.753128,3189.134129,3813.893119,3861.824527,4174.591554,...,4276.302643,371.861994,2080.641023,692.730434,298.663246,0.001763,61.500923,1.203899,0.595404,1.0


In [6]:
#Test data
test_data = pd.read_csv('test_nolabels.csv')

In [7]:
#Column List
cols = list(data.columns)
cols.remove('LABELS')

In [8]:
#Normalization
scaler = StandardScaler()
data[cols] = scaler.fit_transform(data[cols])
test_data[cols] = scaler.transform(test_data[cols])

In [9]:
data.head()

Unnamed: 0,S2_B2_jan,S2_B3_jan,S2_B4_jan,S2_B5_jan,S2_B6_jan,S2_B7_jan,S2_B8_jan,S2_B8A_jan,S2_B9_jan,S2_B11_jan,...,S2_B11_dec,S2_B12_dec,S1_VV_dec,S1_VH_dec,ERA5_temperature_2m_dec,ERA5_total_precipitation_dec,topo_elevation_dec,topo_slope_dec,NDVI_dec,LABELS
0,-0.600362,-0.669307,-0.564433,-0.542568,-0.49374,-0.507162,-0.660625,-0.699683,-0.685866,-0.684498,...,-0.604803,-0.395735,0.5941,0.491349,0.581619,-0.850329,2.205969,-0.630313,0.207548,0.0
1,-0.145401,0.682995,-0.568777,-0.607338,-0.611189,-0.543737,-0.538967,-0.571041,-0.43089,-0.556113,...,-0.219623,-0.107189,0.403737,0.786779,-1.064377,-0.101889,-0.771218,-0.535247,-0.845008,1.0
2,-0.139841,0.200164,-0.496376,-0.507571,-0.515179,-0.466312,-0.575617,-0.647197,-0.585062,-0.63346,...,-0.419697,-0.625579,-0.440885,-0.650293,-0.350547,-0.445915,-0.736138,-0.424764,0.761636,1.0
3,-0.85054,-0.636296,0.497456,0.579941,0.563298,0.561589,0.548314,0.583136,0.25749,0.655653,...,-0.425684,-0.16341,0.281757,0.104313,-0.37697,0.129556,0.372681,0.516099,0.35103,1.0
4,-0.003496,1.396423,-0.703444,-0.666884,-0.857738,-0.705238,0.108515,0.362386,0.351826,0.470968,...,0.505335,-0.750423,0.014694,-0.708692,1.251529,-0.407222,-0.784945,-0.562421,1.647893,1.0


In [10]:
target = 'LABELS'

In [11]:
#Generates the output but have to delete the autogenerated index column, manually from file before submitting
def generate_result(test_res, filename="Result.csv"):
    result_dataframe = pd.DataFrame(test_data['S.No'])
    result_dataframe['LABELS'] = test_res
    result_dataframe.to_csv(filename)

In [12]:
#X and y creation
X = data.drop(columns=[target])
y = data[target]

#### SVM based NuSVC

In [13]:
from sklearn.svm import NuSVC

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.20, random_state=42)
svm_model = NuSVC(random_state=42, nu=0.3, break_ties=True)

In [14]:
svm_model.fit(X_train,y_train)
result = svm_model.predict(X_test)
print(f1_score(y_test,result))

0.8786834600760457


In [14]:
# Train on complete after hyperparms tuned
svm_model.fit(X,y)

NuSVC(break_ties=True, nu=0.3, random_state=42)

In [15]:
test_results = svm_model.predict(test_data[cols])

In [17]:
#CHeck unique values
np.unique(test_results, return_counts=True)

(array([0., 1.]), array([508, 692], dtype=int64))

In [16]:
generate_result(test_results, "Svm_nusvc.csv")

#### cross val score for different models

In [81]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [82]:
clf_rf = RandomForestClassifier(n_estimators=200, random_state=seed, max_depth=7)
score = cross_val_score(clf_rf, X, y, cv=10, n_jobs=-1)

In [83]:
np.mean(score) 

0.7980322580645162

In [85]:
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import AdaBoostClassifier

In [86]:
clf_grbc = GradientBoostingClassifier(n_estimators=200, random_state=seed, max_depth=7)
clf_xgb = xgb.XGBClassifier(n_estimators=200, random_state=seed, max_depth=7)
clf_lgbm = lgb.LGBMClassifier(n_estimators=200, random_state=seed, max_depth=7)
clf_ada = AdaBoostClassifier(n_estimators=200, random_state=seed)

In [87]:
score_grbc = cross_val_score(clf_grbc, X, y, cv=10, n_jobs=-1)
score_xgb = cross_val_score(clf_xgb, X, y, cv=10, n_jobs=-1)
score_lgbm = cross_val_score(clf_lgbm, X, y, cv=10, n_jobs=-1)
score_ada = cross_val_score(clf_ada, X, y, cv=10, n_jobs=-1)

In [88]:
mean cross val score for the above params
print(np.mean(score_grbc),np.mean(score_xgb),np.mean(score_lgbm), np.mean(score_ada))

0.8460161290322581 0.8447096774193549 0.8418064516129032 0.7909677419354839


#### Catboost based approach

In [13]:
import catboost
from catboost import *

In [14]:
X = data[cols]
X.head()

Unnamed: 0,S2_B2_jan,S2_B3_jan,S2_B4_jan,S2_B5_jan,S2_B6_jan,S2_B7_jan,S2_B8_jan,S2_B8A_jan,S2_B9_jan,S2_B11_jan,...,S2_B9_dec,S2_B11_dec,S2_B12_dec,S1_VV_dec,S1_VH_dec,ERA5_temperature_2m_dec,ERA5_total_precipitation_dec,topo_elevation_dec,topo_slope_dec,NDVI_dec
0,-0.600362,-0.669307,-0.564433,-0.542568,-0.49374,-0.507162,-0.660625,-0.699683,-0.685866,-0.684498,...,-0.616713,-0.604803,-0.395735,0.5941,0.491349,0.581619,-0.850329,2.205969,-0.630313,0.207548
1,-0.145401,0.682995,-0.568777,-0.607338,-0.611189,-0.543737,-0.538967,-0.571041,-0.43089,-0.556113,...,-0.169216,-0.219623,-0.107189,0.403737,0.786779,-1.064377,-0.101889,-0.771218,-0.535247,-0.845008
2,-0.139841,0.200164,-0.496376,-0.507571,-0.515179,-0.466312,-0.575617,-0.647197,-0.585062,-0.63346,...,-0.368727,-0.419697,-0.625579,-0.440885,-0.650293,-0.350547,-0.445915,-0.736138,-0.424764,0.761636
3,-0.85054,-0.636296,0.497456,0.579941,0.563298,0.561589,0.548314,0.583136,0.25749,0.655653,...,-0.466697,-0.425684,-0.16341,0.281757,0.104313,-0.37697,0.129556,0.372681,0.516099,0.35103
4,-0.003496,1.396423,-0.703444,-0.666884,-0.857738,-0.705238,0.108515,0.362386,0.351826,0.470968,...,0.215009,0.505335,-0.750423,0.014694,-0.708692,1.251529,-0.407222,-0.784945,-0.562421,1.647893


In [15]:
y = data['LABELS']
y.head()

0    0.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: LABELS, dtype: float64

In [19]:
#Train test split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)

In [80]:
model = CatBoostClassifier( learning_rate=0.21, loss_function='Logloss', random_seed=42,eval_metric='F1')
#gets 0.96867

In [81]:
model.fit(X_train,y_train)

0:	learn: 0.8145260	total: 31.6ms	remaining: 31.6s
1:	learn: 0.8344833	total: 61.9ms	remaining: 30.9s
2:	learn: 0.8384542	total: 92.1ms	remaining: 30.6s
3:	learn: 0.8361622	total: 124ms	remaining: 30.9s
4:	learn: 0.8396754	total: 156ms	remaining: 31.1s
5:	learn: 0.8418939	total: 187ms	remaining: 31s
6:	learn: 0.8442413	total: 220ms	remaining: 31.2s
7:	learn: 0.8459398	total: 255ms	remaining: 31.7s
8:	learn: 0.8472159	total: 289ms	remaining: 31.8s
9:	learn: 0.8495445	total: 320ms	remaining: 31.6s
10:	learn: 0.8504433	total: 354ms	remaining: 31.8s
11:	learn: 0.8512665	total: 386ms	remaining: 31.8s
12:	learn: 0.8522415	total: 419ms	remaining: 31.8s
13:	learn: 0.8540992	total: 451ms	remaining: 31.8s
14:	learn: 0.8557104	total: 487ms	remaining: 31.9s
15:	learn: 0.8569727	total: 519ms	remaining: 31.9s
16:	learn: 0.8571672	total: 550ms	remaining: 31.8s
17:	learn: 0.8580016	total: 583ms	remaining: 31.8s
18:	learn: 0.8584914	total: 616ms	remaining: 31.8s
19:	learn: 0.8597704	total: 649ms	remain

161:	learn: 0.9156968	total: 5.55s	remaining: 28.7s
162:	learn: 0.9158723	total: 5.58s	remaining: 28.7s
163:	learn: 0.9161370	total: 5.61s	remaining: 28.6s
164:	learn: 0.9162997	total: 5.65s	remaining: 28.6s
165:	learn: 0.9167406	total: 5.68s	remaining: 28.6s
166:	learn: 0.9168338	total: 5.71s	remaining: 28.5s
167:	learn: 0.9169566	total: 5.74s	remaining: 28.4s
168:	learn: 0.9172719	total: 5.78s	remaining: 28.4s
169:	learn: 0.9175543	total: 5.82s	remaining: 28.4s
170:	learn: 0.9179862	total: 5.85s	remaining: 28.4s
171:	learn: 0.9183460	total: 5.89s	remaining: 28.3s
172:	learn: 0.9182913	total: 5.92s	remaining: 28.3s
173:	learn: 0.9186108	total: 5.96s	remaining: 28.3s
174:	learn: 0.9188450	total: 5.99s	remaining: 28.3s
175:	learn: 0.9193577	total: 6.03s	remaining: 28.2s
176:	learn: 0.9195680	total: 6.06s	remaining: 28.2s
177:	learn: 0.9199723	total: 6.1s	remaining: 28.2s
178:	learn: 0.9203005	total: 6.13s	remaining: 28.1s
179:	learn: 0.9206169	total: 6.17s	remaining: 28.1s
180:	learn: 0

325:	learn: 0.9483318	total: 11.5s	remaining: 23.8s
326:	learn: 0.9485912	total: 11.5s	remaining: 23.7s
327:	learn: 0.9486512	total: 11.6s	remaining: 23.7s
328:	learn: 0.9490153	total: 11.6s	remaining: 23.7s
329:	learn: 0.9491580	total: 11.6s	remaining: 23.6s
330:	learn: 0.9491658	total: 11.7s	remaining: 23.6s
331:	learn: 0.9491658	total: 11.7s	remaining: 23.6s
332:	learn: 0.9493014	total: 11.8s	remaining: 23.5s
333:	learn: 0.9494593	total: 11.8s	remaining: 23.5s
334:	learn: 0.9495984	total: 11.8s	remaining: 23.5s
335:	learn: 0.9500857	total: 11.9s	remaining: 23.4s
336:	learn: 0.9500179	total: 11.9s	remaining: 23.4s
337:	learn: 0.9503204	total: 11.9s	remaining: 23.4s
338:	learn: 0.9503035	total: 12s	remaining: 23.3s
339:	learn: 0.9503490	total: 12s	remaining: 23.3s
340:	learn: 0.9504445	total: 12s	remaining: 23.3s
341:	learn: 0.9509900	total: 12.1s	remaining: 23.2s
342:	learn: 0.9510052	total: 12.1s	remaining: 23.2s
343:	learn: 0.9511533	total: 12.1s	remaining: 23.1s
344:	learn: 0.9514

487:	learn: 0.9687422	total: 17.3s	remaining: 18.2s
488:	learn: 0.9688659	total: 17.4s	remaining: 18.2s
489:	learn: 0.9687237	total: 17.4s	remaining: 18.1s
490:	learn: 0.9688463	total: 17.5s	remaining: 18.1s
491:	learn: 0.9690116	total: 17.5s	remaining: 18.1s
492:	learn: 0.9692548	total: 17.5s	remaining: 18s
493:	learn: 0.9693720	total: 17.6s	remaining: 18s
494:	learn: 0.9695502	total: 17.6s	remaining: 18s
495:	learn: 0.9695709	total: 17.6s	remaining: 17.9s
496:	learn: 0.9697578	total: 17.7s	remaining: 17.9s
497:	learn: 0.9698133	total: 17.7s	remaining: 17.8s
498:	learn: 0.9700307	total: 17.7s	remaining: 17.8s
499:	learn: 0.9701741	total: 17.8s	remaining: 17.8s
500:	learn: 0.9701209	total: 17.8s	remaining: 17.7s
501:	learn: 0.9702917	total: 17.8s	remaining: 17.7s
502:	learn: 0.9703492	total: 17.9s	remaining: 17.7s
503:	learn: 0.9705091	total: 17.9s	remaining: 17.6s
504:	learn: 0.9706537	total: 18s	remaining: 17.6s
505:	learn: 0.9708515	total: 18s	remaining: 17.6s
506:	learn: 0.9709950	

650:	learn: 0.9814662	total: 23.1s	remaining: 12.4s
651:	learn: 0.9815735	total: 23.2s	remaining: 12.4s
652:	learn: 0.9817691	total: 23.2s	remaining: 12.3s
653:	learn: 0.9817514	total: 23.2s	remaining: 12.3s
654:	learn: 0.9817331	total: 23.3s	remaining: 12.3s
655:	learn: 0.9818221	total: 23.3s	remaining: 12.2s
656:	learn: 0.9818921	total: 23.3s	remaining: 12.2s
657:	learn: 0.9819307	total: 23.4s	remaining: 12.2s
658:	learn: 0.9820198	total: 23.4s	remaining: 12.1s
659:	learn: 0.9821290	total: 23.5s	remaining: 12.1s
660:	learn: 0.9820937	total: 23.5s	remaining: 12s
661:	learn: 0.9821101	total: 23.5s	remaining: 12s
662:	learn: 0.9821801	total: 23.6s	remaining: 12s
663:	learn: 0.9821612	total: 23.6s	remaining: 11.9s
664:	learn: 0.9821612	total: 23.6s	remaining: 11.9s
665:	learn: 0.9821271	total: 23.7s	remaining: 11.9s
666:	learn: 0.9822174	total: 23.7s	remaining: 11.8s
667:	learn: 0.9824681	total: 23.7s	remaining: 11.8s
668:	learn: 0.9825212	total: 23.8s	remaining: 11.8s
669:	learn: 0.9825

810:	learn: 0.9881661	total: 29.3s	remaining: 6.82s
811:	learn: 0.9882735	total: 29.3s	remaining: 6.78s
812:	learn: 0.9882731	total: 29.3s	remaining: 6.75s
813:	learn: 0.9882552	total: 29.4s	remaining: 6.71s
814:	learn: 0.9883448	total: 29.4s	remaining: 6.68s
815:	learn: 0.9883087	total: 29.5s	remaining: 6.64s
816:	learn: 0.9883436	total: 29.5s	remaining: 6.61s
817:	learn: 0.9883610	total: 29.5s	remaining: 6.57s
818:	learn: 0.9883784	total: 29.6s	remaining: 6.54s
819:	learn: 0.9883962	total: 29.6s	remaining: 6.5s
820:	learn: 0.9883253	total: 29.6s	remaining: 6.46s
821:	learn: 0.9883245	total: 29.7s	remaining: 6.43s
822:	learn: 0.9883971	total: 29.7s	remaining: 6.39s
823:	learn: 0.9884867	total: 29.8s	remaining: 6.35s
824:	learn: 0.9884323	total: 29.8s	remaining: 6.32s
825:	learn: 0.9884854	total: 29.8s	remaining: 6.28s
826:	learn: 0.9884680	total: 29.9s	remaining: 6.25s
827:	learn: 0.9884506	total: 29.9s	remaining: 6.21s
828:	learn: 0.9885211	total: 29.9s	remaining: 6.18s
829:	learn: 0

971:	learn: 0.9916659	total: 35.1s	remaining: 1.01s
972:	learn: 0.9916304	total: 35.2s	remaining: 976ms
973:	learn: 0.9916841	total: 35.2s	remaining: 940ms
974:	learn: 0.9917200	total: 35.2s	remaining: 904ms
975:	learn: 0.9917015	total: 35.3s	remaining: 868ms
976:	learn: 0.9917194	total: 35.3s	remaining: 831ms
977:	learn: 0.9917015	total: 35.4s	remaining: 795ms
978:	learn: 0.9916835	total: 35.4s	remaining: 759ms
979:	learn: 0.9917182	total: 35.4s	remaining: 723ms
980:	learn: 0.9917364	total: 35.5s	remaining: 687ms
981:	learn: 0.9917358	total: 35.5s	remaining: 651ms
982:	learn: 0.9917726	total: 35.5s	remaining: 614ms
983:	learn: 0.9917367	total: 35.6s	remaining: 578ms
984:	learn: 0.9917550	total: 35.6s	remaining: 542ms
985:	learn: 0.9917735	total: 35.6s	remaining: 506ms
986:	learn: 0.9917911	total: 35.7s	remaining: 470ms
987:	learn: 0.9917917	total: 35.7s	remaining: 434ms
988:	learn: 0.9917917	total: 35.7s	remaining: 397ms
989:	learn: 0.9918446	total: 35.8s	remaining: 361ms
990:	learn: 

<catboost.core.CatBoostClassifier at 0x190a9edfeb0>

In [82]:
#Predict results
res = model.predict(X_test)

In [83]:
#F1 score
f1_score(y_test, res)

0.8844301184310601

In [68]:
#CHeck unique values
np.unique(res, return_counts=True)

(array([0., 1.]), array([ 6479, 13981], dtype=int64))

In [69]:
#accuracy
sum(res == y_test)/len(y_test)

0.8403714565004887

In [84]:
test_res = model.predict(test_data[cols])

In [85]:
np.unique(test_res,return_counts=True)

(array([0., 1.]), array([530, 670], dtype=int64))

In [98]:
generate_result(test_res)

#### Random Forest Based method (best in leaderboard)

In [99]:
from sklearn.ensemble import RandomForestClassifier

In [100]:
#Params from grid search CV in the Grid Search Notebook
clf_rf_grid_srch1 = RandomForestClassifier(n_estimators=800, n_jobs=-1, random_state=42, class_weight={0.0: 1.7, 1.0: 1}, criterion='entropy', max_depth=20, max_features='auto')

In [102]:
clf_rf_grid_srch1.fit(X,y)
#Best classifier with f1 of 0.99757

RandomForestClassifier(class_weight={0.0: 1.7, 1.0: 1}, criterion='entropy',
                       max_depth=20, n_estimators=800, n_jobs=-1,
                       random_state=42)

In [114]:
test_res = rf1.predict(test_data[cols])
generate_result(test_res, "Result.csv")