# SuperAI Season 4 - Level 2 Hackathon - Forest Type Classification

## Explore Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
train_df = pd.read_csv('./datasets/train.csv' )
test_df = pd.read_csv('./datasets/test.csv' , index_col='id')

In [3]:
train_df

Unnamed: 0,id,b1,b11,b12,b2,b3,b4,b5,b6,b7,b8,b8_a,b9,nforest_type
0,2002,293,1927,1038,278,475,453,987,1773,2184,1900,2343,3039,MDF
1,3212,197,1598,697,201,347,228,682,1982,2449,2254,2685,2690,DDF
2,13312,929,1975,1031,982,1020,856,1220,2051,2421,2392,2671,2683,MDF
3,17020,132,1560,689,189,408,175,609,2117,2907,3024,3005,2955,MDF
4,5967,241,1944,1131,362,538,487,918,1549,1844,1702,2077,2043,MDF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13048,9185,374,1940,1054,382,565,498,977,1678,1929,2109,2291,2100,DDF
13049,13977,1983,3602,2720,1622,1782,1766,2314,3488,3900,3924,4097,6053,DDF
13050,755,940,2007,1148,975,1080,968,1252,1780,1983,1942,2247,2170,DDF
13051,1616,1174,2312,1190,1112,1126,889,1310,2511,3085,3050,3396,3380,MDF


In [4]:
test_df

Unnamed: 0_level_0,b1,b11,b12,b2,b3,b4,b5,b6,b7,b8,b8_a,b9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
13467,69,1425,693,312,524,376,847,1821,2356,2378,2611,2595
12719,242,1514,691,343,522,324,718,1730,2178,2472,2359,2582
1054,218,2354,1118,292,596,410,965,2586,3226,3371,3645,3149
13747,350,2013,1134,306,572,475,982,1754,1935,2275,2290,2345
9453,185,1450,712,293,440,384,673,1487,1965,2213,2200,2193
...,...,...,...,...,...,...,...,...,...,...,...,...
115,447,1686,811,425,661,441,958,2432,2891,2966,3126,3312
10654,252,2694,1503,470,778,753,1294,2334,2656,2679,3212,2856
5718,233,1486,618,249,409,260,699,2188,2831,3030,3086,3087
13054,221,1840,774,245,441,231,703,2491,3453,3284,3762,3161


## Features Engineer / SMOTE

In [5]:
train_df = train_df.set_index('id')

In [6]:
train_df

Unnamed: 0_level_0,b1,b11,b12,b2,b3,b4,b5,b6,b7,b8,b8_a,b9,nforest_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2002,293,1927,1038,278,475,453,987,1773,2184,1900,2343,3039,MDF
3212,197,1598,697,201,347,228,682,1982,2449,2254,2685,2690,DDF
13312,929,1975,1031,982,1020,856,1220,2051,2421,2392,2671,2683,MDF
17020,132,1560,689,189,408,175,609,2117,2907,3024,3005,2955,MDF
5967,241,1944,1131,362,538,487,918,1549,1844,1702,2077,2043,MDF
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9185,374,1940,1054,382,565,498,977,1678,1929,2109,2291,2100,DDF
13977,1983,3602,2720,1622,1782,1766,2314,3488,3900,3924,4097,6053,DDF
755,940,2007,1148,975,1080,968,1252,1780,1983,1942,2247,2170,DDF
1616,1174,2312,1190,1112,1126,889,1310,2511,3085,3050,3396,3380,MDF


In [7]:
train_df['nforest_type'].value_counts()

nforest_type
MDF    5865
DDF    4603
DEF    2585
Name: count, dtype: int64

In [8]:
from imblearn.over_sampling import SMOTE , KMeansSMOTE
from imblearn.combine import SMOTEENN , SMOTETomek

smote = SMOTE(random_state = 42 , sampling_strategy= 'all')

train_df , label_df  = smote.fit_resample(train_df.drop(columns=['nforest_type']) , train_df['nforest_type'])
train_df = train_df.join(label_df)
train_df['nforest_type'].value_counts()

nforest_type
MDF    5865
DDF    5865
DEF    5865
Name: count, dtype: int64

In [9]:
def add_features(row) :

    
    row['NDVI'] = (row['b8'] - row['b4']) / (row['b8'] + row['b4'])
    row['EVI'] = 2.5 * ((row['b8'] - row['b4']) / (row['b8'] + 6 * row['b4'] - 7.5 * row['b2'] + 1.01))
    row['NDWI '] = (row['b3'] - row['b8']) / (row['b3'] + row['b8'])
    row['SAVI '] = (row['b8'] - row['b4']) * (1 + 0.5) / (row['b8'] + row['b4'] + 0.5)
    row['MSAVI'] = (2 * row['b8'] + 1 - ( (2 * row['b8'] + 1) ** 2 - 8 * (row['b8'] - row['b4'])) ** (1 / 2)) / 2
    row['GNDVI '] = (row['b8'] - row['b3']) / (row['b8'] + row['b3'])
    row['RENDVI '] = (row['b8'] - row['b5']) / (row['b8'] + row['b5'])
    row['NDMI '] = (row['b8'] - row['b11']) / (row['b8'] + row['b11'])
    row['GRVI'] = (row['b3'] - row['b4']) / (row['b3'] + row['b4'])
    row['TVI'] = ( (row['b8'] - row['b4']) / (row['b8'] + row['b4'] + 0.5) ) ** (1 / 2)
    row['MCARI'] = ((row['b5'] - row['b4']) - 0.2 * (row['b5'] - row['b3'])) / (row['b5'] / row['b4'])
    row['BSI'] =  ((row['b11'] + row['b4']) - (row['b8'] + row['b2'])) / ((row['b11'] + row['b4']) + (row['b8'] + row['b2']))
    row['NBR'] = (row['b8'] - row['b12']) / (row['b8'] + row['b12'])
    row['MSI'] = row['b11'] / row['b8']
    
    return row

In [10]:
train_df = train_df.apply(add_features , axis = 1)
test_df = test_df.apply(add_features , axis = 1)

In [11]:
train_df

Unnamed: 0,b1,b11,b12,b2,b3,b4,b5,b6,b7,b8,...,MSAVI,GNDVI,RENDVI,NDMI,GRVI,TVI,MCARI,BSI,NBR,MSI
0,293,1927,1038,278,475,453,987,1773,2184,1900,...,0.761531,0.600000,0.316245,-0.007055,0.023707,0.784110,198.089970,0.044318,0.293397,1.014211
1,197,1598,697,201,347,228,682,1982,2449,2254,...,0.898826,0.733180,0.535422,0.170301,0.206957,0.903390,129.378299,-0.146928,0.527618,0.708962
2,929,1975,1031,982,1020,856,1220,2051,2421,2392,...,0.642092,0.402110,0.324474,0.095489,0.087420,0.687629,227.331148,-0.087510,0.397604,0.825669
3,132,1560,689,189,408,175,609,2117,2907,3024,...,0.942121,0.762238,0.664740,0.319372,0.399657,0.943637,113.160920,-0.298707,0.628872,0.515873
4,241,1944,1131,362,538,487,918,1549,1844,1702,...,0.713806,0.519643,0.299237,-0.066374,0.049756,0.744930,188.327887,0.081646,0.201553,1.142186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17590,102,1567,693,210,436,265,706,2227,2717,2951,...,0.910186,0.742545,0.613891,0.306330,0.243937,0.913821,145.262040,-0.266173,0.619649,0.531006
17591,1700,1926,965,1918,1946,1860,2074,2528,2726,2692,...,0.309024,0.160845,0.129668,0.165873,0.022596,0.427501,168.960463,-0.098142,0.472245,0.715453
17592,252,1940,990,375,626,471,960,2049,2372,2987,...,0.842294,0.653474,0.513555,0.212503,0.141294,0.852926,207.141875,-0.164732,0.502137,0.649481
17593,419,2185,1225,478,654,647,1049,1862,2167,2183,...,0.703571,0.538950,0.350866,-0.000458,0.005380,0.736655,199.219256,0.031131,0.281103,1.000916


In [12]:
train_df.columns

Index(['b1', 'b11', 'b12', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'b8', 'b8_a',
       'b9', 'nforest_type', 'NDVI', 'EVI', 'NDWI ', 'SAVI ', 'MSAVI',
       'GNDVI ', 'RENDVI ', 'NDMI ', 'GRVI', 'TVI', 'MCARI', 'BSI', 'NBR',
       'MSI'],
      dtype='object')

In [22]:
id2label = {
    0 : 'MDF' ,
    1 : 'DDF' ,
    2 : 'DEF' 
}
label2id = {
    'MDF' : 0 ,
    'DDF' : 1 ,
    'DEF' : 2 
}

In [23]:
def convert_label (row) :
    
    row['nforest_type'] = label2id[row['nforest_type']]
    
    return row

In [None]:
train_df = train_df.apply(convert_label , axis = 1)

## Catboost Classifier

In [56]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [132]:
catboost_model = CatBoostClassifier(learning_rate = 0.1, depth = 12 , n_estimators = 2000 , random_seed = 42)

In [133]:
x_train, x_test, y_train, y_test = train_test_split(train_df.drop(columns = 'nforest_type') , train_df['nforest_type'] , test_size = 0.20 , random_state = 42)

In [134]:
catboost_model.fit(x_train , y_train , verbose = 100)

0:	learn: 1.0612791	total: 368ms	remaining: 12m 15s
100:	learn: 0.5447309	total: 33.5s	remaining: 10m 30s
200:	learn: 0.3987076	total: 1m 5s	remaining: 9m 49s
300:	learn: 0.3072528	total: 1m 38s	remaining: 9m 14s
400:	learn: 0.2453936	total: 2m 10s	remaining: 8m 40s
500:	learn: 0.2006882	total: 2m 43s	remaining: 8m 9s
600:	learn: 0.1673098	total: 3m 16s	remaining: 7m 37s
700:	learn: 0.1418107	total: 3m 49s	remaining: 7m 5s
800:	learn: 0.1215223	total: 4m 22s	remaining: 6m 33s
900:	learn: 0.1053827	total: 4m 55s	remaining: 6m
1000:	learn: 0.0936261	total: 5m 28s	remaining: 5m 28s
1100:	learn: 0.0834695	total: 6m 1s	remaining: 4m 55s
1200:	learn: 0.0745525	total: 6m 33s	remaining: 4m 21s
1300:	learn: 0.0672307	total: 7m 5s	remaining: 3m 48s
1400:	learn: 0.0609461	total: 7m 37s	remaining: 3m 15s
1500:	learn: 0.0556334	total: 8m 9s	remaining: 2m 42s
1600:	learn: 0.0510121	total: 8m 41s	remaining: 2m 9s
1700:	learn: 0.0470197	total: 9m 13s	remaining: 1m 37s
1800:	learn: 0.0434707	total: 9m 

<catboost.core.CatBoostClassifier at 0x1708cca6650>

In [136]:
print('accuracy score =' , accuracy_score(catboost_model.predict(x_test) , y_test))

accuracy score = 0.789712986643933


In [137]:
catboost_model.predict(test_df)

array([[2],
       [0],
       [0],
       ...,
       [2],
       [0],
       [2]], dtype=int64)

## Xgboost

In [139]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [140]:
xgboost_model = XGBClassifier(n_estimators = 2000 , max_depth = 12, learning_rate = 0.1 , objective = 'multi:softmax')

In [141]:
x_train, x_test, y_train, y_test = train_test_split(train_df.drop(columns = 'nforest_type') , train_df['nforest_type'] , test_size = 0.20 , random_state = 42)

In [142]:
xgboost_model.fit(x_train , y_train)


In [143]:
print('accuracy score =' , accuracy_score(xgboost_model.predict(x_test) , y_test))

accuracy score = 0.7908496732026143


In [145]:
xgboost_model.predict(test_df)

array([2, 0, 0, ..., 2, 0, 2])

## Random Forest Classifier

## LightGBM

## AutoGluon

In [13]:
from autogluon.tabular import TabularPredictor

In [14]:
label = 'nforest_type'

In [15]:
predictor = TabularPredictor(label = label).fit(train_df)

No path specified. Models will be saved in: "AutogluonModels\ag-20240608_070541"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20240608_070541"
AutoGluon Version:  1.1.0
Python Version:     3.11.7
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          16
Memory Avail:       1.99 GB / 15.28 GB (13.0%)
Disk Space Avail:   540.

[1000]	valid_set's multi_error: 0.236364
[2000]	valid_set's multi_error: 0.217045
[3000]	valid_set's multi_error: 0.213068
[4000]	valid_set's multi_error: 0.211932
[5000]	valid_set's multi_error: 0.208523
[6000]	valid_set's multi_error: 0.209659


	0.7943	 = Validation score   (accuracy)
	33.41s	 = Training   runtime
	0.81s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's multi_error: 0.226136


	0.7852	 = Validation score   (accuracy)
	11.85s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: RandomForestGini ...
	0.7602	 = Validation score   (accuracy)
	3.53s	 = Training   runtime
	0.11s	 = Validation runtime
Fitting model: RandomForestEntr ...
	0.7614	 = Validation score   (accuracy)
	4.98s	 = Training   runtime
	0.12s	 = Validation runtime
Fitting model: CatBoost ...
	0.792	 = Validation score   (accuracy)
	356.89s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: ExtraTreesGini ...
	0.7659	 = Validation score   (accuracy)
	1.24s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: ExtraTreesEntr ...
	0.7534	 = Validation score   (accuracy)
	0.96s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: XGBoost ...
	0.7807	 = Validation score   (accuracy)
	4.6s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	0.7795	 = Validation score   (accuracy)
	44.0s	 = Training   runtime
	0.02s	 

In [16]:
predictor.refit_full()

Refitting models via `predictor.refit_full` using all of the data (combined train and validation)...
	Models trained in this way will have the suffix "_FULL" and have NaN validation score.
	This process is not bound by time_limit, but should take less time than the original `predictor.fit` call.
	To learn more, refer to the `.refit_full` method docstring which explains how "_FULL" models differ from normal models.
Fitting 1 L1 models ...
Fitting model: KNeighborsUnif_FULL ...
	0.02s	 = Training   runtime
Fitting 1 L1 models ...
Fitting model: KNeighborsDist_FULL ...
	0.02s	 = Training   runtime
Fitting 1 L1 models ...
Fitting model: NeuralNetFastAI_FULL ...
No improvement since epoch 0: early stopping
	9.54s	 = Training   runtime
Fitting 1 L1 models ...
Fitting model: LightGBMXT_FULL ...
	13.45s	 = Training   runtime
Fitting 1 L1 models ...
Fitting model: LightGBM_FULL ...
	4.79s	 = Training   runtime
Fitting 1 L1 models ...
Fitting model: RandomForestGini_FULL ...
	3.25s	 = Training  

{'KNeighborsUnif': 'KNeighborsUnif_FULL',
 'KNeighborsDist': 'KNeighborsDist_FULL',
 'NeuralNetFastAI': 'NeuralNetFastAI_FULL',
 'LightGBMXT': 'LightGBMXT_FULL',
 'LightGBM': 'LightGBM_FULL',
 'RandomForestGini': 'RandomForestGini_FULL',
 'RandomForestEntr': 'RandomForestEntr_FULL',
 'CatBoost': 'CatBoost_FULL',
 'ExtraTreesGini': 'ExtraTreesGini_FULL',
 'ExtraTreesEntr': 'ExtraTreesEntr_FULL',
 'XGBoost': 'XGBoost_FULL',
 'NeuralNetTorch': 'NeuralNetTorch_FULL',
 'LightGBMLarge': 'LightGBMLarge_FULL',
 'WeightedEnsemble_L2': 'WeightedEnsemble_L2_FULL'}

In [17]:
predictor.fit_pseudolabel(test_df)

Given test_data for pseudo labeling did not contain labels. AutoGluon will assign pseudo labels to data and use it for extra training data...
Beginning iteration 1 of pseudolabeling out of max 3
Pseudolabeling algorithm confidently assigned pseudolabels to 18 rows of data on iteration 1. Adding to train data
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif_PSEUDO_1 ...
	0.7159	 = Validation score   (accuracy)
	0.02s	 = Training   runtime
	0.12s	 = Validation runtime
Fitting model: KNeighborsDist_PSEUDO_1 ...
	0.7551	 = Validation score   (accuracy)
	0.04s	 = Training   runtime
	0.13s	 = Validation runtime
Fitting model: NeuralNetFastAI_PSEUDO_1 ...
	0.767	 = Validation score   (accuracy)
	16.52s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: LightGBMXT_PSEUDO_1 ...


[1000]	valid_set's multi_error: 0.225568
[2000]	valid_set's multi_error: 0.214773
[3000]	valid_set's multi_error: 0.206818
[4000]	valid_set's multi_error: 0.202841
[5000]	valid_set's multi_error: 0.202841


	0.8006	 = Validation score   (accuracy)
	14.76s	 = Training   runtime
	0.46s	 = Validation runtime
Fitting model: LightGBM_PSEUDO_1 ...


[1000]	valid_set's multi_error: 0.222159
[2000]	valid_set's multi_error: 0.213068


	0.7898	 = Validation score   (accuracy)
	6.38s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: RandomForestGini_PSEUDO_1 ...
	0.7636	 = Validation score   (accuracy)
	3.03s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: RandomForestEntr_PSEUDO_1 ...
	0.7591	 = Validation score   (accuracy)
	4.55s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: CatBoost_PSEUDO_1 ...
	0.7892	 = Validation score   (accuracy)
	297.56s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: ExtraTreesGini_PSEUDO_1 ...
	0.7551	 = Validation score   (accuracy)
	1.29s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: ExtraTreesEntr_PSEUDO_1 ...
	0.7597	 = Validation score   (accuracy)
	1.13s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: XGBoost_PSEUDO_1 ...
	0.7926	 = Validation score   (accuracy)
	16.53s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: NeuralNetTorch_PSEUDO_1 ...
	0.7812	 = Va

[1000]	valid_set's multi_error: 0.208523


	0.7937	 = Validation score   (accuracy)
	12.33s	 = Training   runtime
	0.09s	 = Validation runtime
Fitting model: WeightedEnsemble_L2_PSEUDO_1 ...
	Ensemble Weights: {'KNeighborsDist_PSEUDO_1': 0.231, 'LightGBMXT_PSEUDO_1': 0.231, 'ExtraTreesGini_PSEUDO_1': 0.154, 'ExtraTreesEntr_PSEUDO_1': 0.154, 'NeuralNetFastAI_PSEUDO_1': 0.077, 'RandomForestGini_PSEUDO_1': 0.077, 'NeuralNetTorch_PSEUDO_1': 0.077}
	0.8108	 = Validation score   (accuracy)
	0.15s	 = Training   runtime
	0.0s	 = Validation runtime
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels\ag-20240608_070541")
Pseudolabeling algorithm changed validation score from: 0.8096590909090909, to: 0.8096590909090909 using evaluation metric: accuracy


<autogluon.tabular.predictor.predictor.TabularPredictor at 0x29bff936b90>

In [18]:
prediction = predictor.predict(test_df)

In [19]:
prediction

id
13467    DEF
12719    MDF
1054     MDF
13747    DDF
9453     DEF
        ... 
115      MDF
10654    MDF
5718     DDF
13054    MDF
6539     DEF
Name: nforest_type, Length: 4000, dtype: object

In [20]:
features_importance =predictor.feature_importance(train_df)
features_importance

Computing feature importance via permutation shuffling for 26 features using 5000 rows with 5 shuffle sets...
	415.33s	= Expected runtime (83.07s per shuffle set)
	255.68s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
b11,0.237,0.006192,5.585948e-08,5,0.249749,0.224251
b8_a,0.18632,0.007292,2.809461e-07,5,0.201334,0.171306
b9,0.18416,0.005183,7.523364e-08,5,0.194833,0.173487
b7,0.11564,0.004348,2.394538e-07,5,0.124593,0.106687
b6,0.097,0.004781,7.060593e-07,5,0.106845,0.087155
b8,0.07764,0.003235,3.610453e-07,5,0.084302,0.070978
b5,0.07108,0.002369,1.478373e-07,5,0.075958,0.066202
b12,0.06892,0.005444,4.631464e-06,5,0.080128,0.057712
b1,0.05492,0.001814,1.427404e-07,5,0.058656,0.051184
b2,0.0418,0.003891,8.906906e-06,5,0.049812,0.033788


In [21]:
features_importance.to_csv('./features/features3class.csv')

In [22]:
prediction.value_counts()

nforest_type
DDF    1529
MDF    1513
DEF     958
Name: count, dtype: int64

In [23]:
submission_path = './submissions'
prediction.to_csv(f'{submission_path}/submission_addfeatures_SMOTE_pseudolabeling.csv')