# SuperAI Season 4 - Level 2 Hackathon - Forest Type Classification

## Explore Data

In [22]:
import pandas as pd
import numpy as np

In [23]:
train_df = pd.read_csv('./datasets/train.csv' )
test_df = pd.read_csv('./datasets/test.csv' , index_col='id')

In [24]:
train_df

Unnamed: 0,id,b1,b11,b12,b2,b3,b4,b5,b6,b7,b8,b8_a,b9,nforest_type
0,2002,293,1927,1038,278,475,453,987,1773,2184,1900,2343,3039,MDF
1,3212,197,1598,697,201,347,228,682,1982,2449,2254,2685,2690,DDF
2,13312,929,1975,1031,982,1020,856,1220,2051,2421,2392,2671,2683,MDF
3,17020,132,1560,689,189,408,175,609,2117,2907,3024,3005,2955,MDF
4,5967,241,1944,1131,362,538,487,918,1549,1844,1702,2077,2043,MDF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13048,9185,374,1940,1054,382,565,498,977,1678,1929,2109,2291,2100,DDF
13049,13977,1983,3602,2720,1622,1782,1766,2314,3488,3900,3924,4097,6053,DDF
13050,755,940,2007,1148,975,1080,968,1252,1780,1983,1942,2247,2170,DDF
13051,1616,1174,2312,1190,1112,1126,889,1310,2511,3085,3050,3396,3380,MDF


In [25]:
test_df

Unnamed: 0_level_0,b1,b11,b12,b2,b3,b4,b5,b6,b7,b8,b8_a,b9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
13467,69,1425,693,312,524,376,847,1821,2356,2378,2611,2595
12719,242,1514,691,343,522,324,718,1730,2178,2472,2359,2582
1054,218,2354,1118,292,596,410,965,2586,3226,3371,3645,3149
13747,350,2013,1134,306,572,475,982,1754,1935,2275,2290,2345
9453,185,1450,712,293,440,384,673,1487,1965,2213,2200,2193
...,...,...,...,...,...,...,...,...,...,...,...,...
115,447,1686,811,425,661,441,958,2432,2891,2966,3126,3312
10654,252,2694,1503,470,778,753,1294,2334,2656,2679,3212,2856
5718,233,1486,618,249,409,260,699,2188,2831,3030,3086,3087
13054,221,1840,774,245,441,231,703,2491,3453,3284,3762,3161


## Oversampling And Features Engineer

In [26]:
def oversampling (df) :

    df_oversampling = df[df['nforest_type'] == 'DEF']

    return pd.concat([df , df_oversampling])

In [27]:
train_df = oversampling(train_df)

In [28]:
train_df['id'] = range(0 , len(train_df))

In [29]:
train_df = train_df.set_index('id')

In [30]:
train_df

Unnamed: 0_level_0,b1,b11,b12,b2,b3,b4,b5,b6,b7,b8,b8_a,b9,nforest_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,293,1927,1038,278,475,453,987,1773,2184,1900,2343,3039,MDF
1,197,1598,697,201,347,228,682,1982,2449,2254,2685,2690,DDF
2,929,1975,1031,982,1020,856,1220,2051,2421,2392,2671,2683,MDF
3,132,1560,689,189,408,175,609,2117,2907,3024,3005,2955,MDF
4,241,1944,1131,362,538,487,918,1549,1844,1702,2077,2043,MDF
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15633,182,1336,574,188,296,145,571,2000,2473,2243,2884,2844,DEF
15634,1,1753,820,106,431,301,840,2091,2472,2762,2914,2440,DEF
15635,281,1837,835,342,718,285,975,2801,3556,3637,3801,3148,DEF
15636,1656,1955,965,1792,1972,1786,2013,2426,2637,2570,2839,2825,DEF


In [31]:
train_df['nforest_type'].value_counts()

nforest_type
MDF    5865
DEF    5170
DDF    4603
Name: count, dtype: int64

In [32]:
def add_features(row) :

    row['NDVI'] = (row['b8'] - row['b4']) / (row['b8'] + row['b4'])
    row['EVI'] = 2.5 * ((row['b8'] - row['b4']) / (row['b8'] + 6 * row['b4'] - 7.5 * row['b2'] + 1.01))
    row['NDWI '] = (row['b3'] - row['b8']) / (row['b3'] + row['b8'])
    row['SAVI '] = (row['b8'] - row['b4']) * (1 + 0.5) / (row['b8'] + row['b4'] + 0.5)
    row['MSAVI'] = (2 * row['b8'] + 1 - ( (2 * row['b8'] + 1) ** 2 - 8 * (row['b8'] - row['b4'])) ** (1 / 2)) / 2
    row['GNDVI '] = (row['b8'] - row['b3']) / (row['b8'] + row['b3'])
    row['RENDVI '] = (row['b8'] - row['b5']) / (row['b8'] + row['b5'])
    row['NDMI '] = (row['b8'] - row['b11']) / (row['b8'] + row['b11'])
    row['GRVI'] = (row['b3'] - row['b4']) / (row['b3'] + row['b4'])
    row['TVI'] = ( (row['b8'] - row['b4']) / (row['b8'] + row['b4'] + 0.5) ) ** (1 / 2)
    row['MCARI'] = ((row['b5'] - row['b4']) - 0.2 * (row['b5'] - row['b3'])) / (row['b5'] / row['b4'])
    row['BSI'] =  ((row['b11'] + row['b4']) - (row['b8'] + row['b2'])) / ((row['b11'] + row['b4']) + (row['b8'] + row['b2']))
    row['NBR'] = (row['b8'] - row['b12']) / (row['b8'] + row['b12'])
    row['MSI'] = row['b11'] / row['b8']

    return row

In [33]:
train_df = train_df.apply(add_features , axis = 1)
test_df = test_df.apply(add_features , axis = 1)

## Resampling

In [44]:
from imblearn.over_sampling import SMOTE

# Assuming X_train and y_train are your training data and labels
smote = SMOTE(random_state=42)

train_df , label_df  = smote.fit_resample(train_df.drop(columns=['nforest_type']) , train_df['nforest_type'])

In [45]:
train_df

Unnamed: 0,b1,b11,b12,b2,b3,b4,b5,b6,b7,b8,...,MSAVI,GNDVI,RENDVI,NDMI,GRVI,TVI,MCARI,BSI,NBR,MSI
0,293,1927,1038,278,475,453,987,1773,2184,1900,...,0.761531,0.600000,0.316245,-0.007055,0.023707,0.784110,198.089970,0.044318,0.293397,1.014211
1,197,1598,697,201,347,228,682,1982,2449,2254,...,0.898826,0.733180,0.535422,0.170301,0.206957,0.903390,129.378299,-0.146928,0.527618,0.708962
2,929,1975,1031,982,1020,856,1220,2051,2421,2392,...,0.642092,0.402110,0.324474,0.095489,0.087420,0.687629,227.331148,-0.087510,0.397604,0.825669
3,132,1560,689,189,408,175,609,2117,2907,3024,...,0.942121,0.762238,0.664740,0.319372,0.399657,0.943637,113.160920,-0.298707,0.628872,0.515873
4,241,1944,1131,362,538,487,918,1549,1844,1702,...,0.713806,0.519643,0.299237,-0.066374,0.049756,0.744930,188.327887,0.081646,0.201553,1.142186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17590,167,1310,645,220,389,259,588,1538,2015,2140,...,0.878934,0.692139,0.568634,0.240784,0.201638,0.885399,127.327323,-0.201631,0.536628,0.611953
17591,368,2265,1158,589,826,784,1318,2311,2669,2983,...,0.736196,0.566452,0.387341,0.136692,0.030173,0.763644,256.655452,-0.079302,0.440882,0.759567
17592,202,1306,581,180,381,201,494,2161,2826,3353,...,0.940045,0.795929,0.743177,0.439365,0.309278,0.941681,110.021053,-0.401984,0.704626,0.389502
17593,140,1315,549,222,351,216,469,1708,2081,2747,...,0.920959,0.773124,0.708253,0.352357,0.236888,0.923795,105.530532,-0.319095,0.666806,0.478910


In [46]:
label_df

0        MDF
1        DDF
2        MDF
3        MDF
4        MDF
        ... 
17590    DEF
17591    DEF
17592    DEF
17593    DEF
17594    DEF
Name: nforest_type, Length: 17595, dtype: object

In [47]:
train_df = train_df.join(label_df)

In [48]:
train_df

Unnamed: 0,b1,b11,b12,b2,b3,b4,b5,b6,b7,b8,...,GNDVI,RENDVI,NDMI,GRVI,TVI,MCARI,BSI,NBR,MSI,nforest_type
0,293,1927,1038,278,475,453,987,1773,2184,1900,...,0.600000,0.316245,-0.007055,0.023707,0.784110,198.089970,0.044318,0.293397,1.014211,MDF
1,197,1598,697,201,347,228,682,1982,2449,2254,...,0.733180,0.535422,0.170301,0.206957,0.903390,129.378299,-0.146928,0.527618,0.708962,DDF
2,929,1975,1031,982,1020,856,1220,2051,2421,2392,...,0.402110,0.324474,0.095489,0.087420,0.687629,227.331148,-0.087510,0.397604,0.825669,MDF
3,132,1560,689,189,408,175,609,2117,2907,3024,...,0.762238,0.664740,0.319372,0.399657,0.943637,113.160920,-0.298707,0.628872,0.515873,MDF
4,241,1944,1131,362,538,487,918,1549,1844,1702,...,0.519643,0.299237,-0.066374,0.049756,0.744930,188.327887,0.081646,0.201553,1.142186,MDF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17590,167,1310,645,220,389,259,588,1538,2015,2140,...,0.692139,0.568634,0.240784,0.201638,0.885399,127.327323,-0.201631,0.536628,0.611953,DEF
17591,368,2265,1158,589,826,784,1318,2311,2669,2983,...,0.566452,0.387341,0.136692,0.030173,0.763644,256.655452,-0.079302,0.440882,0.759567,DEF
17592,202,1306,581,180,381,201,494,2161,2826,3353,...,0.795929,0.743177,0.439365,0.309278,0.941681,110.021053,-0.401984,0.704626,0.389502,DEF
17593,140,1315,549,222,351,216,469,1708,2081,2747,...,0.773124,0.708253,0.352357,0.236888,0.923795,105.530532,-0.319095,0.666806,0.478910,DEF


In [52]:
train_df['nforest_type'].value_counts()

nforest_type
MDF    5865
DDF    5865
DEF    5865
Name: count, dtype: int64

## AutoGluon

In [53]:
from autogluon.tabular import TabularPredictor

In [54]:
label = 'nforest_type'

In [55]:
predictor = TabularPredictor(label = label).fit(train_df)

No path specified. Models will be saved in: "AutogluonModels\ag-20240603_101157"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20240603_101157"
AutoGluon Version:  1.1.0
Python Version:     3.11.7
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          16
Memory Avail:       1.08 GB / 15.28 GB (7.0%)
Disk Space Avail:   627.8

[1000]	valid_set's multi_error: 0.228977
[2000]	valid_set's multi_error: 0.200568
[3000]	valid_set's multi_error: 0.189773
[4000]	valid_set's multi_error: 0.182955
[5000]	valid_set's multi_error: 0.182386
[6000]	valid_set's multi_error: 0.18125
[7000]	valid_set's multi_error: 0.18125
[8000]	valid_set's multi_error: 0.180682


	0.8216	 = Validation score   (accuracy)
	17.44s	 = Training   runtime
	0.63s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's multi_error: 0.194318
[2000]	valid_set's multi_error: 0.189773


	0.817	 = Validation score   (accuracy)
	5.7s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: RandomForestGini ...
	0.7937	 = Validation score   (accuracy)
	2.75s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: RandomForestEntr ...
	0.7966	 = Validation score   (accuracy)
	3.76s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: CatBoost ...
	0.8142	 = Validation score   (accuracy)
	207.89s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesGini ...
	0.7898	 = Validation score   (accuracy)
	0.52s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: ExtraTreesEntr ...
	0.7892	 = Validation score   (accuracy)
	0.58s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: XGBoost ...
	0.8102	 = Validation score   (accuracy)
	10.08s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	0.8085	 = Validation score   (accuracy)
	73.74s	 = Training   runtime
	0.02s	

[1000]	valid_set's multi_error: 0.182386


	0.8193	 = Validation score   (accuracy)
	9.69s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'KNeighborsDist': 0.333, 'LightGBMXT': 0.333, 'NeuralNetTorch': 0.333}
	0.8375	 = Validation score   (accuracy)
	0.16s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 348.16s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels\ag-20240603_101157")


In [56]:
prediction = predictor.predict(test_df)

In [57]:
prediction

id
13467    DEF
12719    DDF
1054     MDF
13747    DDF
9453     DEF
        ... 
115      MDF
10654    MDF
5718     DDF
13054    MDF
6539     DEF
Name: nforest_type, Length: 4000, dtype: object

In [58]:
prediction.value_counts()

nforest_type
MDF    1546
DDF    1529
DEF     925
Name: count, dtype: int64

In [60]:
submission_path = './submissions'
prediction.to_csv(f'{submission_path}/submission_over_sampling_over_features_smote.csv')