# SuperAI Season 4 - Level 2 Hackathon - Forest Type Classification

## Explore Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
train_df = pd.read_csv('./datasets/train.csv' )
test_df = pd.read_csv('./datasets/test.csv' , index_col='id')

In [3]:
train_df

Unnamed: 0,id,b1,b11,b12,b2,b3,b4,b5,b6,b7,b8,b8_a,b9,nforest_type
0,2002,293,1927,1038,278,475,453,987,1773,2184,1900,2343,3039,MDF
1,3212,197,1598,697,201,347,228,682,1982,2449,2254,2685,2690,DDF
2,13312,929,1975,1031,982,1020,856,1220,2051,2421,2392,2671,2683,MDF
3,17020,132,1560,689,189,408,175,609,2117,2907,3024,3005,2955,MDF
4,5967,241,1944,1131,362,538,487,918,1549,1844,1702,2077,2043,MDF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13048,9185,374,1940,1054,382,565,498,977,1678,1929,2109,2291,2100,DDF
13049,13977,1983,3602,2720,1622,1782,1766,2314,3488,3900,3924,4097,6053,DDF
13050,755,940,2007,1148,975,1080,968,1252,1780,1983,1942,2247,2170,DDF
13051,1616,1174,2312,1190,1112,1126,889,1310,2511,3085,3050,3396,3380,MDF


In [4]:
test_df

Unnamed: 0_level_0,b1,b11,b12,b2,b3,b4,b5,b6,b7,b8,b8_a,b9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
13467,69,1425,693,312,524,376,847,1821,2356,2378,2611,2595
12719,242,1514,691,343,522,324,718,1730,2178,2472,2359,2582
1054,218,2354,1118,292,596,410,965,2586,3226,3371,3645,3149
13747,350,2013,1134,306,572,475,982,1754,1935,2275,2290,2345
9453,185,1450,712,293,440,384,673,1487,1965,2213,2200,2193
...,...,...,...,...,...,...,...,...,...,...,...,...
115,447,1686,811,425,661,441,958,2432,2891,2966,3126,3312
10654,252,2694,1503,470,778,753,1294,2334,2656,2679,3212,2856
5718,233,1486,618,249,409,260,699,2188,2831,3030,3086,3087
13054,221,1840,774,245,441,231,703,2491,3453,3284,3762,3161


## Features Engineer

In [5]:
train_df['id'] = range(0 , len(train_df))

In [6]:
train_df = train_df.set_index('id')

In [7]:
train_df

Unnamed: 0_level_0,b1,b11,b12,b2,b3,b4,b5,b6,b7,b8,b8_a,b9,nforest_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,293,1927,1038,278,475,453,987,1773,2184,1900,2343,3039,MDF
1,197,1598,697,201,347,228,682,1982,2449,2254,2685,2690,DDF
2,929,1975,1031,982,1020,856,1220,2051,2421,2392,2671,2683,MDF
3,132,1560,689,189,408,175,609,2117,2907,3024,3005,2955,MDF
4,241,1944,1131,362,538,487,918,1549,1844,1702,2077,2043,MDF
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13048,374,1940,1054,382,565,498,977,1678,1929,2109,2291,2100,DDF
13049,1983,3602,2720,1622,1782,1766,2314,3488,3900,3924,4097,6053,DDF
13050,940,2007,1148,975,1080,968,1252,1780,1983,1942,2247,2170,DDF
13051,1174,2312,1190,1112,1126,889,1310,2511,3085,3050,3396,3380,MDF


In [8]:
train_df['nforest_type'].value_counts()

nforest_type
MDF    5865
DDF    4603
DEF    2585
Name: count, dtype: int64

In [9]:
def add_features(row) :

    row['NDVI'] = (row['b8'] - row['b4']) / (row['b8'] + row['b4'])
    row['EVI'] = 2.5 * ((row['b8'] - row['b4']) / (row['b8'] + 6 * row['b4'] - 7.5 * row['b2'] + 1.01))
    row['NDWI '] = (row['b3'] - row['b8']) / (row['b3'] + row['b8'])
    row['SAVI '] = (row['b8'] - row['b4']) * (1 + 0.5) / (row['b8'] + row['b4'] + 0.5)
    row['MSAVI'] = (2 * row['b8'] + 1 - ( (2 * row['b8'] + 1) ** 2 - 8 * (row['b8'] - row['b4'])) ** (1 / 2)) / 2
    row['GNDVI '] = (row['b8'] - row['b3']) / (row['b8'] + row['b3'])
    row['RENDVI '] = (row['b8'] - row['b5']) / (row['b8'] + row['b5'])
    row['NDMI '] = (row['b8'] - row['b11']) / (row['b8'] + row['b11'])
    row['GRVI'] = (row['b3'] - row['b4']) / (row['b3'] + row['b4'])
    row['TVI'] = ( (row['b8'] - row['b4']) / (row['b8'] + row['b4'] + 0.5) ) ** (1 / 2)
    row['MCARI'] = ((row['b5'] - row['b4']) - 0.2 * (row['b5'] - row['b3'])) / (row['b5'] / row['b4'])
    row['BSI'] =  ((row['b11'] + row['b4']) - (row['b8'] + row['b2'])) / ((row['b11'] + row['b4']) + (row['b8'] + row['b2']))
    row['NBR'] = (row['b8'] - row['b12']) / (row['b8'] + row['b12'])
    row['MSI'] = row['b11'] / row['b8']

    return row

In [10]:
def drop_features(df) :

    return df.drop(columns = ['b1'])

In [11]:
train_df = train_df.apply(add_features , axis = 1)
test_df = test_df.apply(add_features , axis = 1)

In [12]:
train_df = drop_features(train_df)
test_df = drop_features(test_df)

## Resampling

In [13]:
from imblearn.over_sampling import SMOTE , KMeansSMOTE
from imblearn.combine import SMOTEENN , SMOTETomek

smote = SMOTETomek(random_state = 42 , sampling_strategy= 'all')

train_df , label_df  = smote.fit_resample(train_df.drop(columns=['nforest_type']) , train_df['nforest_type'])

[WinError 2] The system cannot find the file specified
  File "c:\anaconda\Lib\site-packages\joblib\externals\loky\backend\context.py", line 199, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\anaconda\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\anaconda\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\anaconda\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


In [14]:
train_df

Unnamed: 0,b11,b12,b2,b3,b4,b5,b6,b7,b8,b8_a,...,MSAVI,GNDVI,RENDVI,NDMI,GRVI,TVI,MCARI,BSI,NBR,MSI
0,1927,1038,278,475,453,987,1773,2184,1900,2343,...,0.761531,0.600000,0.316245,-0.007055,0.023707,0.784110,198.089970,0.044318,0.293397,1.014211
1,1598,697,201,347,228,682,1982,2449,2254,2685,...,0.898826,0.733180,0.535422,0.170301,0.206957,0.903390,129.378299,-0.146928,0.527618,0.708962
2,1560,689,189,408,175,609,2117,2907,3024,3005,...,0.942121,0.762238,0.664740,0.319372,0.399657,0.943637,113.160920,-0.298707,0.628872,0.515873
3,1944,1131,362,538,487,918,1549,1844,1702,2077,...,0.713806,0.519643,0.299237,-0.066374,0.049756,0.744930,188.327887,0.081646,0.201553,1.142186
4,1834,985,284,489,376,795,1802,2255,2548,2406,...,0.852409,0.677972,0.524379,0.162939,0.130636,0.861795,169.223648,-0.123364,0.442400,0.719780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16552,1567,693,210,436,265,706,2227,2717,2951,3254,...,0.909921,0.742263,0.613729,0.306132,0.247558,0.913643,144.378390,-0.266014,0.619547,0.531304
16553,1926,965,1918,1946,1860,2074,2528,2726,2692,2902,...,0.308739,0.160703,0.129585,0.165519,0.022478,0.427266,168.217077,-0.097767,0.472093,0.716410
16554,1841,951,418,631,448,937,2061,2428,3045,2799,...,0.852813,0.656537,0.529338,0.246931,0.170003,0.862172,204.209591,-0.204583,0.524301,0.604084
16555,1981,1078,476,647,492,960,1759,2166,2110,2513,...,0.766787,0.530160,0.374639,0.031568,0.136759,0.788490,207.861720,-0.022529,0.323742,0.938798


In [15]:
label_df

0        MDF
1        DDF
2        MDF
3        MDF
4        DDF
        ... 
16552    DEF
16553    DEF
16554    DEF
16555    DEF
16556    DEF
Name: nforest_type, Length: 16557, dtype: object

In [16]:
train_df = train_df.join(label_df)

In [17]:
train_df

Unnamed: 0,b11,b12,b2,b3,b4,b5,b6,b7,b8,b8_a,...,GNDVI,RENDVI,NDMI,GRVI,TVI,MCARI,BSI,NBR,MSI,nforest_type
0,1927,1038,278,475,453,987,1773,2184,1900,2343,...,0.600000,0.316245,-0.007055,0.023707,0.784110,198.089970,0.044318,0.293397,1.014211,MDF
1,1598,697,201,347,228,682,1982,2449,2254,2685,...,0.733180,0.535422,0.170301,0.206957,0.903390,129.378299,-0.146928,0.527618,0.708962,DDF
2,1560,689,189,408,175,609,2117,2907,3024,3005,...,0.762238,0.664740,0.319372,0.399657,0.943637,113.160920,-0.298707,0.628872,0.515873,MDF
3,1944,1131,362,538,487,918,1549,1844,1702,2077,...,0.519643,0.299237,-0.066374,0.049756,0.744930,188.327887,0.081646,0.201553,1.142186,MDF
4,1834,985,284,489,376,795,1802,2255,2548,2406,...,0.677972,0.524379,0.162939,0.130636,0.861795,169.223648,-0.123364,0.442400,0.719780,DDF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16552,1567,693,210,436,265,706,2227,2717,2951,3254,...,0.742263,0.613729,0.306132,0.247558,0.913643,144.378390,-0.266014,0.619547,0.531304,DEF
16553,1926,965,1918,1946,1860,2074,2528,2726,2692,2902,...,0.160703,0.129585,0.165519,0.022478,0.427266,168.217077,-0.097767,0.472093,0.716410,DEF
16554,1841,951,418,631,448,937,2061,2428,3045,2799,...,0.656537,0.529338,0.246931,0.170003,0.862172,204.209591,-0.204583,0.524301,0.604084,DEF
16555,1981,1078,476,647,492,960,1759,2166,2110,2513,...,0.530160,0.374639,0.031568,0.136759,0.788490,207.861720,-0.022529,0.323742,0.938798,DEF


In [18]:
train_df['nforest_type'].value_counts()

nforest_type
DEF    5780
DDF    5399
MDF    5378
Name: count, dtype: int64

## AutoGluon

In [19]:
from autogluon.tabular import TabularPredictor

In [20]:
label = 'nforest_type'

In [21]:
predictor = TabularPredictor(label = label).fit(train_df)

No path specified. Models will be saved in: "AutogluonModels\ag-20240603_164042"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20240603_164042"
AutoGluon Version:  1.1.0
Python Version:     3.11.7
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          16
Memory Avail:       0.81 GB / 15.28 GB (5.3%)
Disk Space Avail:   614.3

[1000]	valid_set's multi_error: 0.21256
[2000]	valid_set's multi_error: 0.192633
[3000]	valid_set's multi_error: 0.188406
[4000]	valid_set's multi_error: 0.182971
[5000]	valid_set's multi_error: 0.178744
[6000]	valid_set's multi_error: 0.177536
[7000]	valid_set's multi_error: 0.176329
[8000]	valid_set's multi_error: 0.172101
[9000]	valid_set's multi_error: 0.172705
[10000]	valid_set's multi_error: 0.175121


	0.8279	 = Validation score   (accuracy)
	32.39s	 = Training   runtime
	0.86s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's multi_error: 0.198068


	0.8182	 = Validation score   (accuracy)
	4.08s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: RandomForestGini ...
	0.8068	 = Validation score   (accuracy)
	2.23s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: RandomForestEntr ...
	0.7989	 = Validation score   (accuracy)
	3.43s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: CatBoost ...
	0.8207	 = Validation score   (accuracy)
	178.92s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: ExtraTreesGini ...
	0.7977	 = Validation score   (accuracy)
	0.97s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: ExtraTreesEntr ...
	0.8025	 = Validation score   (accuracy)
	0.85s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: XGBoost ...
	0.8194	 = Validation score   (accuracy)
	6.28s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	0.7989	 = Validation score   (accuracy)
	41.38s	 = Training   runtime
	0.02

In [22]:
predictor.fit_pseudolabel(test_df)

Given test_data for pseudo labeling did not contain labels. AutoGluon will assign pseudo labels to data and use it for extra training data...
Beginning iteration 1 of pseudolabeling out of max 3
Pseudolabeling algorithm confidently assigned pseudolabels to 387 rows of data on iteration 1. Adding to train data
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif_PSEUDO_1 ...
	0.7572	 = Validation score   (accuracy)
	0.02s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: KNeighborsDist_PSEUDO_1 ...
	0.788	 = Validation score   (accuracy)
	0.02s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: NeuralNetFastAI_PSEUDO_1 ...
	0.7687	 = Validation score   (accuracy)
	11.34s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: LightGBMXT_PSEUDO_1 ...


[1000]	valid_set's multi_error: 0.21256
[2000]	valid_set's multi_error: 0.193841
[3000]	valid_set's multi_error: 0.184179
[4000]	valid_set's multi_error: 0.175121
[5000]	valid_set's multi_error: 0.174517
[6000]	valid_set's multi_error: 0.170894
[7000]	valid_set's multi_error: 0.169686
[8000]	valid_set's multi_error: 0.171498
[9000]	valid_set's multi_error: 0.170894


	0.8315	 = Validation score   (accuracy)
	17.75s	 = Training   runtime
	0.54s	 = Validation runtime
Fitting model: LightGBM_PSEUDO_1 ...


[1000]	valid_set's multi_error: 0.199275


	0.805	 = Validation score   (accuracy)
	2.57s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: RandomForestGini_PSEUDO_1 ...
	0.8086	 = Validation score   (accuracy)
	2.25s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: RandomForestEntr_PSEUDO_1 ...
	0.805	 = Validation score   (accuracy)
	3.42s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: CatBoost_PSEUDO_1 ...
	0.8037	 = Validation score   (accuracy)
	100.89s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesGini_PSEUDO_1 ...
	0.7983	 = Validation score   (accuracy)
	1.01s	 = Training   runtime
	0.09s	 = Validation runtime
Fitting model: ExtraTreesEntr_PSEUDO_1 ...
	0.8056	 = Validation score   (accuracy)
	0.92s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: XGBoost_PSEUDO_1 ...
	0.8092	 = Validation score   (accuracy)
	3.71s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: NeuralNetTorch_PSEUDO_1 ...
	0.7983	 = Vali

[1000]	valid_set's multi_error: 0.21256
[2000]	valid_set's multi_error: 0.190217
[3000]	valid_set's multi_error: 0.184179
[4000]	valid_set's multi_error: 0.179348
[5000]	valid_set's multi_error: 0.178744


	0.8225	 = Validation score   (accuracy)
	10.01s	 = Training   runtime
	0.33s	 = Validation runtime
Fitting model: LightGBM_PSEUDO_2 ...


[1000]	valid_set's multi_error: 0.199879
[2000]	valid_set's multi_error: 0.182367
[3000]	valid_set's multi_error: 0.179952
[4000]	valid_set's multi_error: 0.178744
[5000]	valid_set's multi_error: 0.178744


	0.8249	 = Validation score   (accuracy)
	11.35s	 = Training   runtime
	0.3s	 = Validation runtime
Fitting model: RandomForestGini_PSEUDO_2 ...
	0.8037	 = Validation score   (accuracy)
	2.36s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: RandomForestEntr_PSEUDO_2 ...
	0.8104	 = Validation score   (accuracy)
	3.39s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: CatBoost_PSEUDO_2 ...
	0.8134	 = Validation score   (accuracy)
	180.39s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesGini_PSEUDO_2 ...
	0.805	 = Validation score   (accuracy)
	0.9s	 = Training   runtime
	0.07s	 = Validation runtime
Fitting model: ExtraTreesEntr_PSEUDO_2 ...
	0.8001	 = Validation score   (accuracy)
	0.93s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: XGBoost_PSEUDO_2 ...
	0.8213	 = Validation score   (accuracy)
	16.4s	 = Training   runtime
	0.15s	 = Validation runtime
Fitting model: NeuralNetTorch_PSEUDO_2 ...
	0.7923	 = Vali

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x231e150d850>

In [23]:
prediction = predictor.predict(test_df)

In [24]:
prediction

id
13467    DEF
12719    MDF
1054     MDF
13747    DDF
9453     DEF
        ... 
115      MDF
10654    MDF
5718     DDF
13054    MDF
6539     DEF
Name: nforest_type, Length: 4000, dtype: object

In [25]:
prediction.value_counts()

nforest_type
DDF    1523
MDF    1469
DEF    1008
Name: count, dtype: int64

In [26]:
submission_path = './submissions'
prediction.to_csv(f'{submission_path}/submission_over_features_drop_features_SMOTETomek_all_pseudolabeling.csv')