# SuperAI Season 4 - Level 2 Hackathon - Forest Type Classification

## Explore Data

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Training data keeps 'id' as an ordinary column; the test data uses 'id' as its index.
train_df = pd.read_csv(filepath_or_buffer='./datasets/train.csv')
test_df = pd.read_csv(
    filepath_or_buffer='./datasets/test.csv',
    index_col='id',
)

In [3]:
# Preview the raw training frame: an 'id' column, the band columns and the 'nforest_type' label.
train_df

Unnamed: 0,id,b1,b11,b12,b2,b3,b4,b5,b6,b7,b8,b8_a,b9,nforest_type
0,2002,293,1927,1038,278,475,453,987,1773,2184,1900,2343,3039,MDF
1,3212,197,1598,697,201,347,228,682,1982,2449,2254,2685,2690,DDF
2,13312,929,1975,1031,982,1020,856,1220,2051,2421,2392,2671,2683,MDF
3,17020,132,1560,689,189,408,175,609,2117,2907,3024,3005,2955,MDF
4,5967,241,1944,1131,362,538,487,918,1549,1844,1702,2077,2043,MDF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13048,9185,374,1940,1054,382,565,498,977,1678,1929,2109,2291,2100,DDF
13049,13977,1983,3602,2720,1622,1782,1766,2314,3488,3900,3924,4097,6053,DDF
13050,755,940,2007,1148,975,1080,968,1252,1780,1983,1942,2247,2170,DDF
13051,1616,1174,2312,1190,1112,1126,889,1310,2511,3085,3050,3396,3380,MDF


In [4]:
# Preview the test frame: indexed by 'id', band columns only (no label column).
test_df

Unnamed: 0_level_0,b1,b11,b12,b2,b3,b4,b5,b6,b7,b8,b8_a,b9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
13467,69,1425,693,312,524,376,847,1821,2356,2378,2611,2595
12719,242,1514,691,343,522,324,718,1730,2178,2472,2359,2582
1054,218,2354,1118,292,596,410,965,2586,3226,3371,3645,3149
13747,350,2013,1134,306,572,475,982,1754,1935,2275,2290,2345
9453,185,1450,712,293,440,384,673,1487,1965,2213,2200,2193
...,...,...,...,...,...,...,...,...,...,...,...,...
115,447,1686,811,425,661,441,958,2432,2891,2966,3126,3312
10654,252,2694,1503,470,778,753,1294,2334,2656,2679,3212,2856
5718,233,1486,618,249,409,260,699,2188,2831,3030,3086,3087
13054,221,1840,774,245,441,231,703,2491,3453,3284,3762,3161


## Oversampling and Feature Engineering

In [5]:
def oversampling(df):
    """Return ``df`` with every row labelled 'DEF' appended one more time.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an 'nforest_type' column.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original rows followed by a second copy of the
        'DEF' rows. Index labels of the duplicated rows are kept, so the
        resulting index is not unique.
    """
    is_def = df['nforest_type'] == 'DEF'
    def_rows = df.loc[is_def]

    return pd.concat([df, def_rows])

In [6]:
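# Duplication-based oversampling is left disabled here; the classes are balanced
# with SMOTE in the "Resampling" section below.
# train_df = oversampling(train_df)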

In [7]:
# Overwrite the original sample ids with a sequential 0..n-1 id.
train_df['id'] = range(len(train_df))

In [8]:
# Use the sequential 'id' as the index so train_df has the same layout as test_df.
train_df = train_df.set_index('id')

In [9]:
# Confirm that 'id' is now the index and runs sequentially from 0.
train_df

Unnamed: 0_level_0,b1,b11,b12,b2,b3,b4,b5,b6,b7,b8,b8_a,b9,nforest_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,293,1927,1038,278,475,453,987,1773,2184,1900,2343,3039,MDF
1,197,1598,697,201,347,228,682,1982,2449,2254,2685,2690,DDF
2,929,1975,1031,982,1020,856,1220,2051,2421,2392,2671,2683,MDF
3,132,1560,689,189,408,175,609,2117,2907,3024,3005,2955,MDF
4,241,1944,1131,362,538,487,918,1549,1844,1702,2077,2043,MDF
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13048,374,1940,1054,382,565,498,977,1678,1929,2109,2291,2100,DDF
13049,1983,3602,2720,1622,1782,1766,2314,3488,3900,3924,4097,6053,DDF
13050,940,2007,1148,975,1080,968,1252,1780,1983,1942,2247,2170,DDF
13051,1174,2312,1190,1112,1126,889,1310,2511,3085,3050,3396,3380,MDF


In [10]:
# Class balance of the label before any resampling.
train_df['nforest_type'].value_counts()

nforest_type
MDF    5865
DDF    4603
DEF    2585
Name: count, dtype: int64

In [11]:
def add_features(row):
    """Return a copy of ``row`` extended with 14 spectral-index features.

    Only label lookup and element-wise arithmetic are used, so ``row`` may be
    a single record (``pandas.Series``, e.g. via ``DataFrame.apply(axis=1)``)
    or a whole ``pandas.DataFrame``, in which case every index is computed
    column-wise in one pass.

    Parameters
    ----------
    row : pandas.Series or pandas.DataFrame
        Must provide the band values 'b2', 'b3', 'b4', 'b5', 'b8', 'b11'
        and 'b12'. Presumably Sentinel-2 style bands (b2 blue, b3 green,
        b4 red, b5 red edge, b8 NIR, b11/b12 SWIR) -- TODO confirm.

    Returns
    -------
    Same type as ``row``
        A copy with the added entries NDVI, EVI, NDWI, SAVI, MSAVI, GNDVI,
        RENDVI, NDMI, GRVI, TVI, MCARI, BSI, NBR and MSI. The input object
        is left unmodified.

    Notes
    -----
    Feature names carry no surrounding whitespace (earlier revisions emitted
    e.g. ``'NDWI '`` with a trailing space, which breaks lookups by name).
    """
    # Work on a copy so the caller's object is never mutated.
    out = row.copy()

    b2, b3, b4, b5 = out['b2'], out['b3'], out['b4'], out['b5']
    b8, b11, b12 = out['b8'], out['b11'], out['b12']

    out['NDVI'] = (b8 - b4) / (b8 + b4)
    out['EVI'] = 2.5 * ((b8 - b4) / (b8 + 6 * b4 - 7.5 * b2 + 1.01))
    out['NDWI'] = (b3 - b8) / (b3 + b8)
    out['SAVI'] = (b8 - b4) * (1 + 0.5) / (b8 + b4 + 0.5)
    out['MSAVI'] = (2 * b8 + 1 - ((2 * b8 + 1) ** 2 - 8 * (b8 - b4)) ** (1 / 2)) / 2
    out['GNDVI'] = (b8 - b3) / (b8 + b3)
    out['RENDVI'] = (b8 - b5) / (b8 + b5)
    out['NDMI'] = (b8 - b11) / (b8 + b11)
    out['GRVI'] = (b3 - b4) / (b3 + b4)
    # NOTE(review): TVI is commonly published as sqrt(NDVI + 0.5); here the 0.5
    # sits in the denominator, and b8 < b4 gives a negative base for the root.
    # Formula kept unchanged to preserve the existing feature values -- confirm intent.
    out['TVI'] = ((b8 - b4) / (b8 + b4 + 0.5)) ** (1 / 2)
    # NOTE(review): published MCARI multiplies by (b5 / b4) rather than dividing.
    # Kept unchanged to preserve the existing feature values -- confirm intent.
    out['MCARI'] = ((b5 - b4) - 0.2 * (b5 - b3)) / (b5 / b4)
    out['BSI'] = ((b11 + b4) - (b8 + b2)) / ((b11 + b4) + (b8 + b2))
    out['NBR'] = (b8 - b12) / (b8 + b12)
    out['MSI'] = b11 / b8

    return out

In [12]:
# add_features uses only column lookups and element-wise arithmetic, so it is
# called once on the whole frame instead of once per row through
# DataFrame.apply(axis=1). Same formulas, without the per-row Python overhead.
train_df = add_features(train_df)
test_df = add_features(test_df)

## Resampling

In [13]:
from imblearn.over_sampling import SMOTE

# Fixed seed so the synthetic samples are reproducible.
smote = SMOTE(random_state=42)

# Resample features and label separately; from here on train_df holds the
# features only and label_df the matching resampled labels.
train_df, label_df = smote.fit_resample(
    train_df.drop(columns=['nforest_type']),
    train_df['nforest_type'],
)

[WinError 2] The system cannot find the file specified
  File "c:\anaconda\Lib\site-packages\joblib\externals\loky\backend\context.py", line 199, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\anaconda\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\anaconda\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\anaconda\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


In [14]:
# Resampled feature frame; the label column was dropped before resampling.
train_df

Unnamed: 0,b1,b11,b12,b2,b3,b4,b5,b6,b7,b8,...,MSAVI,GNDVI,RENDVI,NDMI,GRVI,TVI,MCARI,BSI,NBR,MSI
0,293,1927,1038,278,475,453,987,1773,2184,1900,...,0.761531,0.600000,0.316245,-0.007055,0.023707,0.784110,198.089970,0.044318,0.293397,1.014211
1,197,1598,697,201,347,228,682,1982,2449,2254,...,0.898826,0.733180,0.535422,0.170301,0.206957,0.903390,129.378299,-0.146928,0.527618,0.708962
2,929,1975,1031,982,1020,856,1220,2051,2421,2392,...,0.642092,0.402110,0.324474,0.095489,0.087420,0.687629,227.331148,-0.087510,0.397604,0.825669
3,132,1560,689,189,408,175,609,2117,2907,3024,...,0.942121,0.762238,0.664740,0.319372,0.399657,0.943637,113.160920,-0.298707,0.628872,0.515873
4,241,1944,1131,362,538,487,918,1549,1844,1702,...,0.713806,0.519643,0.299237,-0.066374,0.049756,0.744930,188.327887,0.081646,0.201553,1.142186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17590,102,1567,693,210,436,265,706,2227,2717,2951,...,0.909921,0.742263,0.613729,0.306132,0.247558,0.913643,144.378390,-0.266014,0.619547,0.531304
17591,1700,1926,965,1918,1946,1860,2074,2528,2726,2692,...,0.308739,0.160703,0.129585,0.165519,0.022478,0.427266,168.217077,-0.097767,0.472093,0.716410
17592,252,1940,990,375,626,471,960,2049,2372,2987,...,0.841729,0.652994,0.512947,0.211767,0.140737,0.852448,207.153619,-0.163285,0.501369,0.651555
17593,419,2185,1225,478,654,647,1049,1862,2167,2183,...,0.703195,0.538463,0.350523,-0.000613,0.005448,0.736352,199.339162,0.031231,0.280808,1.001235


In [15]:
# Resampled labels returned alongside the features by SMOTE.
label_df

0        MDF
1        DDF
2        MDF
3        MDF
4        MDF
        ... 
17590    DEF
17591    DEF
17592    DEF
17593    DEF
17594    DEF
Name: nforest_type, Length: 17595, dtype: object

In [16]:
# Re-attach the label column; join aligns on the index, which features and labels share.
# NOTE(review): re-running this cell on its own fails because 'nforest_type' then already exists.
train_df = train_df.join(label_df)

In [17]:
# Check that 'nforest_type' is attached again as the last column.
train_df

Unnamed: 0,b1,b11,b12,b2,b3,b4,b5,b6,b7,b8,...,GNDVI,RENDVI,NDMI,GRVI,TVI,MCARI,BSI,NBR,MSI,nforest_type
0,293,1927,1038,278,475,453,987,1773,2184,1900,...,0.600000,0.316245,-0.007055,0.023707,0.784110,198.089970,0.044318,0.293397,1.014211,MDF
1,197,1598,697,201,347,228,682,1982,2449,2254,...,0.733180,0.535422,0.170301,0.206957,0.903390,129.378299,-0.146928,0.527618,0.708962,DDF
2,929,1975,1031,982,1020,856,1220,2051,2421,2392,...,0.402110,0.324474,0.095489,0.087420,0.687629,227.331148,-0.087510,0.397604,0.825669,MDF
3,132,1560,689,189,408,175,609,2117,2907,3024,...,0.762238,0.664740,0.319372,0.399657,0.943637,113.160920,-0.298707,0.628872,0.515873,MDF
4,241,1944,1131,362,538,487,918,1549,1844,1702,...,0.519643,0.299237,-0.066374,0.049756,0.744930,188.327887,0.081646,0.201553,1.142186,MDF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17590,102,1567,693,210,436,265,706,2227,2717,2951,...,0.742263,0.613729,0.306132,0.247558,0.913643,144.378390,-0.266014,0.619547,0.531304,DEF
17591,1700,1926,965,1918,1946,1860,2074,2528,2726,2692,...,0.160703,0.129585,0.165519,0.022478,0.427266,168.217077,-0.097767,0.472093,0.716410,DEF
17592,252,1940,990,375,626,471,960,2049,2372,2987,...,0.652994,0.512947,0.211767,0.140737,0.852448,207.153619,-0.163285,0.501369,0.651555,DEF
17593,419,2185,1225,478,654,647,1049,1862,2167,2183,...,0.538463,0.350523,-0.000613,0.005448,0.736352,199.339162,0.031231,0.280808,1.001235,DEF


In [18]:
# Class balance after SMOTE; every class should now have the same count.
train_df['nforest_type'].value_counts()

nforest_type
MDF    5865
DDF    5865
DEF    5865
Name: count, dtype: int64

## AutoGluon

In [19]:
from autogluon.tabular import TabularPredictor

In [20]:
# Name of the target column passed to AutoGluon.
label = 'nforest_type'

In [21]:
# Default settings: no preset, model path or eval metric is specified.
predictor = TabularPredictor(label=label)
predictor = predictor.fit(train_data=train_df)

No path specified. Models will be saved in: "AutogluonModels\ag-20240603_115558"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20240603_115558"
AutoGluon Version:  1.1.0
Python Version:     3.11.7
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          16
Memory Avail:       0.51 GB / 15.28 GB (3.3%)
Disk Space Avail:   626.6

[1000]	valid_set's multi_error: 0.211932


	0.7892	 = Validation score   (accuracy)
	8.8s	 = Training   runtime
	0.09s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'NeuralNetTorch': 0.261, 'RandomForestGini': 0.174, 'ExtraTreesEntr': 0.174, 'KNeighborsDist': 0.13, 'NeuralNetFastAI': 0.13, 'ExtraTreesGini': 0.087, 'XGBoost': 0.043}
	0.808	 = Validation score   (accuracy)
	0.09s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 296.37s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels\ag-20240603_115558")


In [26]:
# Pseudo-labelling: confidently predicted test rows are added to the training data
# and the models are refit (see the log below). test_df carries no labels.
predictor.fit_pseudolabel(test_df)

Given test_data for pseudo labeling did not contain labels. AutoGluon will assign pseudo labels to data and use it for extra training data...
Beginning iteration 1 of pseudolabeling out of max 3
Pseudolabeling algorithm confidently assigned pseudolabels to 24 rows of data on iteration 1. Adding to train data
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif_PSEUDO_1 ...
	0.7159	 = Validation score   (accuracy)
	0.03s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: KNeighborsDist_PSEUDO_1 ...
	0.7562	 = Validation score   (accuracy)
	0.02s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: NeuralNetFastAI_PSEUDO_1 ...
	0.7716	 = Validation score   (accuracy)
	11.76s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: LightGBMXT_PSEUDO_1 ...


[1000]	valid_set's multi_error: 0.228977
[2000]	valid_set's multi_error: 0.222727
[3000]	valid_set's multi_error: 0.216477
[4000]	valid_set's multi_error: 0.213636


	0.7909	 = Validation score   (accuracy)
	10.72s	 = Training   runtime
	0.33s	 = Validation runtime
Fitting model: LightGBM_PSEUDO_1 ...


[1000]	valid_set's multi_error: 0.223295
[2000]	valid_set's multi_error: 0.209659
[3000]	valid_set's multi_error: 0.202273


	0.8011	 = Validation score   (accuracy)
	9.27s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: RandomForestGini_PSEUDO_1 ...
	0.7608	 = Validation score   (accuracy)
	2.65s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: RandomForestEntr_PSEUDO_1 ...
	0.7625	 = Validation score   (accuracy)
	4.15s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: CatBoost_PSEUDO_1 ...
	0.7864	 = Validation score   (accuracy)
	197.71s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesGini_PSEUDO_1 ...
	0.7642	 = Validation score   (accuracy)
	0.98s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: ExtraTreesEntr_PSEUDO_1 ...
	0.7642	 = Validation score   (accuracy)
	0.93s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: XGBoost_PSEUDO_1 ...
	0.7903	 = Validation score   (accuracy)
	8.61s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: NeuralNetTorch_PSEUDO_1 ...
	0.7739	 = Val

[1000]	valid_set's multi_error: 0.215909


	0.7864	 = Validation score   (accuracy)
	9.66s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: WeightedEnsemble_L2_PSEUDO_1 ...
	Ensemble Weights: {'LightGBM_PSEUDO_1': 0.444, 'NeuralNetTorch_PSEUDO_1': 0.333, 'KNeighborsUnif_PSEUDO_1': 0.111, 'ExtraTreesEntr_PSEUDO_1': 0.111}
	0.8108	 = Validation score   (accuracy)
	0.1s	 = Training   runtime
	0.0s	 = Validation runtime
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels\ag-20240603_115558")
Pseudolabeling algorithm changed validation score from: 0.8079545454545455, to: 0.8107954545454545 using evaluation metric: accuracy
Beginning iteration 2 of pseudolabeling out of max 3
Pseudolabeling algorithm confidently assigned pseudolabels to 78 rows of data on iteration 2. Adding to train data
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif_PSEUDO_2 ...
	0.7159	 = Validation score   (accuracy)
	0.02s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: KNeighborsDist

[1000]	valid_set's multi_error: 0.235227
[2000]	valid_set's multi_error: 0.218182
[3000]	valid_set's multi_error: 0.2125
[4000]	valid_set's multi_error: 0.2125
[5000]	valid_set's multi_error: 0.208523
[6000]	valid_set's multi_error: 0.207386
[7000]	valid_set's multi_error: 0.209091


	0.7943	 = Validation score   (accuracy)
	16.21s	 = Training   runtime
	0.52s	 = Validation runtime
Fitting model: LightGBM_PSEUDO_2 ...


[1000]	valid_set's multi_error: 0.224432


	0.7767	 = Validation score   (accuracy)
	3.16s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: RandomForestGini_PSEUDO_2 ...
	0.7625	 = Validation score   (accuracy)
	2.52s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: RandomForestEntr_PSEUDO_2 ...
	0.7619	 = Validation score   (accuracy)
	3.78s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: CatBoost_PSEUDO_2 ...
	0.7869	 = Validation score   (accuracy)
	204.36s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesGini_PSEUDO_2 ...
	0.7631	 = Validation score   (accuracy)
	0.97s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: ExtraTreesEntr_PSEUDO_2 ...
	0.7602	 = Validation score   (accuracy)
	0.7s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: XGBoost_PSEUDO_2 ...
	0.7994	 = Validation score   (accuracy)
	15.45s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: NeuralNetTorch_PSEUDO_2 ...
	0.7818	 = Val

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x2496c7b66d0>

In [27]:
# Predict a forest type for every test row with the pseudo-label-refined predictor.
prediction = predictor.predict(test_df)

In [28]:
# Predicted class per test 'id'.
prediction

id
13467    DEF
12719    DDF
1054     MDF
13747    DDF
9453     DDF
        ... 
115      DDF
10654    MDF
5718     DDF
13054    MDF
6539     DEF
Name: nforest_type, Length: 4000, dtype: object

In [29]:
# Distribution of predicted classes on the test set.
prediction.value_counts()

nforest_type
MDF    1553
DDF    1505
DEF     942
Name: count, dtype: int64

In [30]:
submission_path = './submissions'
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
Path(submission_path).mkdir(parents=True, exist_ok=True)
# Writes the 'id' index and the predicted 'nforest_type' as the two CSV columns.
prediction.to_csv(f'{submission_path}/submission_over_features_smote_psuedolabeling.csv')