# SuperAI Season 4 - Level 2 Hackathon - Forest Type Double Classifier

## Fix Dataset

In [37]:
import numpy as np
import pandas as pd

In [38]:
def add_features(row) :

    row['NDVI'] = (row['b8'] - row['b4']) / (row['b8'] + row['b4'])
    row['EVI'] = 2.5 * ((row['b8'] - row['b4']) / (row['b8'] + 6 * row['b4'] - 7.5 * row['b2'] + 1.01))
    row['NDWI '] = (row['b3'] - row['b8']) / (row['b3'] + row['b8'])
    row['SAVI '] = (row['b8'] - row['b4']) * (1 + 0.5) / (row['b8'] + row['b4'] + 0.5)
    row['MSAVI'] = (2 * row['b8'] + 1 - ( (2 * row['b8'] + 1) ** 2 - 8 * (row['b8'] - row['b4'])) ** (1 / 2)) / 2
    row['GNDVI '] = (row['b8'] - row['b3']) / (row['b8'] + row['b3'])
    row['RENDVI '] = (row['b8'] - row['b5']) / (row['b8'] + row['b5'])
    row['NDMI '] = (row['b8'] - row['b11']) / (row['b8'] + row['b11'])
    row['GRVI'] = (row['b3'] - row['b4']) / (row['b3'] + row['b4'])
    row['TVI'] = ( (row['b8'] - row['b4']) / (row['b8'] + row['b4'] + 0.5) ) ** (1 / 2)
    row['MCARI'] = ((row['b5'] - row['b4']) - 0.2 * (row['b5'] - row['b3'])) / (row['b5'] / row['b4'])
    row['BSI'] =  ((row['b11'] + row['b4']) - (row['b8'] + row['b2'])) / ((row['b11'] + row['b4']) + (row['b8'] + row['b2']))
    row['NBR'] = (row['b8'] - row['b12']) / (row['b8'] + row['b12'])
    row['MSI'] = row['b11'] / row['b8']
    row['RVI'] = row['b8'] / row['b4']
    row['GCI'] = (row['b8'] / row['b3']) - 1

    
    return row

In [39]:
from imblearn.over_sampling import SMOTE 

smote = SMOTE(random_state = 42 , sampling_strategy= 'all')

DEF_NONDEF_df = pd.read_csv('./datasets/DEF_NONDEF.csv' , index_col='id' )
DEF_NONDEF_df , label_df  = smote.fit_resample(DEF_NONDEF_df.drop(columns=['nforest_type']) , DEF_NONDEF_df['nforest_type'])
DEF_NONDEF_df = DEF_NONDEF_df.join(label_df)

DEF_NONDEF_df.sort_index().to_csv('./datasets/fix_DEF_NONDEF.csv' , index_label='id')

MDF_DDF_df = pd.read_csv('./datasets/MDF_DDF.csv' , index_col='id')
MDF_DDF_df , label_df  = smote.fit_resample(MDF_DDF_df.drop(columns=['nforest_type']) , MDF_DDF_df['nforest_type'])
MDF_DDF_df = MDF_DDF_df.join(label_df)

MDF_DDF_df.sort_index().to_csv('./datasets/fix_MDF_DDF.csv' , index_label='id')

In [40]:
DEF_NONDEF_df

Unnamed: 0,b1,b11,b12,b2,b3,b4,b5,b6,b7,b8,b8_a,b9,nforest_type
0,289,1488,771,295,418,300,666,1579,1896,1937,2143,2368,NON_DEF
1,738,2281,1240,764,912,841,1254,2048,2376,2299,2792,2686,NON_DEF
2,884,2256,1491,807,856,824,1127,1590,1834,1692,2021,2074,NON_DEF
3,380,2232,1276,465,753,666,1286,2387,2723,2651,3045,3084,NON_DEF
4,278,1494,694,261,401,303,671,1852,2162,2284,2463,2305,NON_DEF
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20931,460,1879,886,427,645,455,976,2413,2958,3095,3367,2823,DEF
20932,200,1475,637,193,402,217,627,2191,2841,2945,2839,2884,DEF
20933,130,1421,546,164,384,218,561,2073,2556,3380,3017,2875,DEF
20934,348,1783,843,386,610,351,984,2597,3170,3251,3422,3264,DEF


In [41]:
MDF_DDF_df

Unnamed: 0,b1,b11,b12,b2,b3,b4,b5,b6,b7,b8,b8_a,b9,nforest_type
0,293,1927,1038,278,475,453,987,1773,2184,1900,2343,3039,MDF
1,197,1598,697,201,347,228,682,1982,2449,2254,2685,2690,DDF
2,929,1975,1031,982,1020,856,1220,2051,2421,2392,2671,2683,MDF
3,132,1560,689,189,408,175,609,2117,2907,3024,3005,2955,MDF
4,241,1944,1131,362,538,487,918,1549,1844,1702,2077,2043,MDF
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11725,170,1840,928,198,456,311,773,2016,2439,2511,2739,2905,DDF
11726,126,1963,881,181,400,187,703,2577,3414,3602,3768,3373,DDF
11727,298,1302,1018,324,515,557,773,879,975,1171,1116,1151,DDF
11728,589,1816,947,594,714,553,982,1820,2135,2190,2444,2519,DDF


## DEF_NONDEF Classifier

In [42]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader , random_split
import lightning as L
from torch.utils.data import Dataset
import pandas as pd
import torchmetrics


In [43]:
class DEF_NONDEF_Dataset (Dataset) :

    def __init__ (self , annotation_path , transform = None , target_transform = None) :

        self.annotation_path = annotation_path
        self.annotation_file = pd.read_csv(annotation_path , index_col = 'id')
        self.annotation_arrays = self.annotation_file.drop(columns=['nforest_type']).to_numpy().astype('float32') 
        self.annotation_labels = (self.annotation_file['nforest_type']  == 'DEF').to_numpy().astype('float32')
        
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self) :

        return len(self.annotation_file)

    def __getitem__ (self , idx) :

        data  = self.annotation_arrays[idx]
        label = self.annotation_labels[idx]
        
        if self.transform :

            data = self.transform(data)

        if self.target_transform :

            label = self.target_transform(label)

        return data, label

In [44]:
DEF_NONDEF_dataset = DEF_NONDEF_Dataset('./datasets/fix_DEF_NONDEF.csv')

In [45]:
# Define the size of each split
train_size = int(0.8 * len(DEF_NONDEF_dataset))
test_size = len(DEF_NONDEF_dataset) - train_size

# Split the dataset
train_dataset , test_dataset = random_split(DEF_NONDEF_dataset, [train_size , test_size])

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

In [46]:
for x , y in train_loader :
    
    print(x.shape , y.shape)

torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])
torch.Size([128, 12]) torch.Size([128])


In [47]:
class Classifier(nn.Module):
    
    def __init__(self ,num_features):
        
        super().__init__()
        
        self.fc1 = nn.Linear(num_features, 256)  # Increased neurons
        self.batch_norm1 = nn.BatchNorm1d(256) 
        self.dropout1 = nn.Dropout(p = 0.25)     # Increased dropout
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size = 5 , stride=1, padding=2)
        
        self.fc2 = nn.Linear(256, 256)       # Increased neurons
        self.batch_norm2 = nn.BatchNorm1d(256)
        self.dropout2 = nn.Dropout(p= 0.25)     # Increased dropout

        self.fc3 = nn.Linear(256, 256)       # Added another layer
        self.batch_norm3 = nn.BatchNorm1d(256)
        self.dropout3 = nn.Dropout(p= 0.25)
        
        self.fc4 = nn.Linear(256, 256)       # Added another layer
        self.batch_norm4 = nn.BatchNorm1d(256)
        self.dropout4 = nn.Dropout(p= 0.25)
        
        self.fc5 = nn.Linear(256, 256)       # Added another layer
        self.batch_norm5 = nn.BatchNorm1d(256)
        self.dropout5 = nn.Dropout(p= 0.25)
        
        self.fc6 = nn.Linear(256, 256)       # Added another layer
        self.batch_norm6 = nn.BatchNorm1d(256)
        self.dropout6 = nn.Dropout(p = 0.25)
        
        self.output = nn.Linear(256, 1)

    def forward(self, x):
        
        
        x = F.relu(self.fc1(x))  # Changed to relu
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = self.conv1(x.unsqueeze(1))
        x = x.squeeze()

        r = self.conv1(x.unsqueeze(1))
        r = r.squeeze()
        
        x = F.relu(self.fc2(x))  # Changed to relu
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = self.conv1(x.unsqueeze(1))
        x = x.squeeze()
        
        x = x + r

        r = self.conv1(x.unsqueeze(1))
        r = r.squeeze()
        
        x = F.relu(self.fc3(x))  # Changed to relu
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.conv1(x.unsqueeze(1))
        x = x.squeeze()
        
        x = x + r
        
        x = F.relu(self.fc4(x))  # Changed to relu
        x = self.batch_norm4(x)
        x = self.dropout4(x)
        x = self.conv1(x.unsqueeze(1))
        x = x.squeeze()
        
        r = self.conv1(x.unsqueeze(1))
        r = r.squeeze()
        
        x = F.relu(self.fc5(x))  # Changed to relu
        x = self.batch_norm5(x)
        x = self.dropout5(x)
        x = self.conv1(x.unsqueeze(1))
        x = x.squeeze()
        
        x = x + r
        
        r = self.conv1(x.unsqueeze(1))
        r = r.squeeze()
        
        x = F.relu(self.fc6(x))  # Changed to relu
        x = self.batch_norm6(x)
        x = self.dropout6(x)
        x = self.conv1(x.unsqueeze(1))
        x = x.squeeze()
        
        x = x + r
        
        x = self.output(x)
        x = torch.sigmoid(x)
        
        return x

In [48]:
class DEF_NONDEF_Classifier(L.LightningModule):
    
    def __init__(self , num_features):
        
        super().__init__()
        
        self.Classifier = Classifier(num_features)
        
        self.train_accuracy = torchmetrics.Accuracy(task='binary')
        self.val_accuracy = torchmetrics.Accuracy(task='binary')

    def forward (self , x) :
        
        return self.Classifier(x)
    
    def training_step(self, batch, batch_idx):
        
        x, y = batch
        y_hat = self.forward(x)
        
        loss = F.binary_cross_entropy_with_logits(y_hat.squeeze() , y)
        acc = self.train_accuracy(y_hat.squeeze(), y)
        
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log('train_acc', acc, on_step=True, on_epoch=True, prog_bar=True)
        
        return loss
    
    def test_step(self, batch, batch_idx):
        
        x, y = batch
        y_hat = self.forward(x)

        
        test_loss = F.binary_cross_entropy_with_logits(y_hat.squeeze() , y)
        acc = self.train_accuracy(y_hat.squeeze(), y)
        
        self.log('test_loss', test_loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log('test_acc', acc, on_step=True, on_epoch=True, prog_bar=True)

    def configure_optimizers(self):
        
        optimizer = torch.optim.AdamW(self.parameters(), lr=1e-4)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
        
        return [optimizer], [scheduler]

In [49]:
DEF_NONDEF_Classifier_model = DEF_NONDEF_Classifier(12)

In [50]:
print(DEF_NONDEF_Classifier_model)

DEF_NONDEF_Classifier(
  (Classifier): Classifier(
    (fc1): Linear(in_features=12, out_features=256, bias=True)
    (batch_norm1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (dropout1): Dropout(p=0.25, inplace=False)
    (conv1): Conv1d(1, 1, kernel_size=(5,), stride=(1,), padding=(2,))
    (fc2): Linear(in_features=256, out_features=256, bias=True)
    (batch_norm2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (dropout2): Dropout(p=0.25, inplace=False)
    (fc3): Linear(in_features=256, out_features=256, bias=True)
    (batch_norm3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (dropout3): Dropout(p=0.25, inplace=False)
    (fc4): Linear(in_features=256, out_features=256, bias=True)
    (batch_norm4): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (dropout4): Dropout(p=0.25, inplace=False)
    (fc5): Linear(in_features=256, 

In [51]:
trainer = L.Trainer(max_epochs = 100)
trainer.fit(model = DEF_NONDEF_Classifier_model, train_dataloaders = train_loader )

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name           | Type           | Params
--------------------------------------------------
0 | Classifier     | Classifier     | 335 K 
1 | train_accuracy | BinaryAccuracy | 0     
2 | val_accuracy   | BinaryAccuracy | 0     
--------------------------------------------------
335 K     Trainable params
0         Non-trainable params
335 K     Total params
1.342     Total estimated model params size (MB)
c:\users\teehe\appdata\roaming\python\python311\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=100` reached.


In [52]:
trainer.test(DEF_NONDEF_Classifier_model , dataloaders = test_loader)

c:\users\teehe\appdata\roaming\python\python311\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss_epoch': 0.5730642676353455, 'test_acc_epoch': 0.8517192006111145}]

## MDF DDF Classifier

In [53]:
class MDF_DDF_Dataset (Dataset) :

    def __init__ (self , annotation_path , transform = None , target_transform = None) :

        self.annotation_path = annotation_path
        self.annotation_file = pd.read_csv(annotation_path , index_col = 'id')
        self.annotation_arrays = self.annotation_file.drop(columns=['nforest_type']).to_numpy().astype('float32')
        self.annotation_labels = (self.annotation_file['nforest_type']  == 'MDF').to_numpy().astype('float32')
        
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self) :

        return len(self.annotation_file)

    def __getitem__ (self , idx) :

        data  = self.annotation_arrays[idx]
        label = self.annotation_labels[idx]
        
        if self.transform :

            data = self.transform(data)

        if self.target_transform :

            label = self.target_transform(label)

        return data , label

In [54]:
MDF_DDF_dataset = MDF_DDF_Dataset('./datasets/fix_MDF_DDF.csv')


In [55]:
# Define the size of each split
train_size = int(0.8 * len(MDF_DDF_dataset))
test_size = len(MDF_DDF_dataset) - train_size

# Split the dataset
train_dataset , test_dataset = random_split(MDF_DDF_dataset, [train_size , test_size])

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

In [56]:
from typing import Any


class MDF_DDF_Classifier(L.LightningModule):
    
    def __init__(self , num_features):
        
        super().__init__()
        
        self.Classifier = Classifier(num_features)
        
        self.train_accuracy = torchmetrics.Accuracy(task='binary')
        self.val_accuracy = torchmetrics.Accuracy(task='binary')

    def forward(self , x) :
    
        return self.Classifier(x)
    
    def training_step(self, batch, batch_idx):
        
        x, y = batch
        y_hat = self.forward(x)

        
        loss = F.binary_cross_entropy_with_logits(y_hat.squeeze() , y)
        acc = self.train_accuracy(y_hat.squeeze(), y)
        
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log('train_acc', acc, on_step=True, on_epoch=True, prog_bar=True)
        
        return loss
    
    def test_step(self, batch, batch_idx):
        
        x, y = batch
        y_hat = self.forward(x)
        
        test_loss = F.binary_cross_entropy_with_logits(y_hat.squeeze() , y)
        acc = self.train_accuracy(y_hat.squeeze(), y)
        
        self.log('test_loss', test_loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log('test_acc', acc, on_step=True, on_epoch=True, prog_bar=True)

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr= 1e-4)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
        return [optimizer], [scheduler]

In [57]:
MDF_DDF_Classifier_model = MDF_DDF_Classifier(12)

In [58]:
trainer = L.Trainer(max_epochs=100)
trainer.fit(model = MDF_DDF_Classifier_model, train_dataloaders = train_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name           | Type           | Params
--------------------------------------------------
0 | Classifier     | Classifier     | 335 K 
1 | train_accuracy | BinaryAccuracy | 0     
2 | val_accuracy   | BinaryAccuracy | 0     
--------------------------------------------------
335 K     Trainable params
0         Non-trainable params
335 K     Total params
1.342     Total estimated model params size (MB)
c:\users\teehe\appdata\roaming\python\python311\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=100` reached.


In [59]:
trainer.test(MDF_DDF_Classifier_model , dataloaders = test_loader)

c:\users\teehe\appdata\roaming\python\python311\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss_epoch': 0.6371230483055115, 'test_acc_epoch': 0.6794543862342834}]