In [1]:
%pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.3.0 (from optuna)
  Downloading SQLAlchemy-2.0.30-cp312-cp312-win_amd64.whl.metadata (9.8 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet!=0.4.17 (from sqlalchemy>=1.3.0->optuna)
  Downloading greenlet-3.0.3-cp312-cp312-win_amd64.whl.metadata (3.9 kB)
Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
   ---------------------------------------- 0.0/380.1 kB ? eta -:--:--
   ------- -------------------------------- 71.7/380.1 kB 2.0 MB/s eta 0:00:01
   --------------------- ------------------ 204.8/380.1 kB 2.5 MB/s eta 0:00:01
   ----------------------- ---------------- 225.3/380.1 kB 2.3 MB/s eta 0:00:01


In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import optuna

In [7]:
# Load datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Feature engineering
for df in [train_df, test_df]:
    df['b9'] = df['b9'].fillna(0)
    df['NBR'] = (df['b8'] - df['b12']) / (df['b8'] + df['b12'])
    df['b12'] = df['b12'].fillna(0)
    df['b11'] = df['b11'].fillna(0)
    df['NDMI2'] = (df['b8'] - df['b11']) / (df['b8'] + df['b11'])
    df['b6'] = df['b6'].fillna(0)
    df['NDSI'] = (df['b3'] - df['b11']) / (df['b3'] + df['b11'])
    df['b8_a'] = df['b8_a'].fillna(0)
    df['b8'] = df['b8'].fillna(0)
    df['b5'] = df['b5'].fillna(0)

# Encode the target variable
label_encoder = LabelEncoder()
train_df['nforest_type_encoded'] = label_encoder.fit_transform(train_df['nforest_type'])

# Define features and target
X = train_df.drop(columns=['id', 'nforest_type', 'nforest_type_encoded'])
y = train_df['nforest_type_encoded']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Optionally, use PCA for dimensionality reduction
pca = PCA(n_components=10)  # Adjust n_components as needed
X_pca = pca.fit_transform(X_scaled)

# Split the transformed data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Data Augmentation using SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train_smote, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_smote.values, dtype=torch.long)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.long)

# Create data loaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [8]:
# Define the neural network
class ForestNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(ForestNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.bn3 = nn.BatchNorm1d(hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.dropout(out)
        
        out = self.fc2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.dropout(out)
        
        out = self.fc3(out)
        out = self.bn3(out)
        out = self.relu(out)
        out = self.dropout(out)
        
        out = self.fc4(out)
        return out

# Define the input and output dimensions globally
input_dim = X_train_smote.shape[1]
output_dim = len(label_encoder.classes_)

# Define the objective function for Optuna
def objective(trial):
    hidden_dim = trial.suggest_categorical('hidden_dim', [128, 256, 512])
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
    
    model = ForestNN(input_dim, hidden_dim, output_dim)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    
    # Train the model
    for epoch in range(10):  # use fewer epochs for faster optimization
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
    
    # Evaluate the model on the validation set
    model.eval()
    val_predictions = []
    with torch.no_grad():
        for inputs in val_loader:
            outputs = model(inputs[0])
            _, predicted = torch.max(outputs.data, 1)
            val_predictions.extend(predicted.numpy())
    
    val_accuracy = accuracy_score(y_val, val_predictions)
    return val_accuracy

# Run the Optuna study to find the best hyperparameters
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print('Best trial:')
trial = study.best_trial
print(f'  Value: {trial.value}')
print('  Params: ')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

[I 2024-06-07 15:29:00,204] A new study created in memory with name: no-name-5cac83f3-29f7-4ec9-af77-84709e1b5fee
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
[I 2024-06-07 15:30:30,613] Trial 0 finished with value: 0.6775181922635006 and parameters: {'hidden_dim': 256, 'lr': 0.029729278298211863}. Best is trial 0 with value: 0.6775181922635006.
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
[I 2024-06-07 15:31:38,470] Trial 1 finished with value: 0.6319417847567982 and parameters: {'hidden_dim': 512, 'lr': 1.964596449739411e-05}. Best is trial 0 with value: 0.6775181922635006.
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
[I 2024-06-07 15:32:27,818] Trial 2 finished with value: 0.6162389888931444 and parameters: {'hidden_dim': 256, 'lr': 4.349641382693916e-05}. Best is trial 0 with value: 0.6775181922635006.
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
[I 2024-06-07 15:33:06,174] Trial 3 finished with value: 0.6690923018000766 and parameters: {'hidden_dim': 128, 'lr':

Best trial:
  Value: 0.6974339333588664
  Params: 
    hidden_dim: 512
    lr: 0.0060997392775260255


In [9]:
# Train the model with the best hyperparameters
best_hidden_dim = trial.params['hidden_dim']
best_lr = trial.params['lr']

model = ForestNN(input_dim, best_hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=best_lr)

# Train the model with the best hyperparameters
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Evaluate the model on the validation set
model.eval()
val_predictions = []
with torch.no_grad():
    for inputs in val_loader:
        outputs = model(inputs[0])
        _, predicted = torch.max(outputs.data, 1)
        val_predictions.extend(predicted.numpy())

val_accuracy = accuracy_score(y_val, val_predictions)
print(f'Validation Accuracy: {val_accuracy:.4f}')

Validation Accuracy: 0.6917


In [34]:
# Cross-Validation to evaluate the model
scores = cross_val_score(model, X_train_smote, y_train_smote, cv=5, scoring='accuracy')
print("Cross-validation scores: ", scores)
print("Mean cross-validation score: ", scores.mean())

Cross-validation scores:  [0.71590909 0.68110795 0.71946023 0.70205966 0.72753108]
Mean cross-validation score:  0.7092136030599063


In [6]:
# Evaluate the model on the validation set
val_predictions = best_ridge.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)
print(f'Validation Accuracy: {val_accuracy:.4f}')

Validation Accuracy: 0.5948


In [7]:
# Prepare the test data
test_X = test_df.drop(columns=['id'])
test_X_scaled = scaler.transform(test_X)
test_X_pca = pca.transform(test_X_scaled)

# Make predictions on the test set
test_predictions = best_ridge.predict(test_X_pca)
test_predictions_decoded = label_encoder.inverse_transform(test_predictions)

In [8]:
# Load your sample submission file
sample_submission = pd.read_csv('sample_submission.csv')

# Merge the test data with sample submission to fill in the predicted values
predictions_df = pd.DataFrame({'id': test_df['id'], 'nforest_type': test_predictions_decoded})
final_submission = sample_submission.merge(predictions_df, on='id', how='left', suffixes=('', '_predicted'))

# Fill the missing values in sample submission with the predicted values
final_submission['nforest_type'] = final_submission['nforest_type'].combine_first(final_submission['nforest_type_predicted'])

# Drop the predicted column as it's no longer needed
final_submission = final_submission.drop(columns=['nforest_type_predicted'])

# Save the final submission
final_submission.to_csv('ridge_submission.csv', index=False)
