Encoding and Dataset:

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

In [3]:
# Load datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [4]:
# Encode the target variable
label_encoder = LabelEncoder()
train_df['nforest_type_encoded'] = label_encoder.fit_transform(train_df['nforest_type'])

# Define features and target
X = train_df.drop(columns=['id', 'nforest_type', 'nforest_type_encoded'])
y = train_df['nforest_type_encoded']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Optionally, use PCA for dimensionality reduction
pca = PCA(n_components=10)  # Adjust n_components as needed
X_pca = pca.fit_transform(X_scaled)

# Split the transformed data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)  # Convert Series to numpy array first
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.long)  # Convert Series to numpy array first

# Create data loaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

Define and Train the Neural Network:

In [6]:
class ImprovedForestNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(ImprovedForestNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc3(out)
        return out

# Initialize the neural network, loss function, and optimizer
input_dim = X_train.shape[1]
hidden_dim = 128
output_dim = len(label_encoder.classes_)
model = ImprovedForestNN(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

# Train the neural network
num_epochs = 30
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    scheduler.step()
    
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    print(f'Epoch {epoch+1}/{num_epochs}, '
          f'Train Loss: {running_loss/len(train_loader):.4f}, '
          f'Val Loss: {val_loss/len(val_loader):.4f}, '
          f'Val Accuracy: {100 * correct / total:.2f}%')


Epoch 1/30, Train Loss: 0.9550, Val Loss: 0.8225, Val Accuracy: 64.88%
Epoch 2/30, Train Loss: 0.8731, Val Loss: 0.7839, Val Accuracy: 66.18%
Epoch 3/30, Train Loss: 0.8499, Val Loss: 0.7644, Val Accuracy: 66.83%
Epoch 4/30, Train Loss: 0.8376, Val Loss: 0.7465, Val Accuracy: 67.68%
Epoch 5/30, Train Loss: 0.8226, Val Loss: 0.7352, Val Accuracy: 68.06%
Epoch 6/30, Train Loss: 0.8211, Val Loss: 0.7399, Val Accuracy: 68.02%
Epoch 7/30, Train Loss: 0.8040, Val Loss: 0.7285, Val Accuracy: 68.59%
Epoch 8/30, Train Loss: 0.8105, Val Loss: 0.7277, Val Accuracy: 68.33%
Epoch 9/30, Train Loss: 0.8061, Val Loss: 0.7276, Val Accuracy: 68.29%
Epoch 10/30, Train Loss: 0.8099, Val Loss: 0.7205, Val Accuracy: 68.75%
Epoch 11/30, Train Loss: 0.7953, Val Loss: 0.7198, Val Accuracy: 68.67%
Epoch 12/30, Train Loss: 0.7969, Val Loss: 0.7220, Val Accuracy: 68.79%
Epoch 13/30, Train Loss: 0.7911, Val Loss: 0.7191, Val Accuracy: 68.63%
Epoch 14/30, Train Loss: 0.7884, Val Loss: 0.7163, Val Accuracy: 68.79%
E

Make Prediction:

In [7]:
# Prepare the test data
test_X = test_df.drop(columns=['id'])
test_X_scaled = scaler.transform(test_X)
test_X_pca = pca.transform(test_X_scaled)
test_X_tensor = torch.tensor(test_X_pca, dtype=torch.float32)

# Make predictions
model.eval()
with torch.no_grad():
    test_outputs = model(test_X_tensor)
    _, test_predictions = torch.max(test_outputs.data, 1)

test_predictions_decoded = label_encoder.inverse_transform(test_predictions.numpy())


Inference:

In [8]:
# Load your sample submission file
sample_submission = pd.read_csv('sample_submission.csv')

# Merge the test data with sample submission to fill in the predicted values
predictions_df = pd.DataFrame({'id': test_df['id'], 'nforest_type': test_predictions_decoded})
final_submission = sample_submission.merge(predictions_df, on='id', how='left', suffixes=('', '_predicted'))

# Fill the missing values in sample submission with the predicted values
final_submission['nforest_type'] = final_submission['nforest_type'].combine_first(final_submission['nforest_type_predicted'])

# Drop the predicted column as it's no longer needed
final_submission = final_submission.drop(columns=['nforest_type_predicted'])

# Save the final submission
final_submission.to_csv('/submitfile/NN++.csv', index=False)