In [37]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from catboost import CatBoostClassifier


In [38]:
# Load datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Feature engineering
for df in [train_df, test_df]:
    df['b9'] = df['b9'].fillna(0)
    df['NBR'] = (df['b8'] - df['b12']) / (df['b8'] + df['b12'])
    df['b12'] = df['b12'].fillna(0)
    df['b11'] = df['b11'].fillna(0)
    df['NDMI2'] = (df['b8'] - df['b11']) / (df['b8'] + df['b11'])
    df['b6'] = df['b6'].fillna(0)
    df['NDSI'] = (df['b3'] - df['b11']) / (df['b3'] + df['b11'])
    df['b8_a'] = df['b8_a'].fillna(0)
    df['b8'] = df['b8'].fillna(0)
    df['b5'] = df['b5'].fillna(0)
    df["EVI"]  = 2.5*(((df["b8"]-df["b4"])/(df["b8"]+6*df["b6"]-7.5*df["b2"]))+1)
    df["EVI2"] = 2.4 * (df["b8"] - df["b4"]) / (df["b8"] + df["b4"] + 1.0)
    df["GNDVI"] = (df["b8"] - df["b3"]) / (df["b8"] + df["b3"])
    df["NDVI"] = (df["b8"] - df["b4"]) / (df["b8"] + df["b4"])

# Encode the target variable
label_encoder = LabelEncoder()
train_df['nforest_type_encoded'] = label_encoder.fit_transform(train_df['nforest_type'])

# Define features and target
X = train_df.drop(columns=['id', 'nforest_type', 'nforest_type_encoded'])
y = train_df['nforest_type_encoded']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Optionally, use PCA for dimensionality reduction
pca = PCA(n_components=15)  # Adjust n_components as needed
X_pca = pca.fit_transform(X_scaled)

# Split the transformed data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)  # Convert Series to numpy array first
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.long)  # Convert Series to numpy array first

# Create data loaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Define and Train the Neural Network
#  class ForestNN(nn.Module):
#     def __init__(self, input_dim, hidden_dim, output_dim):
#         super(ForestNN, self).__init__()
#         self.fc1 = nn.Linear(input_dim, hidden_dim)
#         self.bn1 = nn.BatchNorm1d(hidden_dim)
#         self.fc2 = nn.Linear(hidden_dim, hidden_dim)
#         self.bn2 = nn.BatchNorm1d(hidden_dim)
#         self.fc3 = nn.Linear(hidden_dim, hidden_dim)
#         self.bn3 = nn.BatchNorm1d(hidden_dim)
#         self.fc4 = nn.Linear(hidden_dim, output_dim)
#         self.relu = nn.ReLU()
#         self.dropout = nn.Dropout(0.5)
    
#     def forward(self, x):
#         out = self.fc1(x)
#         out = self.bn1(out)
#         out = self.relu(out)
#         out = self.dropout(out)
        
#         out = self.fc2(out)
#         out = self.bn2(out)
#         out = self.relu(out)
#         out = self.dropout(out)
        
#         out = self.fc3(out)
#         out = self.bn3(out)
#         out = self.relu(out)
#         out = self.dropout(out)
        
#         out = self.fc4(out)
#         return out


In [39]:
class ForestNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(ForestNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.bn3 = nn.BatchNorm1d(hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, output_dim)  # Ensure this layer outputs the correct number of classes
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.dropout(out)
        
        out = self.fc2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.dropout(out)
        
        out = self.fc3(out)
        out = self.bn3(out)
        out = self.relu(out)
        out = self.dropout(out)
        
        out = self.fc4(out)  # Ensure this layer outputs the correct number of classes
        return out

    
# Initialize the neural network, loss function and optimizer
input_dim = X_train.shape[1]
hidden_dim = 256  # Increase the number of neurons
output_dim = len(label_encoder.classes_)
model = ForestNN(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001)

# Learning rate scheduler
scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.01, steps_per_epoch=len(train_loader), epochs=50)

# Early stopping parameters
early_stopping_patience = 5
best_val_loss = float('inf')
epochs_no_improve = 0

# Train the neural network
num_epochs = 50
train_losses = []
val_losses = []

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    train_losses.append(running_loss / len(train_loader))
    
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_losses.append(val_loss / len(val_loader))
    
    scheduler.step()
    
    print(f'Epoch {epoch+1}/{num_epochs}, '
          f'Train Loss: {train_losses[-1]:.4f}, '
          f'Val Loss: {val_losses[-1]:.4f}, '
          f'Val Accuracy: {100 * correct / total:.2f}%')

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    if epochs_no_improve == early_stopping_patience:
        print(f'Early stopping at epoch {epoch+1}')
        break

Epoch 1/50, Train Loss: 0.9916, Val Loss: 0.8379, Val Accuracy: 61.32%
Epoch 2/50, Train Loss: 0.9066, Val Loss: 0.8073, Val Accuracy: 63.81%
Epoch 3/50, Train Loss: 0.8683, Val Loss: 0.7987, Val Accuracy: 65.03%
Epoch 4/50, Train Loss: 0.8532, Val Loss: 0.7534, Val Accuracy: 67.48%
Epoch 5/50, Train Loss: 0.8288, Val Loss: 0.7489, Val Accuracy: 67.14%
Epoch 6/50, Train Loss: 0.8202, Val Loss: 0.7360, Val Accuracy: 68.40%
Epoch 7/50, Train Loss: 0.8138, Val Loss: 0.7243, Val Accuracy: 68.21%
Epoch 8/50, Train Loss: 0.8119, Val Loss: 0.7170, Val Accuracy: 68.94%
Epoch 9/50, Train Loss: 0.8100, Val Loss: 0.7159, Val Accuracy: 68.29%
Epoch 10/50, Train Loss: 0.7995, Val Loss: 0.7146, Val Accuracy: 68.33%
Epoch 11/50, Train Loss: 0.7959, Val Loss: 0.7188, Val Accuracy: 68.59%
Epoch 12/50, Train Loss: 0.7922, Val Loss: 0.7084, Val Accuracy: 69.32%
Epoch 13/50, Train Loss: 0.7861, Val Loss: 0.7071, Val Accuracy: 68.63%
Epoch 14/50, Train Loss: 0.7880, Val Loss: 0.7013, Val Accuracy: 69.48%
E

In [40]:
# # Define a Wrapper for the Neural Network
# class NeuralNetworkWrapper(BaseEstimator, ClassifierMixin):
#     def __init__(self, nn_model):
#         self.nn_model = nn_model

#     def fit(self, X, y):
#         # Neural network is already trained
#         pass

#     def predict_proba(self, X):
#         self.nn_model.eval()
#         with torch.no_grad():
#             outputs = self.nn_model(torch.tensor(X, dtype=torch.float32))
#             probabilities = torch.softmax(outputs, dim=1)
#             return probabilities.numpy()

#     def predict(self, X):
#         probabilities = self.predict_proba(X)
#         return probabilities.argmax(axis=1)
    
    
# class RegressionWrapper(BaseEstimator, ClassifierMixin):
#     def __init__(self, regression_model, classes):
#         self.regression_model = regression_model
#         self.classes = classes

#     def fit(self, X, y):
#         self.regression_model.fit(X, y)

#     def predict(self, X):
#         predictions = self.regression_model.predict(X)
#         return np.round(predictions).astype(int)

#     def predict_proba(self, X):
#         predictions = self.regression_model.predict(X)
#         proba = np.zeros((predictions.shape[0], len(self.classes)))
#         for i, pred in enumerate(predictions):
#             closest_class = np.argmin(np.abs(self.classes - pred))
#             proba[i, closest_class] = 1.0
#         return proba
    
# # Define a Wrapper for the Regression Model
# class RegressionWrapper(BaseEstimator, ClassifierMixin):
#     def __init__(self, regression_model, classes):
#         self.regression_model = regression_model
#         self.classes = classes

#     def fit(self, X, y):
#         self.regression_model.fit(X, y)

#     def predict(self, X):
#         predictions = self.regression_model.predict(X)
#         return np.round(predictions).astype(int)

#     def predict_proba(self, X):
#         predictions = self.regression_model.predict(X)
#         proba = np.zeros((predictions.shape[0], len(self.classes)))
#         for i, pred in enumerate(predictions):
#             closest_class = np.argmin(np.abs(self.classes - pred))
#             proba[i, closest_class] = 1.0
#         return proba

In [41]:
# Define a Wrapper for the Neural Network
class NeuralNetworkWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, nn_model):
        self.nn_model = nn_model

    def fit(self, X, y):
        # Neural network is already trained
        pass

    def predict_proba(self, X):
        self.nn_model.eval()
        with torch.no_grad():
            outputs = self.nn_model(torch.tensor(X, dtype=torch.float32))
            probabilities = torch.softmax(outputs, dim=1)
            return probabilities.numpy()

    def predict(self, X):
        probabilities = self.predict_proba(X)
        return probabilities.argmax(axis=1)

# Define Custom Wrappers for Non-Probabilistic Models
class RidgeClassifierWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, regression_model, classes):
        self.regression_model = regression_model
        self.classes = classes

    def fit(self, X, y):
        self.regression_model.fit(X, y)

    def predict(self, X):
        predictions = self.regression_model.predict(X)
        return np.round(predictions).astype(int)

    def predict_proba(self, X):
        predictions = self.regression_model.predict(X)
        proba = np.zeros((predictions.shape[0], len(self.classes)))
        for i, pred in enumerate(predictions):
            closest_class = np.argmin(np.abs(self.classes - pred))
            proba[i, closest_class] = 1.0
        return proba

In [42]:
catboost_model = CatBoostClassifier(
    iterations=2500,  # เพิ่มจำนวน iterations เพื่อให้ได้ประสิทธิภาพสูงสุด
    learning_rate=0.0537,  # ลด learning rate เพื่อการปรับแต่งโมเดลที่ดีขึ้น
    depth=9,  # เพิ่ม depth เพื่อให้โมเดลสามารถจับความสัมพันธ์ที่ซับซ้อนได้
    eval_metric='Accuracy',  # ใช้ metric การประเมินผลเป็น accuracy
    random_seed=42,
    l2_leaf_reg= 0.0534,
    bootstrap_type= 'MVS',
    verbose=500  # ตั้งค่า verbosity ให้แสดงผลทุก 1000 iterations
)

catboost_model2 = CatBoostClassifier(
    iterations=2500,  # เพิ่มจำนวน iterations เพื่อให้ได้ประสิทธิภาพสูงสุด
    learning_rate=0.1,  # ลด learning rate เพื่อการปรับแต่งโมเดลที่ดีขึ้น
    depth=8,  # เพิ่ม depth เพื่อให้โมเดลสามารถจับความสัมพันธ์ที่ซับซ้อนได้
    eval_metric='Accuracy',  # ใช้ metric การประเมินผลเป็น accuracy
    random_seed=42,
    l2_leaf_reg= 0.0534,
    bootstrap_type= 'MVS',
    verbose=500  # ตั้งค่า verbosity ให้แสดงผลทุก 1000 iterations
)

catboost_model3 = CatBoostClassifier(
    iterations=2500,  # เพิ่มจำนวน iterations เพื่อให้ได้ประสิทธิภาพสูงสุด
    learning_rate=0.0537,  # ลด learning rate เพื่อการปรับแต่งโมเดลที่ดีขึ้น
    depth=7,  # เพิ่ม depth เพื่อให้โมเดลสามารถจับความสัมพันธ์ที่ซับซ้อนได้
    eval_metric='Accuracy',  # ใช้ metric การประเมินผลเป็น accuracy
    random_seed=42,
    l2_leaf_reg= 0.0534,
    bootstrap_type= 'MVS',
    verbose=500  # ตั้งค่า verbosity ให้แสดงผลทุก 1000 iterations
)

In [43]:
# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Train XGBoost
xgb = XGBClassifier(n_estimators=100, random_state=42)
xgb.fit(X_train, y_train)

# Train LightGBM
lgb = LGBMClassifier(n_estimators=100, random_state=42)
lgb.fit(X_train, y_train)

# Train SVM
# svm = SVC(probability=True, random_state=42)
# svm.fit(X_train, y_train)

# Train Ridge Regression
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

# Wrap the neural network and regression model
nn_wrapper = NeuralNetworkWrapper(model)
# ridge_wrapper = RegressionWrapper(ridge, classes=np.arange(len(label_encoder.classes_)))
ridge_wrapper = RidgeClassifierWrapper(ridge, classes=np.arange(len(label_encoder.classes_)))


# Fit the 3-cats models
catboost_model.fit(X_train, y_train)
catboost_model2.fit(X_train, y_train)
catboost_model3.fit(X_train, y_train)

# Create an ensemble model
ensemble = VotingClassifier(estimators=[
    ('rf', rf), 
    ('xgb', xgb), 
    ('lgb', lgb), 
    ('nn', nn_wrapper), 
    ('ridge', ridge_wrapper),
    ('catboost1', catboost_model),
    ('catboost2', catboost_model2),
    ('catboost3', catboost_model3)
], voting='soft')

# Fit the ensemble
ensemble.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002552 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3570
[LightGBM] [Info] Number of data points in the train set: 10442, number of used features: 14
[LightGBM] [Info] Start training from score -1.044827
[LightGBM] [Info] Start training from score -1.615393
[LightGBM] [Info] Start training from score -0.799764
0:	learn: 0.5898295	total: 107ms	remaining: 4m 27s
500:	learn: 0.9819000	total: 35.2s	remaining: 2m 20s
1000:	learn: 1.0000000	total: 1m 10s	remaining: 1m 45s
1500:	learn: 1.0000000	total: 1m 46s	remaining: 1m 10s
2000:	learn: 1.0000000	total: 2m 17s	remaining: 34.3s
2499:	learn: 1.0000000	total: 2m 49s	remaining: 0us
0:	learn: 0.5812105	total: 115ms	remaining: 4m 47s
500:	learn: 0.9917640	total: 17.8s	remaining: 1m 11s
1000:	learn: 1.0000000	total: 34.1s	remaining: 51.1s
1500:	le

In [44]:
# Debugging function to check predict_proba outputs
def check_predict_proba_shapes(models, X):
    for name, model in models:
        proba = model.predict_proba(X)
        print(f"{name}: {proba.shape}")

# Check shapes of predict_proba outputs for each model
check_predict_proba_shapes(ensemble.estimators, X_val)

rf: (2611, 3)
xgb: (2611, 3)
lgb: (2611, 3)
nn: (2611, 3)
ridge: (2611, 3)
catboost1: (2611, 3)
catboost2: (2611, 3)
catboost3: (2611, 3)


13 fea: 0.7204
14 fea: 0.7296
15 fea: 0.7193

In [47]:
# Evaluate the ensemble on the validation set
val_predictions = ensemble.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)
print(f'Validation Accuracy of the Ensemble: {val_accuracy:.4f}')

Validation Accuracy of the Ensemble: 0.7158


In [48]:
# Prepare the test data
test_X = test_df.drop(columns=['id'])
test_X_scaled = scaler.transform(test_X)
test_X_pca = pca.transform(test_X_scaled)

# Make predictions with the ensemble
test_predictions = ensemble.predict(test_X_pca)
test_predictions_decoded = label_encoder.inverse_transform(test_predictions)

# Load your sample submission file
sample_submission = pd.read_csv('sample_submission.csv')

# Merge the test data with sample submission to fill in the predicted values
predictions_df = pd.DataFrame({'id': test_df['id'], 'nforest_type': test_predictions_decoded})
final_submission = sample_submission.merge(predictions_df, on='id', how='left', suffixes=('', '_predicted'))

# Fill the missing values in sample submission with the predicted values
final_submission['nforest_type'] = final_submission['nforest_type'].combine_first(final_submission['nforest_type_predicted'])

# Drop the predicted column as it's no longer needed
final_submission = final_submission.drop(columns=['nforest_type_predicted'])

# Save the final submission
final_submission.to_csv('3cats_ensem+regress+fea14_submission.csv', index=False)
