In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LinearRegression, RidgeClassifier
from sklearn.metrics import accuracy_score

In [18]:
# Load datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Feature engineering
for df in [train_df, test_df]:
    df['b9'] = df['b9'].fillna(0)
    df['NBR'] = (df['b8'] - df['b12']) / (df['b8'] + df['b12'])
    df['b12'] = df['b12'].fillna(0)
    df['b11'] = df['b11'].fillna(0)
    df['NDMI2'] = (df['b8'] - df['b11']) / (df['b8'] + df['b11'])
    df['b6'] = df['b6'].fillna(0)
    df['NDSI'] = (df['b3'] - df['b11']) / (df['b3'] + df['b11'])
    df['b8_a'] = df['b8_a'].fillna(0)
    df['b8'] = df['b8'].fillna(0)
    df['b5'] = df['b5'].fillna(0)
    df["EVI"]  = 2.5*(((df["b8"]-df["b4"])/(df["b8"]+6*df["b6"]-7.5*df["b2"]))+1)
    df["EVI2"] = 2.4 * (df["b8"] - df["b4"]) / (df["b8"] + df["b4"] + 1.0)
    df["GNDVI"] = (df["b8"] - df["b3"]) / (df["b8"] + df["b3"])
    df["NDVI"] = (df["b8"] - df["b4"]) / (df["b8"] + df["b4"])

# Encode the target variable
label_encoder = LabelEncoder()
train_df['nforest_type_encoded'] = label_encoder.fit_transform(train_df['nforest_type'])

# Define features and target
X = train_df.drop(columns=['id', 'nforest_type', 'nforest_type_encoded'])
y = train_df['nforest_type_encoded']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Optionally, use PCA for dimensionality reduction
pca = PCA(n_components=10)  # Adjust n_components as needed
X_pca = pca.fit_transform(X_scaled)

# Split the transformed data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size=0.2, random_state=42)

In [19]:
# Data Augmentation using SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

In [29]:
# Define classification models
models = {
    'Ridge': RidgeClassifier(),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(),
    'GradientBoosting': GradientBoostingClassifier(),
    'SVC': SVC(probability=True)
}

# Define hyperparameter grids for each model
param_grids = {
    'Ridge': {'alpha': [0.1, 1.0, 10.0, 100.0]},
    'LogisticRegression': {'C': [0.01, 0.1, 1, 10]},
    'RandomForest': {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
    'GradientBoosting': {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 5]},
    'SVC': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
}

In [30]:
# Function to perform hyperparameter tuning and cross-validation
def tune_and_evaluate(models, param_grids, X_train, y_train):
    best_models = {}
    for model_name, model in models.items():
        print(f"Processing {model_name}...")
        grid_search = GridSearchCV(estimator=model, param_grid=param_grids[model_name], cv=3, n_jobs=-1, verbose=2)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_models[model_name] = best_model
        print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='accuracy')
        print(f"Cross-validation scores for {model_name}: {scores}")
        print(f"Mean cross-validation score for {model_name}: {scores.mean()}\n")
    return best_models

In [31]:
# Perform tuning and evaluation
best_models = tune_and_evaluate(models, param_grids, X_train_smote, y_train_smote)


Processing Ridge...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best parameters for Ridge: {'alpha': 100.0}
Cross-validation scores for Ridge: [0.61612216 0.58806818 0.60901989 0.60014205 0.60106572]
Mean cross-validation score for Ridge: 0.6028835984175682

Processing LogisticRegression...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best parameters for LogisticRegression: {'C': 0.1}
Cross-validation scores for LogisticRegression: [0.63139205 0.5859375  0.61612216 0.59872159 0.61172291]
Mean cross-validation score for LogisticRegression: 0.6087792416841595

Processing RandomForest...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best parameters for RandomForest: {'max_depth': None, 'n_estimators': 200}
Cross-validation scores for RandomForest: [0.7631392  0.73828125 0.76598011 0.76455966 0.84262877]
Mean cross-validation score for RandomForest: 0.7749178003390925

Processing GradientBoosting...
Fitting 3 folds for each of 8 candidates, tota

In [33]:
# Compare models based on cross-validation scores
for model_name, model in best_models.items():
    val_predictions = model.predict(X_val)
    val_accuracy = accuracy_score(y_val, val_predictions)
    print(f'Validation Accuracy for {model_name}: {val_accuracy:.4f}')

Validation Accuracy for Ridge: 0.5948
Validation Accuracy for LogisticRegression: 0.6063
Validation Accuracy for RandomForest: 0.7001
Validation Accuracy for GradientBoosting: 0.6821
Validation Accuracy for SVC: 0.7078


In [4]:
ridge = RidgeClassifier()
grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_smote, y_train_smote)

best_ridge = grid_search.best_estimator_
print("Best parameters found: ", grid_search.best_params_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best parameters found:  {'alpha': 100.0}


In [34]:
# Cross-Validation to evaluate the model
scores = cross_val_score(model, X_train_smote, y_train_smote, cv=5, scoring='accuracy')
print("Cross-validation scores: ", scores)
print("Mean cross-validation score: ", scores.mean())

Cross-validation scores:  [0.71590909 0.68110795 0.71946023 0.70205966 0.72753108]
Mean cross-validation score:  0.7092136030599063


In [6]:
# Evaluate the model on the validation set
val_predictions = best_ridge.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)
print(f'Validation Accuracy: {val_accuracy:.4f}')

Validation Accuracy: 0.5948


In [7]:
# Prepare the test data
test_X = test_df.drop(columns=['id'])
test_X_scaled = scaler.transform(test_X)
test_X_pca = pca.transform(test_X_scaled)

# Make predictions on the test set
test_predictions = best_ridge.predict(test_X_pca)
test_predictions_decoded = label_encoder.inverse_transform(test_predictions)

In [8]:
# Load your sample submission file
sample_submission = pd.read_csv('sample_submission.csv')

# Merge the test data with sample submission to fill in the predicted values
predictions_df = pd.DataFrame({'id': test_df['id'], 'nforest_type': test_predictions_decoded})
final_submission = sample_submission.merge(predictions_df, on='id', how='left', suffixes=('', '_predicted'))

# Fill the missing values in sample submission with the predicted values
final_submission['nforest_type'] = final_submission['nforest_type'].combine_first(final_submission['nforest_type_predicted'])

# Drop the predicted column as it's no longer needed
final_submission = final_submission.drop(columns=['nforest_type_predicted'])

# Save the final submission
final_submission.to_csv('ridge_submission.csv', index=False)
