In [1]:
# AutoGluon Ensemble Pipeline

# This notebook demonstrates how to use AutoGluon to find the best ensemble model for the given dataset.

import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularPredictor

In [1]:
pwd

'c:\\Users\\Windows 11\\OneDrive\\DOCUMENT\\GitHub\\Forest_Classifaction'

In [2]:
# Load datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
# Encode the target variable
label_encoder = LabelEncoder()
train_df['nforest_type_encoded'] = label_encoder.fit_transform(train_df['nforest_type'])

# Define features and target
X = train_df.drop(columns=['id', 'nforest_type', 'nforest_type_encoded'])
y = train_df['nforest_type_encoded']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Optionally, use PCA for dimensionality reduction
pca = PCA(n_components=10)  # Adjust n_components as needed
X_pca = pca.fit_transform(X_scaled)

# Split the transformed data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Convert to DataFrame for AutoGluon
train_data = pd.DataFrame(X_train, columns=[f'feature_{i}' for i in range(X_train.shape[1])])
train_data['target'] = y_train

val_data = pd.DataFrame(X_val, columns=[f'feature_{i}' for i in range(X_val.shape[1])])
val_data['target'] = y_val

test_data = pd.DataFrame(pca.transform(scaler.transform(test_df.drop(columns=['id']))), 
                         columns=[f'feature_{i}' for i in range(X_train.shape[1])])
test_data['id'] = test_df['id']

# Check for non-finite values in the target column
print("Non-finite values in target column:", train_data['target'].isna().sum())

# Drop or fill non-finite values if any
train_data = train_data.dropna(subset=['target'])
val_data = val_data.dropna(subset=['target'])

Non-finite values in target column: 2101


train ensemble model:

In [4]:
from autogluon.tabular import TabularPredictor

# Train the model using AutoGluon with presets
predictor = TabularPredictor(label='target', eval_metric='accuracy').fit(train_data, presets='best_quality')

No path specified. Models will be saved in: "AutogluonModels\ag-20240603_155946"
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 3600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels\ag-20240603_155946\ds_sub_fit\sub_fit_ho.
Running the sub-fit in a ray process to avoid memory leakage.
Spend 925 second

Evaluate model:

In [5]:
# Evaluate the model on the validation set
val_predictions = predictor.predict(val_data.drop(columns=['target']))
val_accuracy = predictor.evaluate(val_data)['accuracy']
print(f'Validation Accuracy of the AutoGluon Ensemble: {val_accuracy:.4f}')

Validation Accuracy of the AutoGluon Ensemble: 0.4438


In [8]:
# Make predictions on the test set
test_predictions = predictor.predict(test_data.drop(columns=['id']))

# Ensure predictions are integers
test_predictions = test_predictions.astype(int)

# Decode the predictions
test_predictions_decoded = label_encoder.inverse_transform(test_predictions)

In [9]:
# Load your sample submission file
sample_submission = pd.read_csv('sample_submission.csv')

# Merge the test data with sample submission to fill in the predicted values
predictions_df = pd.DataFrame({'id': test_df['id'], 'nforest_type': test_predictions_decoded})
final_submission = sample_submission.merge(predictions_df, on='id', how='left', suffixes=('', '_predicted'))

# Fill the missing values in sample submission with the predicted values
final_submission['nforest_type'] = final_submission['nforest_type'].combine_first(final_submission['nforest_type_predicted'])

# Drop the predicted column as it's no longer needed
final_submission = final_submission.drop(columns=['nforest_type_predicted'])

# Save the final submission
final_submission.to_csv('gluon_submission.csv', index=False)