Load the datasets and encode the target:

In [4]:
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the labelled training table and the unlabelled test table from the
# working directory.
train_df, test_df = pd.read_csv('train.csv'), pd.read_csv('test.csv')

# Convert the `nforest_type` labels to integer codes and store them next to the
# original column. The fitted encoder is kept so the codes can be mapped back
# to labels after prediction.
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(train_df['nforest_type'])
train_df['nforest_type_encoded'] = encoded_target

# Features are every column except the row identifier and the two label
# columns; the target is the integer-encoded label.
non_feature_columns = ['id', 'nforest_type', 'nforest_type_encoded']
X = train_df.drop(columns=non_feature_columns)
y = train_df['nforest_type_encoded']

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


Train the classification model:

In [3]:
# Fit a 100-tree random forest on the training split (seeded so the fit is
# repeatable).
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Score the held-out validation split: a per-class precision/recall/F1 report
# followed by the overall accuracy.
y_pred = model.predict(X_val)
validation_report = classification_report(y_val, y_pred)
validation_accuracy = accuracy_score(y_val, y_pred)
print(validation_report)
print('Validation Accuracy:', validation_accuracy)


              precision    recall  f1-score   support

           0       0.66      0.63      0.65       930
           1       0.79      0.66      0.72       509
           2       0.68      0.76      0.72      1172

    accuracy                           0.69      2611
   macro avg       0.71      0.68      0.69      2611
weighted avg       0.70      0.69      0.69      2611

Validation Accuracy: 0.6928379931060896


Make predictions on the test set:

In [5]:
# Prepare the test data.
# Select the feature columns by name, in the same order as the training
# features, instead of only dropping 'id'. This keeps the test matrix aligned
# with what the model was fitted on even if test.csv has its columns in a
# different order or carries extra columns. A training feature that is missing
# from test.csv raises a KeyError here, naming the missing column.
feature_columns = list(X_train.columns)
test_X = test_df[feature_columns]

# Make predictions (the model outputs the integer-encoded class codes).
test_predictions = model.predict(test_X)

# Decode the integer codes back to the original `nforest_type` labels.
test_predictions_decoded = label_encoder.inverse_transform(test_predictions)


Build the submission file from the predictions:

In [6]:
# Load the sample submission; its rows and column layout are used as the
# template for the output file.
# The steps below rely on it having both an `id` and an `nforest_type` column.
sample_submission = pd.read_csv('sample_submission.csv')

# Pair each test id with its decoded prediction, then left-join onto the
# template so every template row is kept in its original order. The
# '_predicted' suffix keeps the template's own `nforest_type` column separate
# from the predicted one.
predictions_df = pd.DataFrame({'id': test_df['id'], 'nforest_type': test_predictions_decoded})
final_submission = sample_submission.merge(predictions_df, on='id', how='left', suffixes=('', '_predicted'))

# Use the model's prediction wherever one exists and fall back to the
# template value only for ids that have no prediction. Taking the template
# value first would silently discard every prediction whenever the sample
# file ships with placeholder labels instead of empty cells.
final_submission['nforest_type'] = final_submission['nforest_type_predicted'].combine_first(final_submission['nforest_type'])

# Drop the helper column now that it has been folded into `nforest_type`.
final_submission = final_submission.drop(columns=['nforest_type_predicted'])

# Save the final submission, creating the output directory first so the write
# does not fail on a fresh checkout.
output_path = Path('submitfile/final_submission.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
final_submission.to_csv(output_path, index=False)
