In [33]:
import pandas as pd 

In [34]:
# Load the preprocessed train and test tables. Both paths are relative, so
# they resolve against the working directory the notebook is run from.
df_train, df_test = (
    pd.read_csv('../data/processed/train.csv'),
    pd.read_csv('../data/processed/test.csv'),
)


In [None]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report


# ---------------------------------------------------------------------------
# 1. Encode the categorical columns
# ---------------------------------------------------------------------------
# Columns treated as categorical and replaced by ordinal codes.
categorical_cols = [
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status',
]

# Categories that were not present when the encoder was fitted are mapped to
# -1 at transform time instead of raising an error.
ord_enc = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')

# The mapping is learned from the training frame only and then applied to both
# frames. NOTE: this overwrites the categorical columns of df_train / df_test
# in place.
ord_enc.fit(df_train[categorical_cols])
df_train[categorical_cols] = ord_enc.transform(df_train[categorical_cols])
df_test[categorical_cols] = ord_enc.transform(df_test[categorical_cols])

# ---------------------------------------------------------------------------
# 2. Build feature matrices and the target
# ---------------------------------------------------------------------------
# Target vector, plus the test identifiers kept aside for the submission file.
y_train = df_train['damage_grade']
test_ids = df_test['building_id']

# Training features: everything except the target, the two *_sum columns and
# the identifier.
X_train = df_train.drop(
    columns=[
        'damage_grade',
        'superstructure_sum',
        'secondary_use_sum',
        'building_id',
    ]
)

# Test features: everything except the identifier.
X_test = df_test.drop(columns=['building_id'])

# Restrict both matrices to the columns they share so the model sees the same
# feature set (in the same order) at fit and predict time.
common_cols = X_train.columns.intersection(X_test.columns)
X_train, X_test = X_train[common_cols], X_test[common_cols]

# ---------------------------------------------------------------------------
# 3. Hold out a validation set and fit the model
# ---------------------------------------------------------------------------
# 80/20 split, stratified on the target so class proportions are preserved.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    test_size=0.2,
    random_state=42,
)

# Random forest with class weights balanced against class frequencies.
# fit() returns the estimator itself, so `model` is the fitted forest.
model = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=100,
    random_state=42,
).fit(X_tr, y_tr)

# ---------------------------------------------------------------------------
# 4. Evaluate on the validation set
# ---------------------------------------------------------------------------
y_val_pred = model.predict(X_val)
f1 = f1_score(y_val, y_val_pred, average='macro')
print(f"Validation Macro F1 Score: {f1:.4f}\n")
print(classification_report(y_val, y_val_pred))

# ---------------------------------------------------------------------------
# 5. Predict the test set and write the submission file
# ---------------------------------------------------------------------------
test_preds = model.predict(X_test)

# One row per test building: its identifier and the predicted damage grade.
submission = pd.DataFrame({'building_id': test_ids, 'damage_grade': test_preds})
submission.to_csv('submission.csv', index=False)
print("Submission saved as submission.csv")


Validation Macro F1 Score: 0.6162

              precision    recall  f1-score   support

           1       0.59      0.34      0.43      1848
           2       0.74      0.85      0.79     15116
           3       0.70      0.57      0.63      8059

    accuracy                           0.72     25023
   macro avg       0.68      0.59      0.62     25023
weighted avg       0.72      0.72      0.71     25023

Submission saved as submission.csv
