# GBM, RF, KNN Models 

This notebook develops and evaluates three machine learning models—**Gradient Boosting Machine (GBM)**, **Random Forest (RF)**, and **K-Nearest Neighbors (KNN)**—using the preprocessed `Updated_Metadata_2.csv` dataset.

### Key Steps:

1. **Data Preparation**
   - Loads training and testing data.
   - Encodes the `activity` target using label encoding.
   - Standardizes features using `StandardScaler`.

2. **Model Training and Hyperparameter Tuning**
   - Uses `GridSearchCV` with 5-fold cross-validation to find the best hyperparameters for each model.
   - Scoring is based on **balanced accuracy** to account for class imbalance.

3. **Evaluation**
   - Each model is evaluated on the test set using:
     - **Balanced accuracy**
     - **Classification report**
     - **Test accuracy scores**


## Import Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, balanced_accuracy_score, average_precision_score


## Data Preprocessing

In [2]:
# Load the training split and the held-out test split
training_df = pd.read_csv("Updated_Metadata_2.csv")
testing_df = pd.read_csv("Updated_Metadata_Test_2.csv")

# Encode the target variable.
# The encoder is fitted on the TRAINING labels only. The test labels are mapped
# with the already-fitted encoder (`transform`, not `fit_transform`) so that each
# activity gets the same integer code in both splits and `label_encoder.classes_`
# keeps describing the mapping the models were trained on. A label that never
# appeared in training now raises a ValueError instead of silently shifting codes.
target_column = "activity"
label_encoder = LabelEncoder()
training_df[target_column] = label_encoder.fit_transform(training_df[target_column])
testing_df[target_column] = label_encoder.transform(testing_df[target_column])

# Separate features and target; `user_snippet` is an identifier column, not a feature
X_train = training_df.drop(columns=["user_snippet", target_column])
y_train = training_df[target_column]

X_test = testing_df.drop(columns=["user_snippet", target_column])
y_test = testing_df[target_column]

# Standardize features: statistics are learned from the training split only and
# then applied unchanged to the test split (no leakage of test statistics).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Gradient Boosting Machine (GBM)

In [3]:
from sklearn.ensemble import GradientBoostingClassifier

# A fixed seed makes the search reproducible: with `max_features='log2'` the
# features considered at each split are drawn at random, so an unseeded model can
# yield different CV scores (and different "best" hyperparameters) on every run.
gbm = GradientBoostingClassifier(random_state=42)
param_grid_gbm = {
    'n_estimators': [100, 200],  # Number of boosting stages
    'learning_rate': [0.01, 0.1, 0.2],  # Step size shrinkage
    'max_depth': [5, 7],  # Tree depth
    'min_samples_split': [5, 10],  # Minimum samples required to split
    'min_samples_leaf': [2, 4],  # Minimum samples in leaf node
    'max_features': ['log2']  # Features considered per split
}

# Exhaustive search over the grid with 5-fold CV, ranked by balanced accuracy;
# n_jobs=-1 spreads the fits over all available cores.
grid_search_gbm = GridSearchCV(gbm, param_grid_gbm, cv=5, scoring='balanced_accuracy', n_jobs=-1)
grid_search_gbm.fit(X_train, y_train)

# Best candidate, refitted on the full training set by GridSearchCV (refit=True default)
best_gbm = grid_search_gbm.best_estimator_

In [4]:
# Score the tuned GBM on the held-out test split
y_test_pred_gbm = best_gbm.predict(X_test)

# Balanced accuracy on the test predictions
balanced_accuracy_gbm = balanced_accuracy_score(y_true=y_test, y_pred=y_test_pred_gbm)

# Per-class precision / recall / F1, labelled with the encoder's class names
classification_rep_gbm = classification_report(
    y_true=y_test,
    y_pred=y_test_pred_gbm,
    target_names=label_encoder.classes_,
)

print("Best Hyperparameters for GBM:", grid_search_gbm.best_params_)
print("GBM Test Balanced Accuracy:", balanced_accuracy_gbm)
print("Classification Report:\n", classification_rep_gbm)

Best Hyperparameters for GBM: {'learning_rate': 0.2, 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 100}
GBM Test Balanced Accuracy: 0.7832709638343851
Classification Report:
               precision    recall  f1-score   support

  Downstairs       0.62      0.52      0.57       174
     Jogging       0.95      0.98      0.96       689
     Sitting       0.65      1.00      0.79        22
    Standing       1.00      0.72      0.84        43
    Upstairs       0.78      0.53      0.63       238
     Walking       0.86      0.95      0.90       768

    accuracy                           0.86      1934
   macro avg       0.81      0.78      0.78      1934
weighted avg       0.86      0.86      0.86      1934



## Random Forest (RF)

In [5]:
from sklearn.ensemble import RandomForestClassifier

# A fixed seed makes the search reproducible: the forest draws random bootstrap
# samples and random feature subsets, so an unseeded model can yield different
# CV scores (and different "best" hyperparameters) on every run.
rf = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [100, 200],  # Number of trees
    'max_depth': [10, 20],  # Maximum depth of trees
    'min_samples_split': [5, 10],  # Minimum samples required to split
    'min_samples_leaf': [2, 4],  # Minimum samples in leaf node
    'max_features': ['log2'],  # Features considered per split
    'bootstrap': [True]  # Bootstrap sampling
}

# Exhaustive search over the grid with 5-fold CV, ranked by balanced accuracy;
# n_jobs=-1 spreads the fits over all available cores.
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='balanced_accuracy', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

# Best candidate, refitted on the full training set by GridSearchCV (refit=True default)
best_rf = grid_search_rf.best_estimator_

In [6]:
# Score the tuned Random Forest on the held-out test split
y_test_pred_rf = best_rf.predict(X_test)

# Balanced accuracy on the test predictions
balanced_accuracy_rf = balanced_accuracy_score(y_true=y_test, y_pred=y_test_pred_rf)

# Per-class precision / recall / F1, labelled with the encoder's class names
classification_rep_rf = classification_report(
    y_true=y_test,
    y_pred=y_test_pred_rf,
    target_names=label_encoder.classes_,
)

print("Best Hyperparameters for RF:", grid_search_rf.best_params_)
print("RF Test Balanced Accuracy:", balanced_accuracy_rf)
print("Classification Report:\n", classification_rep_rf)

Best Hyperparameters for RF: {'bootstrap': True, 'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
RF Test Balanced Accuracy: 0.7425660351679381
Classification Report:
               precision    recall  f1-score   support

  Downstairs       0.51      0.34      0.41       174
     Jogging       0.95      0.98      0.97       689
     Sitting       0.61      1.00      0.76        22
    Standing       1.00      0.67      0.81        43
    Upstairs       0.77      0.51      0.61       238
     Walking       0.82      0.95      0.88       768

    accuracy                           0.85      1934
   macro avg       0.78      0.74      0.74      1934
weighted avg       0.84      0.85      0.83      1934



## K-Nearest Neighbors (KNN)

In [7]:
from sklearn.neighbors import KNeighborsClassifier

# K-Nearest Neighbors with hyperparameter tuning
knn = KNeighborsClassifier()
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],  # Number of neighbors
    'weights': ['uniform', 'distance'],  # Weighting method
    # 'minkowski' is intentionally not listed: with the default p=2 it is the same
    # distance as 'euclidean', so it only repeated every euclidean fit without
    # adding a distinct candidate to the search.
    'metric': ['euclidean', 'manhattan']  # Distance metric
}

# GridSearchCV for hyperparameter tuning: 5-fold CV ranked by balanced accuracy,
# with fits spread over all available cores.
grid_search_knn = GridSearchCV(knn, param_grid_knn, cv=5, scoring='balanced_accuracy', n_jobs=-1)
grid_search_knn.fit(X_train, y_train)

# Best KNN model, refitted on the full training set by GridSearchCV (refit=True default)
best_knn = grid_search_knn.best_estimator_

In [8]:
# Model evaluation on test data
# Score the tuned KNN on the held-out test split
y_test_pred_knn = best_knn.predict(X_test)

# Balanced accuracy on the test predictions
balanced_accuracy_knn = balanced_accuracy_score(y_true=y_test, y_pred=y_test_pred_knn)

# Per-class precision / recall / F1, labelled with the encoder's class names
classification_rep_knn = classification_report(
    y_true=y_test,
    y_pred=y_test_pred_knn,
    target_names=label_encoder.classes_,
)

print("Best Hyperparameters for KNN:", grid_search_knn.best_params_)
print("KNN Test Balanced Accuracy:", balanced_accuracy_knn)
print("Classification Report:\n", classification_rep_knn)

Best Hyperparameters for KNN: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
KNN Test Balanced Accuracy: 0.7544953568272602
Classification Report:
               precision    recall  f1-score   support

  Downstairs       0.43      0.53      0.48       174
     Jogging       0.95      0.98      0.96       689
     Sitting       0.73      1.00      0.85        22
    Standing       1.00      0.72      0.84        43
    Upstairs       0.56      0.45      0.50       238
     Walking       0.86      0.84      0.85       768

    accuracy                           0.81      1934
   macro avg       0.76      0.75      0.75      1934
weighted avg       0.82      0.81      0.81      1934



Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



## Save the Models

In [9]:
import joblib

# Tuned estimators keyed by their display name; the name also determines the
# file each estimator is written to.
trained_models = dict([
    ("Gradient Boosting Machine", best_gbm),
    ("Random Forest", best_rf),
    ("K-Nearest Neighbors", best_knn),
])

# Serialise every estimator to "<lower-cased name with underscores>.joblib"
for model_name in trained_models:
    model = trained_models[model_name]
    file_name = "_".join(model_name.split(' ')).lower() + ".joblib"
    joblib.dump(model, file_name)
    print(f"Saved {model_name} as {file_name}")

Saved Gradient Boosting Machine as gradient_boosting_machine.joblib
Saved Random Forest as random_forest.joblib
Saved K-Nearest Neighbors as k-nearest_neighbors.joblib


In [5]:
import joblib
# Persist the fitted StandardScaler so the exact same feature scaling can be
# re-applied to new data without refitting.
# NOTE(review): this cell's recorded execution count is out of sequence with the
# cells around it — confirm the notebook still runs cleanly top to bottom.
joblib.dump(scaler, "scaler.joblib")

['scaler.joblib']

In [6]:
# Persist the fitted LabelEncoder so integer predictions can later be mapped back
# to the original activity names with `inverse_transform`.
joblib.dump(label_encoder, "label_encoder.joblib")

['label_encoder.joblib']

## Predict the Output for the Kaggle Dataset

In [10]:
# Read the Kaggle feature table
testing_kaggle_df = pd.read_csv("Updated_Metadata_Kaggle_2.csv")

# Keep the snippet identifiers aside for the submission files, and use every
# remaining column as a model feature (only the ID column is removed here).
X_test_ID = testing_kaggle_df.loc[:, "user_snippet"]
X_test_kaggle = testing_kaggle_df.drop("user_snippet", axis=1)

# Apply the scaling already fitted on the training features (no refitting)
X_test_kaggle_scaled = scaler.transform(X_test_kaggle)

In [11]:
def export_predictions(model_path, features, ids, encoder, output_path):
    """Load a saved classifier, predict on `features` and write a submission CSV.

    Parameters
    ----------
    model_path : str
        Path of the ``.joblib`` file holding the fitted classifier.
    features : array-like
        Scaled feature matrix to predict on.
    ids : pandas.Series
        Identifier for each row of `features`; written as ``user_snippet``.
    encoder : LabelEncoder
        Fitted encoder used to turn integer class codes back into label names.
    output_path : str
        Destination CSV, written without the index and with the columns
        ``user_snippet`` and ``predicted``.

    Returns
    -------
    tuple
        ``(model, encoded_predictions, decoded_labels, submission_frame)``.
    """
    model = joblib.load(model_path)
    encoded = model.predict(features)  # integer class codes
    decoded = encoder.inverse_transform(encoded)  # original label names
    submission = pd.DataFrame({"user_snippet": ids, "predicted": decoded})
    submission.to_csv(output_path, index=False)
    return model, encoded, decoded, submission


# The three models go through the identical load -> predict -> decode -> export
# pipeline. The `y_pred_probs_*` names are kept for compatibility, but note they
# hold class codes returned by `predict`, not probabilities.

# For GBM
gbm, y_pred_probs_gbm, y_pred_original_gbm, prediction_gbm = export_predictions(
    "gradient_boosting_machine.joblib", X_test_kaggle_scaled, X_test_ID, label_encoder, "output_gbm.csv"
)

# For RF
rf, y_pred_probs_rf, y_pred_original_rf, prediction_rf = export_predictions(
    "random_forest.joblib", X_test_kaggle_scaled, X_test_ID, label_encoder, "output_rf.csv"
)

# For KNN
knn, y_pred_probs_knn, y_pred_original_knn, prediction_knn = export_predictions(
    "k-nearest_neighbors.joblib", X_test_kaggle_scaled, X_test_ID, label_encoder, "output_knn.csv"
)