In [6]:
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
from imblearn.combine import SMOTEENN
import numpy as np
import pandas as pd

seed = 42

# Loading dataset
data = pd.read_csv('har_train.csv')
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Label Encoding for y
labelen = LabelEncoder()
yencoded = labelen.fit_transform(y)

from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(X, yencoded, test_size=0.2, stratify=yencoded, random_state=seed)

# Applying SMOTEENN
from imblearn.combine import SMOTETomek

st = SMOTETomek(random_state=seed)
Xresamp, yresamp = st.fit_resample(Xtrain, ytrain)

from collections import Counter
print(f"New class distribution: {Counter(yresamp)}")

# Class weights for Random Forest
rfweights = {i: 1/count for i, count in Counter(yresamp).items()}

# Class weights for XGBoost
cw = compute_class_weight('balanced', classes=np.unique(yresamp), y=yresamp)
cwdict = dict(zip(np.unique(yresamp), cw))
sw = np.array([cwdict[label] for label in yresamp])

# Parameter grid for RF
rf = RandomForestClassifier(class_weight=rfweights, random_state=seed)
rfparameters = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['sqrt', 'log2']
}

"""  These are the best training parameter grid used
    'n_estimators': [100],
    'max_depth': [None],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': ['sqrt']"""

# Parameter grid for XGB (Best parameters)
xgb = XGBClassifier(random_state=seed, eval_metric='mlogloss')
xgbparameters = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

""" These are the best param grid for XGB
    'n_estimators': [200],
    'max_depth': [10],
    'learning_rate': [0.1],
    'subsample': [0.8],
    'colsample_bytree': [1]"""

# K-Fold Cross-Validation
nsplits = 5
kf = StratifiedKFold(n_splits=nsplits, shuffle=True, random_state=seed)

# Cross-validation for Random Forest
print("\nPerforming K-Fold cross-validation for Random Forest...")
rfgrid = GridSearchCV(rf, rfparameters, scoring='f1_macro', cv=kf, n_jobs=-1)
rfgrid.fit(Xresamp, yresamp)
rfbest = rfgrid.best_estimator_

# Best parameters and cross-validation score for Random Forest
print("Best Parameters for Random Forest:", rfgrid.best_params_)
print("Best Cross-Validation F1 Macro Score for Random Forest:", rfgrid.best_score_)

# Cross-validation for XGBoost
print("\nPerforming K-Fold cross-validation for XGBoost...")
xgbgrid = GridSearchCV(xgb, xgbparameters, scoring='f1_macro', cv=kf, n_jobs=-1)
xgbgrid.fit(Xresamp, yresamp, sample_weight=sw)
xgbbest = xgbgrid.best_estimator_

# Best parameters and cross-validation score for XGBoost
print("Best Parameters for XGBoost:", xgbgrid.best_params_)
print("Best Cross-Validation F1 Macro Score for XGBoost:", xgbgrid.best_score_)

# Stacking Model with RF and XGB
estimators = [
    ('rf', rfbest),
    ('xgb', xgbbest)
]
mmodel = LogisticRegression(random_state=seed)

finalclf = StackingClassifier(
    estimators=estimators,
    final_estimator=mmodel,
    cv=kf,
    n_jobs=-1
)

# Cross-validation
print("\nPerforming K-Fold cross-validation for Stacking Model...")
scores = cross_val_score(finalclf, Xresamp, yresamp, cv=kf, scoring='f1_macro', n_jobs=-1)
print("Stacking Model Cross-Validation F1 Macro Scores:", scores)
print("Average F1 Macro Score for Stacking Model:", np.mean(scores))

finalclf.fit(Xresamp, yresamp)

#Validation test
ypred = finalclf.predict(Xtest)

print("\nTest Set Evaluation:")
print("Test Report:\n", classification_report(ytest, ypred, target_names=labelen.classes_))
print("Confusion Matrix (Test):\n", confusion_matrix(ytest, ypred))

New class distribution: Counter({1: 557, 8: 557, 3: 557, 2: 557, 4: 557, 7: 554, 10: 554, 0: 552, 5: 551, 9: 547, 6: 544})

Performing K-Fold cross-validation for Random Forest...
Best Parameters for Random Forest: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Cross-Validation F1 Macro Score for Random Forest: 0.9550925416266495

Performing K-Fold cross-validation for XGBoost...
Best Parameters for XGBoost: {'colsample_bytree': 1, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200, 'subsample': 0.8}
Best Cross-Validation F1 Macro Score for XGBoost: 0.9677326905863056

Performing K-Fold cross-validation for Stacking Model...
Stacking Model Cross-Validation F1 Macro Scores: [0.97757153 0.96920463 0.96869913 0.96960537 0.97766717]
Average F1 Macro Score for Stacking Model: 0.9725495679460924

Test Set Evaluation:
Test Report:
                    precision    recall  f1-score   support

          Cycling       0

In [10]:
import joblib

# Saving the trained model
joblib.dump(finalclf, 'T2.joblib')

['T2.joblib']