In [1]:
import warnings

# Install a catch-all "ignore" filter so no warning is printed in later cells.
warnings.simplefilter("ignore")

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV

In [3]:
# Read the dataset from the CSV file and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='dataset2.csv')
df.head(n=5)

Unnamed: 0,activityID,heart_rate,hand temperature (°C),hand acceleration X ±16g,hand acceleration Y ±16g,hand acceleration Z ±16g,hand gyroscope X,hand gyroscope Y,hand gyroscope Z,hand magnetometer X,...,ankle acceleration X ±16g,ankle acceleration Y ±16g,ankle acceleration Z ±16g,ankle gyroscope X,ankle gyroscope Y,ankle gyroscope Z,ankle magnetometer X,ankle magnetometer Y,ankle magnetometer Z,PeopleId
0,transient activities,104.0,30.0,2.37223,8.60074,3.51048,-0.092217,0.056812,-0.015845,14.6806,...,9.65918,-1.65569,-0.099797,0.0083,0.00925,-0.01758,-61.1888,-38.9599,-58.1438,1
1,transient activities,104.0,30.0,2.18837,8.5656,3.66179,-0.024413,0.047759,0.006474,14.8991,...,9.6937,-1.57902,-0.215687,-0.006577,-0.004638,0.000368,-59.8479,-38.8919,-58.5253,1
2,transient activities,104.0,30.0,2.37357,8.60107,3.54898,-0.057976,0.032574,-0.006988,14.242,...,9.58944,-1.73276,0.092914,0.003014,0.000148,0.022495,-60.7361,-39.4138,-58.3999,1
3,transient activities,104.0,30.0,2.07473,8.52853,3.66021,-0.002352,0.03281,-0.003747,14.8908,...,9.58814,-1.7704,0.054545,0.003175,-0.020301,0.011275,-60.4091,-38.7635,-58.3956,1
4,transient activities,104.0,30.0,2.22936,8.83122,3.7,0.012269,0.018305,-0.053325,15.5612,...,9.69771,-1.65625,-0.060809,0.012698,-0.014303,-0.002823,-61.5199,-39.3879,-58.2694,1


In [4]:
# Report the (rows, columns) shape of the loaded frame.
print(f"Dimensions of the dataset: {df.shape}")

Dimensions of the dataset: (2864056, 33)


In [5]:
# Count the missing entries in every column.
df.isnull().sum(axis=0)

activityID                    0
heart_rate                   46
hand temperature (°C)         0
hand acceleration X ±16g      0
hand acceleration Y ±16g      0
hand acceleration Z ±16g      0
hand gyroscope X              0
hand gyroscope Y              0
hand gyroscope Z              0
hand magnetometer X           0
hand magnetometer Y           0
hand magnetometer Z           0
chest temperature (°C)        0
chest acceleration X ±16g     0
chest acceleration Y ±16g     0
chest acceleration Z ±16g     0
chest gyroscope X             0
chest gyroscope Y             0
chest gyroscope Z             0
chest magnetometer X          0
chest magnetometer Y          0
chest magnetometer Z          0
ankle temperature (°C)        0
ankle acceleration X ±16g     0
ankle acceleration Y ±16g     0
ankle acceleration Z ±16g     0
ankle gyroscope X             0
ankle gyroscope Y             0
ankle gyroscope Z             0
ankle magnetometer X          0
ankle magnetometer Y          0
ankle ma

In [6]:
# Share of missing entries per column, expressed as a percentage of all rows.
100 * df.isna().sum() / df.shape[0]

activityID                   0.000000
heart_rate                   0.001606
hand temperature (°C)        0.000000
hand acceleration X ±16g     0.000000
hand acceleration Y ±16g     0.000000
hand acceleration Z ±16g     0.000000
hand gyroscope X             0.000000
hand gyroscope Y             0.000000
hand gyroscope Z             0.000000
hand magnetometer X          0.000000
hand magnetometer Y          0.000000
hand magnetometer Z          0.000000
chest temperature (°C)       0.000000
chest acceleration X ±16g    0.000000
chest acceleration Y ±16g    0.000000
chest acceleration Z ±16g    0.000000
chest gyroscope X            0.000000
chest gyroscope Y            0.000000
chest gyroscope Z            0.000000
chest magnetometer X         0.000000
chest magnetometer Y         0.000000
chest magnetometer Z         0.000000
ankle temperature (°C)       0.000000
ankle acceleration X ±16g    0.000000
ankle acceleration Y ±16g    0.000000
ankle acceleration Z ±16g    0.000000
ankle gyrosc

In [7]:
# Impute the missing heart_rate readings with the column median
# (the median is used instead of the mean so outliers do not skew the fill value).
#
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: the chained in-place form
# is deprecated in pandas 2.x and does not modify `df` when Copy-on-Write is
# enabled.
heart_rate_median = df['heart_rate'].median()
df['heart_rate'] = df['heart_rate'].fillna(heart_rate_median)

# Re-check the per-column NA counts to confirm nothing is missing any more.
df.isna().sum()

activityID                   0
heart_rate                   0
hand temperature (°C)        0
hand acceleration X ±16g     0
hand acceleration Y ±16g     0
hand acceleration Z ±16g     0
hand gyroscope X             0
hand gyroscope Y             0
hand gyroscope Z             0
hand magnetometer X          0
hand magnetometer Y          0
hand magnetometer Z          0
chest temperature (°C)       0
chest acceleration X ±16g    0
chest acceleration Y ±16g    0
chest acceleration Z ±16g    0
chest gyroscope X            0
chest gyroscope Y            0
chest gyroscope Z            0
chest magnetometer X         0
chest magnetometer Y         0
chest magnetometer Z         0
ankle temperature (°C)       0
ankle acceleration X ±16g    0
ankle acceleration Y ±16g    0
ankle acceleration Z ±16g    0
ankle gyroscope X            0
ankle gyroscope Y            0
ankle gyroscope Z            0
ankle magnetometer X         0
ankle magnetometer Y         0
ankle magnetometer Z         0
PeopleId

In [8]:
# Print the frame summary: index range, column names and their dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2864056 entries, 0 to 2864055
Data columns (total 33 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   activityID                 object 
 1   heart_rate                 float64
 2   hand temperature (°C)      float64
 3   hand acceleration X ±16g   float64
 4   hand acceleration Y ±16g   float64
 5   hand acceleration Z ±16g   float64
 6   hand gyroscope X           float64
 7   hand gyroscope Y           float64
 8   hand gyroscope Z           float64
 9   hand magnetometer X        float64
 10  hand magnetometer Y        float64
 11  hand magnetometer Z        float64
 12  chest temperature (°C)     float64
 13  chest acceleration X ±16g  float64
 14  chest acceleration Y ±16g  float64
 15  chest acceleration Z ±16g  float64
 16  chest gyroscope X          float64
 17  chest gyroscope Y          float64
 18  chest gyroscope Z          float64
 19  chest magnetometer X       float64
 20  ch

In [10]:
# Number of rows per activity label, i.e. the class balance of the target.
df.groupby(by='activityID').size()

activityID
Nordic walking          188107
ascending stairs        117216
cycling                 164600
descending stairs       104944
ironing                 238690
lying                   192523
rope jumping             42969
running                  98199
sitting                 185188
standing                189931
transient activities    927575
vacuum cleaning         175353
walking                 238761
dtype: int64

In [11]:
# Make sure the target labels are plain strings
# (df.info() above already lists activityID as object dtype).
df['activityID'] = df['activityID'].astype(str)

In [12]:
# Drop the PeopleId column so it is not used as a predictor.
# Re-assigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: running it a second time no
# longer raises KeyError because the column is already gone.
df = df.drop(columns=['PeopleId'], errors='ignore')

In [13]:
# Separate the target label (activityID) from the predictor columns.
df_y = df['activityID']
df_X = df.drop('activityID', axis=1)

In [14]:
# encoding the target column for xgboost model because the model expects the target variable in this format: [0 1 2 3 4 5...]

# from sklearn.preprocessing import LabelEncoder

# label_encoder = LabelEncoder()
# df_y = label_encoder.fit_transform(df_y)

In [17]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
    df_X,
    df_y,
    test_size=0.20,
    random_state=42,
)

## Decision Tree Classifier with Cost Complexity Pruning

In [18]:
# Baseline decision tree: depth-limited, no cost-complexity pruning.
chosen_max_depth = 7

# Build the classifier and fit it on the training split
# (fit returns the estimator itself, so both steps are chained).
tree = DecisionTreeClassifier(
    random_state=42,
    max_depth=chosen_max_depth,
).fit(df_X_train, df_y_train)

# Predict the held-out rows and score them with the support-weighted F1.
y_pred_tree = tree.predict(df_X_test)
f1_tree = f1_score(y_true=df_y_test, y_pred=y_pred_tree, average='weighted')

# Report the baseline score.
print(f"Decision Tree without Pruning - F1-Score: {f1_tree:.2f}")

Decision Tree without Pruning - F1-Score: 0.63


In [31]:
## Cost-complexity pruning (CCP)

# Compute the effective alphas of the pruning path on the training split.
# No cross-validation happens here: the path is derived from the training
# data passed in.
tree = DecisionTreeClassifier(random_state=42)
path = tree.cost_complexity_pruning_path(df_X_train, df_y_train)
alphas = path.ccp_alphas

# Fit one pruned tree per alpha and keep the one with the highest weighted F1.
# NOTE(review): this is one full fit per alpha on the whole training split;
# consider evaluating only a subsample of `alphas` if the run time is too long.
# NOTE(review): the winner is selected on the test split, so `best_f1` is an
# optimistic estimate; a separate validation split would avoid that.
pruned_trees = []
best_f1 = -1.0  # below any possible F1, so the first tree always becomes the initial best
best_tree = None
for alpha in alphas:
    pruned_tree = DecisionTreeClassifier(ccp_alpha=alpha, random_state=42)
    pruned_tree.fit(df_X_train, df_y_train)
    pruned_trees.append(pruned_tree)

    y_pred = pruned_tree.predict(df_X_test)
    # The target has more than two classes, so the default average='binary'
    # raises ValueError; use the same weighted average as the unpruned tree.
    f1 = f1_score(df_y_test, y_pred, average='weighted')
    if f1 > best_f1:
        best_f1 = f1
        best_tree = pruned_tree

# Report the score of the best pruned tree
print(f"Best Pruned Tree - F1-Score: {best_f1:.2f}")

In [None]:
# Predictor names, in the column order of df_X.
features = list(df_X.columns)

# Importance scores reported by the best pruned tree.
feature_importance = best_tree.feature_importances_

# Pair each feature name with its importance score.
df_feature_importance = pd.DataFrame(
    {
        "feature": features,
        "importance": feature_importance,
    }
)

# Keep the three rows with the highest importance.
top_features_dt = (
    df_feature_importance
    .sort_values(by='importance', ascending=False)
    .iloc[:3]
)
top_features_dt

# plotting the barchart for feature importance
# plt.figure(figsize=(7,7))
# sns.barplot(df_feature_importance, 
#             x='importance', 
#             y='feature', 
#             color='lightblue', 
#             order=df_feature_importance.sort_values(by='importance', ascending=False).feature)
# plt.show()

## Random Forest Model with Tuned Hyperparameters

In [16]:
# random forest without grid search

from sklearn.ensemble import RandomForestClassifier

# Hand-picked hyperparameters for the baseline forest (no grid search).
chosen_n_estimators = 200
chosen_max_depth = 10

# Create the random forest with the specified hyperparameters.
# random_state makes the run reproducible (same seed as the decision trees);
# n_jobs=-1 builds the trees on all available cores.
forest = RandomForestClassifier(
    n_estimators=chosen_n_estimators,
    max_depth=chosen_max_depth,
    random_state=42,
    n_jobs=-1,
)

# Train the model on the training dataset
forest.fit(df_X_train, df_y_train)

# Predict on the test dataset
y_pred_rf = forest.predict(df_X_test)

# Print the evaluation metrics and confusion matrix for the test set
print("\nClassification Report:")
print(classification_report(df_y_test, y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(df_y_test, y_pred_rf))

# Print the chosen hyperparameters
print(f"Chosen n_estimators: {chosen_n_estimators}")
print(f"Chosen max_depth: {chosen_max_depth}")

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter values to explore
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400],
    'max_depth': [5, 10, 15, 20, None]
}

# Base estimator for the search; the fixed seed makes every candidate
# reproducible and n_jobs=-1 builds the trees on all available cores.
forest2 = RandomForestClassifier(random_state=42, n_jobs=-1)

# Grid search with 5-fold cross-validation.
# The target is multiclass, so the binary 'f1' scorer cannot score it;
# 'f1_weighted' matches the weighted F1 used for the decision tree.
grid_search = GridSearchCV(estimator=forest2, param_grid=param_grid, cv=5, scoring='f1_weighted')
grid_search.fit(df_X_train, df_y_train)

# Access the optimal hyperparameters
optimal_n_estimators = grid_search.best_params_['n_estimators']
optimal_max_depth = grid_search.best_params_['max_depth']

# GridSearchCV (refit=True by default) has already retrained the best
# candidate on the full training split, so reuse it instead of fitting again.
optimal_forest = grid_search.best_estimator_

# Evaluate the optimal model on the test dataset
y_pred_optimal_rf = optimal_forest.predict(df_X_test)

# Print the evaluation metrics and confusion matrix for the test set
print("\nClassification Report:")
print(classification_report(df_y_test, y_pred_optimal_rf))
print("Confusion Matrix:")
print(confusion_matrix(df_y_test, y_pred_optimal_rf))

# Print the optimal hyperparameters
print(f"Optimal n_estimators: {optimal_n_estimators}")
print(f"Optimal max_depth: {optimal_max_depth}")

In [None]:
# Predictor names, in the column order of df_X.
features = list(df_X.columns)

# Importance scores reported by the tuned random forest.
feature_importance = optimal_forest.feature_importances_

# Pair each feature name with its importance score.
df_feature_importance2 = pd.DataFrame(
    {
        "feature": features,
        "importance": feature_importance,
    }
)

# Optional bar chart of the importances (left disabled):
# plt.figure(figsize=(7,7))
# sns.barplot(df_feature_importance2, 
#             x='importance', 
#             y='feature', 
#             color='lightblue', 
#             order=df_feature_importance2.sort_values(by='importance', ascending=False).feature)
# plt.show()

# Keep the three rows with the highest importance.
top_features_rf = (
    df_feature_importance2
    .sort_values(by='importance', ascending=False)
    .iloc[:3]
)
top_features_rf

## XGBoost Model

In [15]:
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder

# Define the parameter grid for XGBoost
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# XGBoost expects the target as integer codes [0 1 2 3 ...], so encode the
# string activity labels. The encoder is fitted on the training labels only.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(df_y_train)

# Create an XGBoost classifier
xgb_classifier = xgb.XGBClassifier()

# Grid search with 10-fold cross-validation.
# The target is multiclass, so the binary 'f1' scorer cannot score it;
# 'f1_weighted' matches the weighted F1 used for the decision tree.
grid_search_xgb = GridSearchCV(xgb_classifier, param_grid_xgb, cv=10, scoring='f1_weighted')
grid_search_xgb.fit(df_X_train, y_train_encoded)

# Access the best hyperparameters
best_params_xgb = grid_search_xgb.best_params_

# GridSearchCV (refit=True by default) has already retrained the best
# candidate on the full training split, so reuse it instead of fitting again.
optimal_xgb = grid_search_xgb.best_estimator_

# Predict on the test dataset and map the integer codes back to the original
# activity names so they can be compared directly with df_y_test.
y_pred_xgb = label_encoder.inverse_transform(optimal_xgb.predict(df_X_test))

# Compare the performances with other models
print("Optimal hyperparameters for XGBoost: ", best_params_xgb)
print("Classification Report for XGBoost:")
print(classification_report(df_y_test, y_pred_xgb))

In [None]:
# Importance scores reported by the tuned XGBoost model.
feature_importance = optimal_xgb.feature_importances_

# Pair each feature name (the `features` list built in an earlier cell)
# with its importance score.
df_feature_importance_xgb = pd.DataFrame(
    {
        "feature": features,
        "importance": feature_importance,
    }
)

# Keep the three rows with the highest importance.
top_features_xgb = (
    df_feature_importance_xgb
    .sort_values(by='importance', ascending=False)
    .iloc[:3]
)
top_features_xgb

# Comparing Models

In [None]:
# Show the top-3 feature table of each model under its own heading,
# in the order decision tree, random forest, XGBoost.
for heading, top_table in (
    ("Top 3 features in Decision Tree:", top_features_dt),
    ("Top 3 features in Random Forest:", top_features_rf),
    ("Top 3 features in XGBoost:", top_features_xgb),
):
    print(heading)
    display(top_table)

In [None]:
# Compare the test-set F1 of the three models.
# The target is multiclass, so f1_score needs an explicit `average`; the
# default 'binary' raises ValueError. 'weighted' matches the averaging used
# for the baseline decision tree, and every score is rounded to two decimals
# so the three lines are directly comparable.

print("F1-score for Decision Tree:", round(best_f1, 2))
print("F1-score for Random Forest:", round(f1_score(df_y_test, y_pred_optimal_rf, average='weighted'), 2))
print("F1-score for XGBoost:", round(f1_score(df_y_test, y_pred_xgb, average='weighted'), 2))