## Data Cleaning and Preprocessing

In [9]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib

# Default canvas size (width, height in inches) applied to every figure
# created after this cell runs.
default_figsize = (20, 10)
matplotlib.rcParams["figure.figsize"] = default_figsize


In [10]:
# Load the prepared feature table (one row per record, with accelerometer/GPS
# summary statistics and an activity label — see the preview below).
csv_path = "sensoringData_feature_prepared_20_19.0_0_split_1.csv"
df = pd.read_csv(csv_path)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,id,user,timestamp,acc_xs_mean,acc_ys_mean,acc_zs_mean,acc_xs_var,acc_ys_var,acc_zs_var,acc_xs_mad,...,gps_bearing_min,gps_accuracy_min,gps_lat_iqr,gps_long_iqr,gps_alt_iqr,gps_speed_iqr,gps_bearing_iqr,gps_accuracy_iqr,activity_id,activity
0,5372713,11,1570541000.0,0.041235,0.074212,-0.00134,1.35048,1.55056,4.986602,0.590246,...,223.478317,6.0,0.000169,0.000128,3.877194,0.181761,13.691864,0.0,-497,Walking
1,5372818,11,1570541000.0,0.067678,0.075321,-0.005949,1.320684,1.509417,5.022726,0.590798,...,223.478317,6.0,0.000169,0.000128,3.877194,0.181761,13.691864,0.0,-497,Walking
2,5372924,11,1570541000.0,0.074119,0.107075,-0.02008,1.252855,1.481293,4.945844,0.575731,...,223.478317,6.0,0.000169,0.000128,3.877194,0.181761,13.691864,0.0,-497,Walking
3,5373030,11,1570541000.0,0.08147,0.088129,-0.017702,1.193755,1.453578,5.022796,0.573366,...,223.478317,6.0,9.6e-05,8e-05,1.938597,0.174963,6.943611,0.0,-497,Walking
4,5373135,11,1570541000.0,0.08144,0.082775,-0.041844,1.235434,1.469258,5.054491,0.592963,...,223.478317,6.0,2.4e-05,3.2e-05,1.057135,0.168166,0.195358,0.0,-497,Walking


In [3]:
# (number of rows, number of columns) of the raw feature table.
df.shape

(60733, 59)

In [4]:
# Sanity check: show any rows whose maximum GPS latitude statistic exceeds 0.2.
lat_max_above_limit = df["gps_lat_max"] > 0.2
df.loc[lat_max_above_limit].head()

Unnamed: 0,id,user,timestamp,acc_xs_mean,acc_ys_mean,acc_zs_mean,acc_xs_var,acc_ys_var,acc_zs_var,acc_xs_mad,...,gps_bearing_min,gps_accuracy_min,gps_lat_iqr,gps_long_iqr,gps_alt_iqr,gps_speed_iqr,gps_bearing_iqr,gps_accuracy_iqr,activity_id,activity


In [5]:
# List every column name so the feature groups (acc_*, gps_*) and the label
# columns can be identified.
df.columns


Index(['id', 'user', 'timestamp', 'acc_xs_mean', 'acc_ys_mean', 'acc_zs_mean',
       'acc_xs_var', 'acc_ys_var', 'acc_zs_var', 'acc_xs_mad', 'acc_ys_mad',
       'acc_zs_mad', 'acc_xs_max', 'acc_ys_max', 'acc_zs_max', 'acc_xs_min',
       'acc_ys_min', 'acc_zs_min', 'acc_xs_iqr', 'acc_ys_iqr', 'acc_zs_iqr',
       'gps_lat_mean', 'gps_long_mean', 'gps_alt_mean', 'gps_speed_mean',
       'gps_bearing_mean', 'gps_accuracy_mean', 'gps_lat_var', 'gps_long_var',
       'gps_alt_var', 'gps_speed_var', 'gps_bearing_var', 'gps_accuracy_var',
       'gps_lat_mad', 'gps_long_mad', 'gps_alt_mad', 'gps_speed_mad',
       'gps_bearing_mad', 'gps_accuracy_mad', 'gps_lat_max', 'gps_long_max',
       'gps_alt_max', 'gps_speed_max', 'gps_bearing_max', 'gps_accuracy_max',
       'gps_lat_min', 'gps_long_min', 'gps_alt_min', 'gps_speed_min',
       'gps_bearing_min', 'gps_accuracy_min', 'gps_lat_iqr', 'gps_long_iqr',
       'gps_alt_iqr', 'gps_speed_iqr', 'gps_bearing_iqr', 'gps_accuracy_iqr',
       'a

In [11]:
# Upper bounds on the absolute value of the mean GPS features.
lat_threshold = 0.2
long_threshold = 0.2
alt_threshold = 500

# Build one boolean mask per GPS axis; a row is kept only when all three
# absolute means are within their threshold (bounds are inclusive).
lat_in_range = df['gps_lat_mean'].abs() <= lat_threshold
long_in_range = df['gps_long_mean'].abs() <= long_threshold
alt_in_range = df['gps_alt_mean'].abs() <= alt_threshold

# Filter out rows where GPS data exceeds the thresholds
df1 = df[lat_in_range & long_in_range & alt_in_range]

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Assuming df1 is already loaded and cleaned

# Step 1: Prepare the feature matrix (X) and target vector (y)
# Identifier/bookkeeping columns ('id', 'user', 'timestamp') and both label
# columns ('activity_id', 'activity') are removed so they cannot be used as features.
X = df1.drop(columns=['id', 'activity_id', 'user', 'timestamp', 'activity'])  # Features
y = df1['activity']  # Target labels ('Active', 'Driving', 'Inactive', 'Walking')

# Check if X has valid data
print("Shape of X:", X.shape)  # Shape of the feature matrix before removing incomplete rows

# Handle missing data: remove incomplete rows from X AND the same rows from y.
# Dropping them from X alone would leave y longer than X, and train_test_split
# raises when its inputs have different numbers of samples.
complete_rows = X.notna().all(axis=1)
X = X.loc[complete_rows]
y = y.loc[complete_rows]

# If X is empty here, no row survived the missing-value filter
if X.shape[0] == 0:
    raise ValueError("No valid rows left in the feature set after handling missing data.")

# Step 2: Split the data into training and test sets
# NOTE(review): consecutive rows in df.head() share a user and near-identical GPS
# statistics, so a random split may place near-duplicate rows in both sets —
# consider a grouped split (e.g. by user). TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Scale the features (important for SVM); statistics come from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Train the SVM model with an RBF kernel
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
svm_model.fit(X_train_scaled, y_train)

# Step 5: Make predictions on the test set
y_pred = svm_model.predict(X_test_scaled)

# Step 6: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

Shape of X: (60733, 54)
Accuracy: 0.9697044537745946

Classification Report:
               precision    recall  f1-score   support

      Active       0.95      0.99      0.97      4419
     Driving       0.99      0.98      0.99      4327
    Inactive       0.93      0.97      0.95      1046
     Walking       0.98      0.92      0.95      2355

    accuracy                           0.97     12147
   macro avg       0.96      0.96      0.96     12147
weighted avg       0.97      0.97      0.97     12147



In [22]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report


# Step 1: Prepare the feature matrix (X) and target vector (y)
# Identifier/bookkeeping columns and both label columns are removed from the features.
X = df1.drop(columns=['id', 'activity_id', 'user', 'timestamp', 'activity'])  # Features
y = df1['activity']  # Target labels ('Active', 'Driving', 'Inactive', 'Walking')

# Check if X has valid data
print("Shape of X:", X.shape)  # Shape of the feature matrix before removing incomplete rows

# Handle missing data: remove incomplete rows from X AND the same rows from y.
# Dropping them from X alone would leave y longer than X, and train_test_split
# raises when its inputs have different numbers of samples.
complete_rows = X.notna().all(axis=1)
X = X.loc[complete_rows]
y = y.loc[complete_rows]

# If X is empty here, no row survived the missing-value filter
if X.shape[0] == 0:
    raise ValueError("No valid rows left in the feature set after handling missing data.")

# Step 2: Split the data into training and test sets
# NOTE(review): a random split may place near-duplicate rows from the same user in
# both sets; consider a grouped split (e.g. by user). TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Scale the features (important for models that are sensitive to scale like SVM, but optional for Decision Trees)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Train the Decision Tree model with a maximum depth of 5
dt_model = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_model.fit(X_train_scaled, y_train)

# Step 5: Make predictions on the test set
y_pred = dt_model.predict(X_test_scaled)

# Step 6: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

Shape of X: (60733, 54)
Accuracy: 0.9337284926319256

Classification Report:
               precision    recall  f1-score   support

      Active       0.90      0.96      0.93      4419
     Driving       0.96      0.97      0.97      4327
    Inactive       1.00      0.86      0.93      1046
     Walking       0.93      0.85      0.89      2355

    accuracy                           0.93     12147
   macro avg       0.95      0.91      0.93     12147
weighted avg       0.94      0.93      0.93     12147



In [23]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Assuming df1 is already loaded and cleaned

# Step 1: Prepare the feature matrix (X) and target vector (y)
# Identifier/bookkeeping columns and both label columns are removed from the features.
X = df1.drop(columns=['id', 'activity_id', 'user', 'timestamp', 'activity'])  # Features
y = df1['activity']  # Target labels ('Active', 'Driving', 'Inactive', 'Walking')

# Check if X has valid data
print("Shape of X:", X.shape)  # Shape of the feature matrix before removing incomplete rows

# Handle missing data: remove incomplete rows from X AND the same rows from y.
# Dropping them from X alone would leave y longer than X, and train_test_split
# raises when its inputs have different numbers of samples.
complete_rows = X.notna().all(axis=1)
X = X.loc[complete_rows]
y = y.loc[complete_rows]

# If X is empty here, no row survived the missing-value filter
if X.shape[0] == 0:
    raise ValueError("No valid rows left in the feature set after handling missing data.")

# Step 2: Split the data into training and test sets
# NOTE(review): a random split may place near-duplicate rows from the same user in
# both sets, which would inflate the score; consider a grouped split. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Scale the features (important for neural networks); statistics come from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Train the MLP model with a single hidden layer of 100 units
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
mlp_model.fit(X_train_scaled, y_train)

# Step 5: Make predictions on the test set
y_pred = mlp_model.predict(X_test_scaled)

# Step 6: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

Shape of X: (60733, 54)
Accuracy: 0.9984358277764057

Classification Report:
               precision    recall  f1-score   support

      Active       1.00      1.00      1.00      4419
     Driving       1.00      1.00      1.00      4327
    Inactive       1.00      1.00      1.00      1046
     Walking       0.99      1.00      1.00      2355

    accuracy                           1.00     12147
   macro avg       1.00      1.00      1.00     12147
weighted avg       1.00      1.00      1.00     12147



In [24]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Assuming df1 is already loaded and cleaned

# Step 1: Prepare the feature matrix (X) and target vector (y)
# Identifier/bookkeeping columns and both label columns are removed from the features.
X = df1.drop(columns=['id', 'activity_id', 'user', 'timestamp', 'activity'])  # Features
y = df1['activity']  # Target labels ('Active', 'Driving', 'Inactive', 'Walking')

# Check if X has valid data
print("Shape of X:", X.shape)  # Shape of the feature matrix before removing incomplete rows

# Handle missing data: remove incomplete rows from X AND the same rows from y.
# Dropping them from X alone would leave y longer than X, and train_test_split
# raises when its inputs have different numbers of samples.
complete_rows = X.notna().all(axis=1)
X = X.loc[complete_rows]
y = y.loc[complete_rows]

# If X is empty here, no row survived the missing-value filter
if X.shape[0] == 0:
    raise ValueError("No valid rows left in the feature set after handling missing data.")

# Step 2: Split the data into training and test sets
# NOTE(review): a random split may place near-duplicate rows from the same user in
# both sets; consider a grouped split (e.g. by user). TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Standardise the features using statistics from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Train the Naive Bayes model (GaussianNB)
nb_model = GaussianNB()
nb_model.fit(X_train_scaled, y_train)

# Step 5: Make predictions on the test set
y_pred = nb_model.predict(X_test_scaled)

# Step 6: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)


Shape of X: (60733, 54)
Accuracy: 0.7587881781509838

Classification Report:
               precision    recall  f1-score   support

      Active       0.86      0.66      0.74      4419
     Driving       0.97      0.84      0.90      4327
    Inactive       0.31      0.93      0.46      1046
     Walking       0.91      0.72      0.81      2355

    accuracy                           0.76     12147
   macro avg       0.76      0.79      0.73     12147
weighted avg       0.86      0.76      0.79     12147



In [25]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Assuming df1 is already loaded and cleaned

# Step 1: Prepare the feature matrix (X) and target vector (y)
# Identifier/bookkeeping columns and both label columns are removed from the features.
X = df1.drop(columns=['id', 'activity_id', 'user', 'timestamp', 'activity'])  # Features
y = df1['activity']  # Target labels ('Active', 'Driving', 'Inactive', 'Walking')

# Check if X has valid data
print("Shape of X:", X.shape)  # Shape of the feature matrix before removing incomplete rows

# Handle missing data: remove incomplete rows from X AND the same rows from y.
# Dropping them from X alone would leave y longer than X, and train_test_split
# raises when its inputs have different numbers of samples.
complete_rows = X.notna().all(axis=1)
X = X.loc[complete_rows]
y = y.loc[complete_rows]

# If X is empty here, no row survived the missing-value filter
if X.shape[0] == 0:
    raise ValueError("No valid rows left in the feature set after handling missing data.")

# Step 2: Split the data into training and test sets
# NOTE(review): a random split may place near-duplicate rows from the same user in
# both sets, which a nearest-neighbour model can exploit; consider a grouped split. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Scale the features (KNN is sensitive to feature scaling); statistics come from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Train the KNN model with 34 neighbors
knn_model = KNeighborsClassifier(n_neighbors=34)
knn_model.fit(X_train_scaled, y_train)

# Step 5: Make predictions on the test set
y_pred = knn_model.predict(X_test_scaled)

# Step 6: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)


Shape of X: (60733, 54)
Accuracy: 0.9781015888696798

Classification Report:
               precision    recall  f1-score   support

      Active       0.97      0.99      0.98      4419
     Driving       0.98      0.98      0.98      4327
    Inactive       0.99      0.97      0.98      1046
     Walking       0.97      0.95      0.96      2355

    accuracy                           0.98     12147
   macro avg       0.98      0.97      0.98     12147
weighted avg       0.98      0.98      0.98     12147



In [26]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Assuming df1 is already loaded and cleaned

# Step 1: Prepare the feature matrix (X) and target vector (y)
# Identifier/bookkeeping columns and both label columns are removed from the features.
X = df1.drop(columns=['id', 'activity_id', 'user', 'timestamp', 'activity'])  # Features
y = df1['activity']  # Target labels ('Active', 'Driving', 'Inactive', 'Walking')

# Check if X has valid data
print("Shape of X:", X.shape)  # Shape of the feature matrix before removing incomplete rows

# Handle missing data: remove incomplete rows from X AND the same rows from y.
# Dropping them from X alone would leave y longer than X, and train_test_split
# raises when its inputs have different numbers of samples.
complete_rows = X.notna().all(axis=1)
X = X.loc[complete_rows]
y = y.loc[complete_rows]

# If X is empty here, no row survived the missing-value filter
if X.shape[0] == 0:
    raise ValueError("No valid rows left in the feature set after handling missing data.")

# Step 2: Split the data into training and test sets
# NOTE(review): a random split may place near-duplicate rows from the same user in
# both sets, which would inflate the score; consider a grouped split. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Standardise the features using statistics from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Train the Random Forest model with 1000 trees (estimators)
rf_model = RandomForestClassifier(n_estimators=1000, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Step 5: Make predictions on the test set
y_pred = rf_model.predict(X_test_scaled)

# Step 6: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)


Shape of X: (60733, 54)
Accuracy: 0.9997530254383798

Classification Report:
               precision    recall  f1-score   support

      Active       1.00      1.00      1.00      4419
     Driving       1.00      1.00      1.00      4327
    Inactive       1.00      1.00      1.00      1046
     Walking       1.00      1.00      1.00      2355

    accuracy                           1.00     12147
   macro avg       1.00      1.00      1.00     12147
weighted avg       1.00      1.00      1.00     12147



In [29]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Assuming df1 is already loaded and cleaned

# Step 1: Prepare the feature matrix (X) and target vector (y)
# Identifier/bookkeeping columns and both label columns are removed from the features.
X = df1.drop(columns=['id', 'activity_id', 'user', 'timestamp', 'activity'])  # Features
y = df1['activity']  # Target variable ('Walking', 'Inactive', 'Active', 'Driving')

# Check if X has valid data
print("Shape of X:", X.shape)  # Shape of the feature matrix before removing incomplete rows

# Handle missing data: remove incomplete rows from X AND the same rows from y.
# This must happen before label encoding; otherwise y_encoded would still hold
# one entry per original row and train_test_split would raise on the length mismatch.
complete_rows = X.notna().all(axis=1)
X = X.loc[complete_rows]
y = y.loc[complete_rows]

# If X is empty here, no row survived the missing-value filter
if X.shape[0] == 0:
    raise ValueError("No valid rows left in the feature set after handling missing data.")

# Step 2: Label encode the target variable (y) into numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Step 3: Split the data into training and test sets
# NOTE(review): a random split may place near-duplicate rows from the same user in
# both sets, which would inflate the score; consider a grouped split. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Step 4: Standardise the features using statistics from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Train one XGBoost model per (max_depth, min_child_weight) combination:
# max_depth in {5, 8} crossed with min_child_weight in {1, 3}.
xgb_param_grid = [(5, 1), (5, 3), (8, 1), (8, 3)]
models = []
for depth, child_weight in xgb_param_grid:
    fitted = XGBClassifier(n_estimators=1200, max_depth=depth, min_child_weight=child_weight, random_state=42)
    fitted.fit(X_train_scaled, y_train)
    models.append(fitted)

# Keep the individual model names available for any later cells that use them.
xgb_model_5_1, xgb_model_5_3, xgb_model_8_1, xgb_model_8_3 = models

# Step 6: Evaluate each model
for model in models:
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    # target_names maps the integer codes back to the original activity labels
    report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
    print(f"Model: {model} Accuracy: {accuracy}\nClassification Report:\n{report}\n")


Shape of X: (60733, 54)
Model: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=1200, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...) Accuracy: 0.9999176751461266
Classification Report:
              precision    recall  f1-score   support

      Active       1.00      1.00      1.00      4419
     Driving       1.00      1.00      1.00    