In [1]:
#importing libraries required for the project
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.utils.class_weight import compute_class_weight
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans


In [2]:
data = pd.read_csv('D:/Project_hand_washing/OCDetect_Export/OCDetect_Export/OCDetect_03_recording_06_c076109d-651c-46c4-a745-5df8b383bec3.csv')

Reading and processing data

In [185]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2661186 entries, 0 to 2661185
Data columns (total 14 columns):
 #   Column       Dtype  
---  ------       -----  
 0   timestamp    float64
 1   datetime     object 
 2   acc x        float64
 3   acc y        float64
 4   acc z        float64
 5   gyro x       float64
 6   gyro y       float64
 7   gyro z       float64
 8   user yes/no  float64
 9   compulsive   float64
 10  urge         float64
 11  tense        float64
 12  ignore       int64  
 13  relabeled    int64  
dtypes: float64(11), int64(2), object(1)
memory usage: 284.2+ MB


In [3]:
label_counts = data['relabeled'].value_counts()
count_label_0 = label_counts.get(0, 0)  # Count of label 0, default to 0 if not found
count_label_1 = label_counts.get(1, 0)  # Count of label 1, default to 0 if not found
count_label_2 = label_counts.get(2, 0)  # Count of label 2, default to 0 if not found
if(count_label_0 & count_label_1 & count_label_2):
    print(count_label_0)
    print(count_label_1)
    print(count_label_2)

2626968
22812
11406


In [4]:
filtered_data = data[(data['ignore'] == 0)]

In [5]:
#new_data=data.drop(['timestamp','datetime','user yes/no','compulsive', 'urge', 'tense', 'ignore'],axis=1)
new_data=filtered_data.drop(['timestamp','datetime','user yes/no','compulsive', 'urge', 'tense', 'ignore'],axis=1)

In [6]:
features = new_data.drop(['relabeled'],axis=1)

In [7]:
# choosing relabeled as label value
label = new_data['relabeled']  # Create a 'label' column for majority voting

In [8]:
# Split data into train and test sets
train_data, test_data = train_test_split(new_data, test_size=0.4, random_state=42)
print(f"Number of train DataFrames: {len(train_data)}")
print(f"Number of test DataFrames: {len(test_data)}")

Number of train DataFrames: 1054463
Number of test DataFrames: 702976


In [9]:
data_a=train_data[train_data["relabeled"]==0][:6864]
data_b=train_data[train_data["relabeled"]==1][:6864]
data_c=train_data[train_data["relabeled"]==2][:6864]

In [10]:
ttrain_final = pd.concat([data_a,data_b,data_c],axis=0)

In [11]:
label_counts = test_data['relabeled'].value_counts()
count_label_0 = label_counts.get(0, 0)  # Count of label 0, default to 0 if not found
count_label_1 = label_counts.get(1, 0)  # Count of label 1, default to 0 if not found
count_label_2 = label_counts.get(2, 0)  # Count of label 2, default to 0 if not found
if(count_label_0 & count_label_1 & count_label_2):
    print(count_label_0)
    print(count_label_1)
    print(count_label_2)

689419
9015
4542


### Creating sliding windows and majority voting

In [12]:
# Define window size
window_size = 50
stride = 10
train_data_val = ttrain_final.values

num_windows = len(train_data_val) - window_size + 1
# Initialize lists to store windowed data and labels
train_windows = []
train_labels = []


In [13]:
for i in range(0, num_windows, stride):
    window = ttrain_final.iloc[i:i+window_size, :-1]   # Select only sensor axes columns
    label_window = ttrain_final.iloc[i:i+window_size, -1]  # Select the label column for majority voting
    majority_label = np.bincount(label_window).argmax()  # Majority voting
    train_windows.append(window)  # Convert window to numpy array
    train_labels.append(majority_label)

In [14]:

train_windows = np.array(train_windows)
train_labels = np.array(train_labels)
# Flatten the sensor axes data in each window
num_trainsamp, num_fea_trainsamp = train_windows.shape[0], np.prod(train_windows.shape[1:])
train_windows_reshaped = train_windows.reshape(num_trainsamp, num_fea_trainsamp)
print(train_windows_reshaped)
print(len(np.unique(train_labels)))


[[ 3.8344264e+00 -8.2652130e+00  4.6175980e+00 ...  4.0317092e-02
   1.7104220e-02  3.6651902e-03]
 [ 3.0320950e+00 -6.4665530e-01  8.6053060e+00 ...  7.2448593e-01
  -1.5638144e-01 -1.9058989e-01]
 [ 1.9160157e+00 -5.9420440e+00 -7.4030056e+00 ...  2.6878060e-02
  -2.4434600e-03  3.6651902e-03]
 ...
 [-1.5974782e+00 -9.1753210e+00 -5.1516870e+00 ...  7.3303800e-01
   4.9480066e-01 -1.1850781e-01]
 [ 1.2406201e+01  1.8441651e+00  1.1591895e+00 ...  4.7036606e-01
  -9.7371880e-01 -4.4959664e-01]
 [-5.2019825e+00 -3.8990920e+00  3.2213013e+00 ...  1.2168431e+00
   1.9058989e-01  3.8728842e-01]]
3


In [15]:
print(train_windows_reshaped.shape)
print(train_labels.shape)

(2055, 300)
(2055,)


In [16]:
import random


In [17]:
# Generate windows and labels using a sliding window approach
window = []
label_window = []
test_windows = []
test_labels = []
for i in range(0,len(test_data)-window_size+1,stride):
    window = test_data.iloc[i:i+window_size, :-1]  # Select only sensor axes columns
    test_windows.append(window)
    label_window = test_data.iloc[i:i+window_size,-1]  # Select the label column for majority voting
    majority_label = random.choice([0, 1, 2])  # Majority voting
    test_labels.append(majority_label)

In [18]:
test_windows = np.array(test_windows)
test_labels = np.array(test_labels)
# Flatten the sensor axes data in each window
num_trainsamp, num_fea_trainsamp = test_windows.shape[0], np.prod(test_windows.shape[1:])
test_windows_reshaped = test_windows.reshape(num_trainsamp, num_fea_trainsamp)
print(test_windows_reshaped)
print(len(np.unique(test_labels)))

[[ 8.1789920e+00 -6.1360400e+00  5.9923390e+00 ...  1.7104220e-01
  -1.4795151e+00 -1.1777477e+00]
 [ 2.1794678e-01 -1.3206139e+01 -5.4055595e+00 ...  8.1855910e-02
  -4.2760550e-02  5.2534390e-02]
 [ 7.3718705e+00  4.6175980e+00 -1.0777588e-01 ...  6.1086502e-02
   7.8312890e-01 -2.3945908e-01]
 ...
 [-1.2286451e+00 -6.9168167e+00 -7.7143583e+00 ...  2.1991141e-02
   1.9535463e+00  9.8104920e-01]
 [-6.3491970e+00 -6.0546100e+00  4.6439430e+00 ... -1.2217300e-03
  -1.2217300e-02  8.5521100e-03]
 [-6.3348270e+00 -6.6797100e+00  3.8344264e+00 ...  9.9693170e-01
   1.3341292e+00 -2.8099790e-02]]
3


In [19]:
print(train_windows_reshaped.shape)
print(train_labels.shape)

(2055, 300)
(2055,)


Random Forest Classifier

In [20]:
clf = RandomForestClassifier(n_estimators=10,max_depth=9, random_state=6, class_weight='balanced',min_samples_split=12)
clf.fit(train_windows_reshaped, train_labels)

In [21]:
# Make predictions on the test set with SVM
y_pred = clf.predict(test_windows_reshaped)

# Evaluate the model
accuracy = accuracy_score(test_labels, y_pred)
print(accuracy)

report = classification_report(test_labels, y_pred)
# Print the report
print(report)

0.3318111333987737
              precision    recall  f1-score   support

           0       0.33      0.75      0.46     23360
           1       0.34      0.10      0.16     23460
           2       0.33      0.14      0.20     23473

    accuracy                           0.33     70293
   macro avg       0.33      0.33      0.27     70293
weighted avg       0.33      0.33      0.27     70293



## Naives Bayes

In [22]:
# Create and train a Gaussian Naive Bayes classifier
# Calculate class weights
class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
class_weights /= class_weights.sum()
# Create a dictionary to map class labels to their corresponding weights
class_weight_dict = dict(zip(np.unique(test_labels), class_weights))
NBclf = GaussianNB(priors=class_weights)
NBclf.fit(train_windows_reshaped, train_labels)

# Cross-validation
cv_scores = cross_val_score(NBclf, train_windows_reshaped, train_labels, cv=3)
print("Cross-Validation Scores:", cv_scores)

Cross-Validation Scores: [0.67445255 0.6350365  0.6379562 ]


In [23]:
# Evaluate the model
y_pred = NBclf.predict(test_windows_reshaped)
accuracy = accuracy_score(test_labels, y_pred)
print(f'Test Accuracy: {accuracy:.4f}')
report = classification_report(test_labels, y_pred)
# Print the report
print(report)

Test Accuracy: 0.3323
              precision    recall  f1-score   support

           0       0.33      0.92      0.49     23360
           1       0.33      0.04      0.07     23460
           2       0.32      0.05      0.08     23473

    accuracy                           0.33     70293
   macro avg       0.33      0.33      0.21     70293
weighted avg       0.33      0.33      0.21     70293



## Logistic Regression

In [24]:
lclf = LogisticRegression(random_state=42)
lclf.fit(train_windows_reshaped, train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
# Make predictions on the test set with logistic regression
y_pred = lclf.predict(test_windows_reshaped)

# Evaluate the model
accuracy = accuracy_score(test_labels, y_pred)
print(accuracy)

report = classification_report(test_labels, y_pred)
# Print the report
print(report)

0.3338739277026162
              precision    recall  f1-score   support

           0       0.33      0.78      0.47     23360
           1       0.35      0.08      0.13     23460
           2       0.33      0.14      0.20     23473

    accuracy                           0.33     70293
   macro avg       0.34      0.33      0.27     70293
weighted avg       0.34      0.33      0.27     70293



In [26]:
import xgboost as xgb
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(train_windows_reshaped, train_labels)

# Predictions
xgb_predictions = xgb_classifier.predict(test_windows_reshaped)
# Evaluate the model
accuracy = accuracy_score(test_labels, y_pred)
print(accuracy)

report = classification_report(test_labels, y_pred)
# Print the report
print(report)

0.3338739277026162
              precision    recall  f1-score   support

           0       0.33      0.78      0.47     23360
           1       0.35      0.08      0.13     23460
           2       0.33      0.14      0.20     23473

    accuracy                           0.33     70293
   macro avg       0.34      0.33      0.27     70293
weighted avg       0.34      0.33      0.27     70293

