In [1]:
#importing libraries required for the project
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.utils.class_weight import compute_class_weight
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from datetime import datetime

Reading and processing data

In [2]:
data = pd.read_csv('./OCDetect_Export/OCDetect_Export/OCDetect_03_recording_06_c076109d-651c-46c4-a745-5df8b383bec3.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2661186 entries, 0 to 2661185
Data columns (total 14 columns):
 #   Column       Dtype  
---  ------       -----  
 0   timestamp    float64
 1   datetime     object 
 2   acc x        float64
 3   acc y        float64
 4   acc z        float64
 5   gyro x       float64
 6   gyro y       float64
 7   gyro z       float64
 8   user yes/no  float64
 9   compulsive   float64
 10  urge         float64
 11  tense        float64
 12  ignore       int64  
 13  relabeled    int64  
dtypes: float64(11), int64(2), object(1)
memory usage: 284.2+ MB


In [3]:
label_counts = data['relabeled'].value_counts()
count_label_0 = label_counts.get(0, 0)  # Count of label 0, default to 0 if not found
count_label_1 = label_counts.get(1, 0)  # Count of label 1, default to 0 if not found
count_label_2 = label_counts.get(2, 0)  # Count of label 2, default to 0 if not found
if(count_label_0 & count_label_1 & count_label_2):
    print(count_label_0)
    print(count_label_1)
    print(count_label_2)

2626968
22812
11406


In [4]:
filtered_data = data[(data['ignore'] == 0)]

In [61]:
# Check for duplicate rows
duplicate_rows = filtered_data[filtered_data.duplicated()]

# Print or inspect duplicate rows
print(duplicate_rows)

Empty DataFrame
Columns: [timestamp, datetime, acc x, acc y, acc z, gyro x, gyro y, gyro z, user yes/no, compulsive, urge, tense, ignore, relabeled]
Index: []


In [5]:
# Convert 'timestamp_column' to datetime if it's not already
#filtered_data['datetime'] = pd.to_datetime(your_data['timestamp_column'], errors='coerce')

# Find the start indices where 'relabeled' changes to 2
start_indices = filtered_data.index[(filtered_data['relabeled'] == 2) & (filtered_data['relabeled'].shift(1).isin([1, 0]))]

# Find the end indices where 'relabeled' is 2
end_indices = filtered_data.index[(filtered_data['relabeled'] == 2) & (~filtered_data['relabeled'].shift(-1).isin([2]))]

# Create intervals based on the start and end indices
intervals = []
for start_idx, end_idx in zip(start_indices, end_indices):
    interval_data = filtered_data.loc[start_idx:end_idx]
    
    # Calculate the duration of occurrences where 'relabeled' == 2 in each interval
    duration_seconds = (interval_data['timestamp'].max() - interval_data['timestamp'].min())
    
    intervals.append({'start_time': interval_data['timestamp'].min(), 'end_time': interval_data['timestamp'].max(), 'duration_seconds': duration_seconds})

# Convert the list of intervals to a DataFrame
intervals_df = pd.DataFrame(intervals) / 10**9
print(intervals_df)
#print(intervals_df)


   start_time  end_time  duration_seconds
0      396.08    434.08              38.0
1     4781.14   4819.14              38.0
2     7948.52   7986.52              38.0
3    35680.76  35718.76              38.0
4    46789.64  46827.64              38.0
5    48409.62  48447.62              38.0


In [67]:
# Convert 'timestamp_column' to datetime if it's not already
#filtered_data['datetime'] = pd.to_datetime(your_data['timestamp_column'], errors='coerce')

# Find the start indices where 'relabeled' changes to 2
start_indices = filtered_data.index[(filtered_data['relabeled'] == 1) & (filtered_data['relabeled'].shift(1).isin([2, 0]))]

# Find the end indices where 'relabeled' is 2
end_indices = filtered_data.index[(filtered_data['relabeled'] == 1) & (~filtered_data['relabeled'].shift(-1).isin([1]))]

# Create intervals based on the start and end indices
intervals = []
for start_idx, end_idx in zip(start_indices, end_indices):
    interval_data = filtered_data.loc[start_idx:end_idx]
    
    # Calculate the duration of occurrences where 'relabeled' == 2 in each interval
    duration_seconds = (interval_data['timestamp'].max() - interval_data['timestamp'].min())
    
    intervals.append({'start_time': interval_data['timestamp'].min(), 'end_time': interval_data['timestamp'].max(), 'duration_seconds': duration_seconds})

# Convert the list of intervals to a DataFrame
intervals_df = pd.DataFrame(intervals) / 10**9
print(intervals_df)
#print(intervals_df)


    start_time  end_time  duration_seconds
0        36.44     74.44             38.00
1      9028.46   9066.46             38.00
2     14724.60  14762.60             38.00
3     18458.00  18496.00             38.00
4     32503.94  32541.94             38.00
5     32850.02  32884.74             34.72
6     36539.90  36577.90             38.00
7     39896.54  39934.54             38.00
8     44751.48  44789.48             38.00
9     48679.26  48717.26             38.00
10    49247.18  49285.18             38.00
11    53063.36  53101.36             38.00


In [68]:
# Convert 'timestamp_column' to datetime if it's not already
#filtered_data['datetime'] = pd.to_datetime(your_data['timestamp_column'], errors='coerce')

# Find the start indices where 'relabeled' changes to 2
start_indices = filtered_data.index[(filtered_data['relabeled'] == 0) & (filtered_data['relabeled'].shift(1).isin([2, 0]))]

# Find the end indices where 'relabeled' is 2
end_indices = filtered_data.index[(filtered_data['relabeled'] == 0) & (~filtered_data['relabeled'].shift(-1).isin([0]))]

# Create intervals based on the start and end indices
intervals = []
for start_idx, end_idx in zip(start_indices, end_indices):
    interval_data = filtered_data.loc[start_idx:end_idx]
    
    # Calculate the duration of occurrences where 'relabeled' == 2 in each interval
    duration_seconds = (interval_data['timestamp'].max() - interval_data['timestamp'].min())
    
    intervals.append({'start_time': interval_data['timestamp'].min(), 'end_time': interval_data['timestamp'].max(), 'duration_seconds': duration_seconds})

# Convert the list of intervals to a DataFrame
intervals_df = pd.DataFrame(intervals) / 10**9
print(intervals_df)
#print(intervals_df)


    start_time  end_time  duration_seconds
0         0.02     36.42             36.40
1         0.04    396.06            396.02
2         0.06   4781.12           4781.06
3         0.08   7948.50           7948.42
4         0.10   9028.44           9028.34
5         0.12  14724.58          14724.46
6         0.14  18457.98          18457.84
7         0.16  32503.92          32503.76
8         0.18  32839.98          32839.80
9         0.20  35680.74          35680.54
10        0.22  36539.88          36539.66
11        0.24  39896.52          39896.28
12        0.26  44751.46          44751.20
13        0.28  46789.62          46789.34
14        0.30  48409.60          48409.30
15        0.32  48679.24          48678.92
16        0.34  49247.16          49246.82
17        0.36  53063.34          53062.98
18        0.38  53223.70          53223.32


In [9]:
data_a=filtered_data[filtered_data["relabeled"]==0][:11000]
data_b=filtered_data[filtered_data["relabeled"]==1][:11000]
data_c=filtered_data[filtered_data["relabeled"]==2][:11000]

In [10]:
data=pd.concat([data_a,data_b,data_c],axis=0)

In [11]:
from sklearn.preprocessing import MinMaxScaler

scaler=MinMaxScaler()
data[["acc x","acc y","acc z","gyro x","gyro y","gyro z"]]=scaler.fit_transform(data[["acc x","acc y","acc z","gyro x","gyro y","gyro z"]])

In [12]:
new_data=data.drop(['timestamp','datetime','user yes/no','compulsive', 'urge', 'tense', 'ignore'],axis=1)

In [13]:
features = new_data.drop(['relabeled'],axis=1)

In [14]:
# choosing relabeled as label value
label = new_data['relabeled']  # Create a 'label' column for majority voting

In [15]:
# Split data into train and test sets
train_data, test_data, y_train, y_test = train_test_split(features, label, random_state=8, test_size=0.4, stratify=label)
print(f"Number of train DataFrames: {len(y_train)}")
print(f"Number of test DataFrames: {len(y_test)}")

Number of train DataFrames: 19800
Number of test DataFrames: 13200


In [17]:
label_counts = y_train.value_counts()
count_label_0 = label_counts.get(0, 0)  # Count of label 0, default to 0 if not found
count_label_1 = label_counts.get(1, 0)  # Count of label 1, default to 0 if not found
count_label_2 = label_counts.get(2, 0)  # Count of label 2, default to 0 if not found
if(count_label_0 & count_label_1 & count_label_2):
    print(count_label_0)
    print(count_label_1)
    print(count_label_2)

6600
6600
6600


### Creating sliding windows and majority voting

In [18]:
# Define window size
window_size = 50
stride = 10
train_data_val = train_data.values

num_windows = (len(train_data) - window_size + 1)
# Initialize lists to store windowed data and labels
train_windows = []
train_labels = []
print(num_windows)


19751


In [19]:
for i in range(0, num_windows, stride):
    window = train_data_val[i:i+window_size]  # Select only sensor axes columns
    label_window = y_train.iloc[i:i+window_size]  # Select the label column for majority voting
    majority_label = np.bincount(label_window).argmax()  # Majority voting
    train_windows.append(window)  # Convert window to numpy array
    train_labels.append(majority_label)

In [20]:

train_windows = np.array(train_windows)
train_labels = np.array(train_labels)
# Flatten the sensor axes data in each window
num_trainsamp, num_fea_trainsamp = train_windows.shape[0], np.prod(train_windows.shape[1:])
train_windows_reshaped = train_windows.reshape(num_trainsamp, num_fea_trainsamp)
print(train_windows_reshaped)
print(len(np.unique(train_labels)))


[[0.6081053  0.47699533 0.58865512 ... 0.43383188 0.52902211 0.3251766 ]
 [0.52172102 0.50587105 0.69188526 ... 0.41825351 0.46609407 0.30285654]
 [0.64725728 0.49212779 0.72617945 ... 0.38544478 0.5517009  0.26323333]
 ...
 [0.66441752 0.47701186 0.68898879 ... 0.42134431 0.50589423 0.29144055]
 [0.61581074 0.46781661 0.68824729 ... 0.42649565 0.6021107  0.4473226 ]
 [0.67637136 0.49340124 0.60686811 ... 0.38388161 0.53227798 0.31611548]]
3


In [21]:
print(train_windows_reshaped.shape)
print(train_labels.shape)

(1976, 300)
(1976,)


In [22]:
# Generate windows and labels using a sliding window approach
window = []
label_window = []
test_windows = []
test_labels = []
for i in range(0,len(test_data)-window_size+1,stride):
    window = test_data.iloc[i:i+window_size]  # Select only sensor axes columns
    test_windows.append(window)
    label_window = y_test.iloc[i:i+window_size]  # Select the label column for majority voting
    majority_label = np.bincount(label_window).argmax()  # Majority voting
    test_labels.append(majority_label)

In [23]:
test_windows = np.array(test_windows)
test_labels = np.array(test_labels)
# Flatten the sensor axes data in each window
num_trainsamp, num_fea_trainsamp = test_windows.shape[0], np.prod(test_windows.shape[1:])
test_windows_reshaped = test_windows.reshape(num_trainsamp, num_fea_trainsamp)
print(test_windows_reshaped)
print(len(np.unique(test_labels)))

[[0.57345162 0.47601956 0.61412087 ... 0.41086399 0.39014258 0.26220947]
 [0.57540922 0.47418382 0.61569655 ... 0.42267656 0.55776355 0.31002354]
 [0.55420884 0.46935466 0.65107981 ... 0.38706124 0.57589536 0.22678406]
 ...
 [0.62305802 0.48405714 0.59655668 ... 0.45431291 0.6370832  0.31934062]
 [0.57153568 0.50403531 0.66361572 ... 0.43640755 0.53620747 0.31329987]
 [0.53829814 0.53884826 0.73725554 ... 0.43988915 0.50432244 0.22581139]]
3


Random Forest Classifier

In [24]:
clf = RandomForestClassifier(n_estimators=10,max_depth=9, random_state=6, class_weight='balanced',min_samples_split=12)
clf.fit(train_windows_reshaped, train_labels)

In [25]:
# Make predictions on the test set with SVM
y_pred = clf.predict(test_windows_reshaped)

# Evaluate the model
accuracy = accuracy_score(test_labels, y_pred)
print(accuracy)

report = classification_report(test_labels, y_pred)
# Print the report
print(report)

0.3366261398176292
              precision    recall  f1-score   support

           0       0.36      0.39      0.37       476
           1       0.35      0.32      0.34       453
           2       0.29      0.29      0.29       387

    accuracy                           0.34      1316
   macro avg       0.33      0.33      0.33      1316
weighted avg       0.34      0.34      0.34      1316



## Naives Bayes

In [26]:
# Create and train a Gaussian Naive Bayes classifier
# Calculate class weights
class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
class_weights /= class_weights.sum()
# Create a dictionary to map class labels to their corresponding weights
class_weight_dict = dict(zip(np.unique(y_train), class_weights))
NBclf = GaussianNB(priors=class_weights)
NBclf.fit(train_windows_reshaped, train_labels)

# Cross-validation
cv_scores = cross_val_score(NBclf, train_windows_reshaped, train_labels, cv=3)
print("Cross-Validation Scores:", cv_scores)

Cross-Validation Scores: [0.40060698 0.34446131 0.39513678]


In [27]:
# Evaluate the model
y_pred = NBclf.predict(test_windows_reshaped)
accuracy = accuracy_score(test_labels, y_pred)
print(f'Test Accuracy: {accuracy:.4f}')
report = classification_report(test_labels, y_pred)
# Print the report
print(report)

Test Accuracy: 0.3556
              precision    recall  f1-score   support

           0       0.39      0.49      0.44       476
           1       0.36      0.25      0.29       453
           2       0.30      0.32      0.31       387

    accuracy                           0.36      1316
   macro avg       0.35      0.35      0.35      1316
weighted avg       0.35      0.36      0.35      1316



## Logistic Regression

In [28]:
lclf = LogisticRegression(random_state=42)
lclf.fit(train_windows_reshaped, train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
# Make predictions on the test set with logistic regression
y_pred = lclf.predict(test_windows_reshaped)

# Evaluate the model
accuracy = accuracy_score(test_labels, y_pred)
print(accuracy)

report = classification_report(test_labels, y_pred)
# Print the report
print(report)

0.3670212765957447
              precision    recall  f1-score   support

           0       0.40      0.50      0.44       476
           1       0.37      0.40      0.38       453
           2       0.29      0.17      0.21       387

    accuracy                           0.37      1316
   macro avg       0.35      0.35      0.34      1316
weighted avg       0.35      0.37      0.35      1316



In [31]:
import xgboost as xgb


In [32]:


xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(train_windows_reshaped, train_labels)

# Predictions
xgb_predictions = xgb_classifier.predict(test_windows_reshaped)


In [33]:
# Evaluate the model
accuracy = accuracy_score(test_labels, y_pred)
print(accuracy)

report = classification_report(test_labels, y_pred)
# Print the report
print(report)

0.3670212765957447
              precision    recall  f1-score   support

           0       0.40      0.50      0.44       476
           1       0.37      0.40      0.38       453
           2       0.29      0.17      0.21       387

    accuracy                           0.37      1316
   macro avg       0.35      0.35      0.34      1316
weighted avg       0.35      0.37      0.35      1316

