<a href="https://colab.research.google.com/github/torta211/DataMining/blob/master/DataMiningInSmartSystems.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Clone data

In [1]:
!git clone --single-branch --branch data-only https://github.com/torta211/DataMining
%cd DataMining

Cloning into 'DataMining'...
remote: Enumerating objects: 102, done.[K
remote: Counting objects: 100% (102/102), done.[K
remote: Compressing objects: 100% (98/98), done.[K
remote: Total 102 (delta 2), reused 102 (delta 2), pack-reused 0[K
Receiving objects: 100% (102/102), 666.32 KiB | 10.58 MiB/s, done.
Resolving deltas: 100% (2/2), done.
/content/DataMining


Tthe possible class labels assigned for every sensor observation are:
- Sitting on bed
- Sitting on chair
- Lying on bed
- Ambulating, where ambulating includes standing, walking around the room.

The content of the file is as follows:

Comma separated values (CSV) format.
 - Column 1: Time in seconds
 - Column 2: Acceleration reading in G for frontal axis
 - Column 3: Acceleration reading in G for vertical axis
 - Column 4: Acceleration reading in G for lateral axis
 - Column 5: Id of antenna reading sensor
 - Column 6: Received signal strength indicator (RSSI)
 - Column 7: Phase
 - Column 8: Frequency
 - Column 9: Label of activity, 1: sit on bed, 2: sit on chair, 3: lying, 4: ambulating
 
In addition, gender of participant is included in the last character of file name eg: d1p33F (F:female).

# Creating dataframes

In [4]:
dfs_s1_male = []
dfs_s1_female = []
dfs_s2_male = []
dfs_s2_female = []
filenames_s1_male = []
filenames_s1_female = []
filenames_s2_male = []
filenames_s2_female = []
s1_base_dir = "/content/DataMining/Datasets_Healthy_Older_People/S1_Dataset"
s2_base_dir = "/content/DataMining/Datasets_Healthy_Older_People/S2_Dataset"
column_names = ["elapsedTime", "aFrontal", "aVertical", "aLateral", "sensorId", "signalStrength", "phase", "frequency", "currentActivity"]

for file_name in os.listdir(s1_base_dir):
  if file_name.endswith('M'):
    filenames_s1_male.append(file_name)
    dfs_s1_male.append(pd.read_csv(os.path.join(s1_base_dir, file_name),
                                   header=None,
                                   names=column_names))
  elif file_name.endswith('F'):
    filenames_s1_female.append(file_name)
    dfs_s1_female.append(pd.read_csv(os.path.join(s1_base_dir, file_name),
                                     header=None,
                                     names=column_names))

for file_name in os.listdir(s2_base_dir):
  if file_name.endswith('M'):
    filenames_s2_male.append(file_name)
    dfs_s2_male.append(pd.read_csv(os.path.join(s2_base_dir, file_name),
                                   header=None,
                                   names=column_names))
  elif file_name.endswith('F'):
    filenames_s2_female.append(file_name)
    dfs_s2_female.append(pd.read_csv(os.path.join(s2_base_dir, file_name),
                                     header=None,
                                     names=column_names))

print(f"dataset s1 has\n - {len(dfs_s1_male)} male participants\n - {len(dfs_s1_female)} female participants")
print(f"dataset s2 has\n - {len(dfs_s2_male)} male participants\n - {len(dfs_s2_female)} female participants")

dataset s1 has
 - 20 male participants
 - 40 female participants
dataset s2 has
 - 5 male participants
 - 22 female participants


In [0]:
datasets = [dfs_s1_male, dfs_s1_female, dfs_s2_male, dfs_s2_female]
dataset_names = ["S1 Male", "S1 Female", "S2 Male", "S2 Female"]

# Data understanding

## Examine time properties

In [0]:
for i in range(0, len(datasets)):
  times = [df.iloc[-1]['elapsedTime'] for df in datasets[i]]
  nums_activities = [df['currentActivity'].nunique() for df in datasets[i]]
  print(f"{dataset_names[i]}:\n"
        f" - average time={sum(times) / len(times)}\n"
        f" - max time={max(times)}\n"
        f" - min time={min(times)}\n"
        f" - average number of activities={sum(nums_activities) / len(nums_activities)}\n"
        f" - min number of activities={min(nums_activities)}\n"
        f" - max number of activities={max(nums_activities)}")

S1 Male:
 - average time=418.6995
 - max time=730.25
 - min time=240.0
 - average number of activities=4.0
 - min number of activities=4
 - max number of activities=4
S1 Female:
 - average time=272.6775
 - max time=666.5
 - min time=1.75
 - average number of activities=3.325
 - min number of activities=1
 - max number of activities=4
S2 Male:
 - average time=711.0
 - max time=981.75
 - min time=490.0
 - average number of activities=3.8
 - min number of activities=3
 - max number of activities=4
S2 Female:
 - average time=655.5990909090909
 - max time=1739.4
 - min time=374.75
 - average number of activities=3.727272727272727
 - min number of activities=3
 - max number of activities=4


In [0]:
timediffs_1 = []
for i in range(0, len(datasets[0])):
  timediffs_1.append(datasets[0][i]['elapsedTime'].diff()[1:])
for i in range(0, len(datasets[1])):
  timediffs_1.append(datasets[1][i]['elapsedTime'].diff()[1:])
timediffs_2 = []
for i in range(0, len(datasets[2])):
  timediffs_2.append(datasets[2][i]['elapsedTime'].diff()[1:])
for i in range(0, len(datasets[2])):
  timediffs_2.append(datasets[2][i]['elapsedTime'].diff()[1:])

In [0]:
timediff_1 = pd.concat(timediffs_1)
timediff_2 = pd.concat(timediffs_2)

In [0]:
mean_1 = timediff_1.mean()
dev_1 = timediff_1.std()
max_1 = timediff_1.max()
print(mean_1, dev_1, max_1)
mean_2 = timediff_2.mean()
dev_2 = timediff_2.std()
max_2 = timediff_2.max()
print(mean_2, dev_2, max_2)

0.36780531074729217 2.43787601825473 185.15
0.8878621378621377 5.1153005215212985 224.0


## Data to plot

In [0]:
data_to_plot =datasets[0][0]
sitting_on_bed = data_to_plot[data_to_plot["currentActivity"] == 1]
sitting_on_chair = data_to_plot[data_to_plot["currentActivity"] == 2]
lying_on_bed = data_to_plot[data_to_plot["currentActivity"] == 3]
ambulating = data_to_plot[data_to_plot["currentActivity"] == 4]

## Plot accelerations

In [14]:
fig = go.Figure(data=[go.Scatter3d(x=sitting_on_bed["aFrontal"],
                                   y=sitting_on_bed["aVertical"],
                                   z=sitting_on_bed["aLateral"],
                                   mode='markers',
                                   marker=dict(size=6,
                                               color='blue'),
                                   name="Sitting on bed"),
                      go.Scatter3d(x=sitting_on_chair["aFrontal"],
                                   y=sitting_on_chair["aVertical"],
                                   z=sitting_on_chair["aLateral"],
                                   mode='markers',
                                   marker=dict(size=6,
                                               color='purple'),
                                   name="Sitting on chair"),
                      go.Scatter3d(x=lying_on_bed["aFrontal"],
                                   y=lying_on_bed["aVertical"],
                                   z=lying_on_bed["aLateral"],
                                   mode='markers',
                                   marker=dict(size=6,
                                               color='yellow'),
                                   name="Lying on bed"),
                      go.Scatter3d(x=ambulating["aFrontal"],
                                   y=ambulating["aVertical"],
                                   z=ambulating["aLateral"],
                                   mode='markers',
                                   marker=dict(size=6,
                                               color='red'),
                                   name="ambulating")])
fig.update_layout(showlegend=True,
                  scene = dict(
                      xaxis_title='frontal acceleration',
                      yaxis_title='vertical acceleration',     
                      zaxis_title='lateral acceleration'),
                  width=1000)

fig.show()

## Plot sensor Id and signal strength

In [13]:
fig = go.Figure(data=[go.Scatter(x=sitting_on_bed["sensorId"],
                                 y=sitting_on_bed["signalStrength"],
                                 mode='markers',
                                 marker=dict(size=6,
                                             color='blue'),
                                 name="Sitting on bed"),
                      go.Scatter(x=sitting_on_chair["sensorId"],
                                 y=sitting_on_chair["signalStrength"],
                                 mode='markers',
                                 marker=dict(size=6,
                                             color='purple'),
                                 name="Sitting on chair"),
                      go.Scatter(x=ambulating["sensorId"],
                                 y=ambulating["signalStrength"],
                                 mode='markers',
                                 marker=dict(size=6,
                                             color='red'),
                                 name="Ambulating")])
fig.update_layout(showlegend=True,
                  xaxis_title='sensor id',
                  yaxis_title='signal strength',     
                  width=1000)
fig.show()

## Plot Signal phase and frequency

In [0]:
fig = go.Figure(data=[go.Scatter(x=sitting_on_bed["phase"],
                                 y=sitting_on_bed["frequency"],
                                 mode='markers',
                                 marker=dict(size=6,
                                             color='blue'),
                                 name="Sitting on bed"),
                      go.Scatter(x=sitting_on_chair["phase"],
                                 y=sitting_on_chair["frequency"],
                                 mode='markers',
                                 marker=dict(size=6,
                                             color='purple'),
                                 name="Sitting on chair"),
                      go.Scatter(x=ambulating["phase"],
                                 y=ambulating["frequency"],
                                 mode='markers',
                                 marker=dict(size=6,
                                             color='red'),
                                 name="Ambulating"),
                      go.Scatter(x=lying_on_bed["phase"],
                                 y=lying_on_bed["frequency"],
                                 mode='markers',
                                 marker=dict(size=6,
                                             color='yellow'),
                                 name="Lying on bed")])
fig.update_layout(showlegend=True,
                  xaxis_title='phase',
                  yaxis_title='frequency',     
                  width=1000)
fig.show()

# Data split

## Shuffle all data and split with given ratio

In [0]:
df_all = pd.concat(datasets[0] + datasets[1] + datasets[2] + datasets[3])
data_all = df_all.drop('currentActivity', axis=1)
labels_all = df_all['currentActivity']

In [0]:
train_set, test_set,\
train_labels, test_labels = train_test_split(data_all, labels_all, test_size=0.30,
                                             random_state=42)

## Split by gender

In [0]:
df_male = pd.concat(datasets[0] + datasets[2])
df_female = pd.concat(datasets[1] + datasets[3])
train_set = df_female.drop('currentActivity', axis=1)
train_labels = df_female['currentActivity']
test_set = df_male.drop('currentActivity', axis=1)
test_labels = df_male['currentActivity']

## Split by gender 90-10

In [0]:
df_male = pd.concat(datasets[0] + datasets[2])
df_female = pd.concat(datasets[1] + datasets[3])
x_female = df_female.drop('currentActivity', axis=1)
y_female = df_female['currentActivity']
x_male = df_male.drop('currentActivity', axis=1)
y_male = df_male['currentActivity']

x_male_big, x_male_small,\
y_male_big, y_male_small = train_test_split(x_male, y_male, test_size=0.10,
                                            random_state=42)

x_female_big, x_female_small,\
y_female_big, y_female_small = train_test_split(x_female, y_female, test_size=0.10,
                                            random_state=42)

train_set = pd.concat([x_female_big, x_male_small])
train_labels = pd.concat([y_female_big, y_male_small])
test_set = pd.concat([x_male_big, x_female_small])
test_labels = pd.concat([y_male_big, y_female_small])

## Split by room

In [0]:
df1 = pd.concat(datasets[0] + datasets[1])
df2 = pd.concat(datasets[2] + datasets[3])
train_set = df1.drop('currentActivity', axis=1)
train_labels = df1['currentActivity']
test_set = df2.drop('currentActivity', axis=1)
test_labels = df2['currentActivity']

## Split by room 90-10

In [0]:
df1 = pd.concat(datasets[0] + datasets[1])
df2 = pd.concat(datasets[2] + datasets[3])
x1 = df1.drop('currentActivity', axis=1)
y1 = df1['currentActivity']
x2 = df2.drop('currentActivity', axis=1)
y2 = df2['currentActivity']

x1_big, x1_small,\
y1_big, y1_small = train_test_split(x1, y1, test_size=0.10,
                                            random_state=42)

x2_big, x2_small,\
y2_big, y2_small = train_test_split(x2, y2, test_size=0.10,
                                            random_state=42)

train_set = pd.concat([x1_big, x2_small])
train_labels = pd.concat([y1_big, y2_small])
test_set = pd.concat([x2_big, x1_small])
test_labels = pd.concat([y2_big, y1_small])

## Drop only time

In [0]:
train_set = train_set.drop('elapsedTime', axis=1)
test_set = test_set.drop('elapsedTime', axis=1)

## Drop time and phase and frequency

In [0]:
train_set = train_set[["aFrontal", "aVertical", "aLateral", "sensorId", "signalStrength",]]
test_set = test_set[["aFrontal", "aVertical", "aLateral", "sensorId", "signalStrength",]]

## Only keep acceleration

In [0]:
train_set = train_set[["aFrontal", "aVertical", "aLateral"]]
test_set = test_set[["aFrontal", "aVertical", "aLateral"]]

# Train a random forest

In [33]:
rfc = RandomForestClassifier(n_estimators=100,
                             random_state=42,
                             class_weight='balanced')
rfc.fit(train_set, train_labels)

rfc_predictions = rfc.predict(test_set)
print(metrics.classification_report(test_labels, rfc_predictions))
print('Accuracy: ', metrics.accuracy_score(test_labels, rfc_predictions))

              precision    recall  f1-score   support

           1       0.97      0.98      0.97      4915
           2       0.96      0.97      0.97      1496
           3       1.00      1.00      1.00     15433
           4       0.95      0.78      0.86       695

    accuracy                           0.99     22539
   macro avg       0.97      0.93      0.95     22539
weighted avg       0.99      0.99      0.99     22539

Accuracy:  0.9874439859798572


# Fine tune a random forest

In [0]:
rfc = RandomForestClassifier()
params = {
    'n_estimators': [50, 100, 250],
    'class_weight': ['balanced', 'balanced_subsample'],
    'random_state': [42],
    'max_depth': [2, 4, 7, 12],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 8],
    'max_features': [1, 2, 3, 4, 5]    
}

In [27]:
gridsearch = GridSearchCV(estimator = rfc,
                          param_grid = params,
                          scoring = 'accuracy',
                          cv = 3,
                          verbose = 1,
                          refit=True,
                          n_jobs = -1)
gridsearch.fit(train_set, train_labels)

rfc = gridsearch.best_estimator_
rfc_predictions = rfc.predict(test_set)
print(metrics.classification_report(test_labels, rfc_predictions))
print('Accuracy: ', metrics.accuracy_score(test_labels, rfc_predictions))

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:  8.7min finished


              precision    recall  f1-score   support

           1       0.97      0.95      0.96      4915
           2       0.91      0.97      0.94      1496
           3       1.00      1.00      1.00     15433
           4       0.82      0.79      0.81       695

    accuracy                           0.98     22539
   macro avg       0.93      0.93      0.93     22539
weighted avg       0.98      0.98      0.98     22539

Accuracy:  0.9809663250366032


# Inference on one file

In [0]:
file_data = dfs_s2_female[1]
rfc_predictions = rfc.predict(file_data[["aFrontal", "aVertical", "aLateral", "sensorId", "signalStrength",]])

In [0]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=file_data['elapsedTime'], y=file_data['currentActivity'],
                         mode='lines',
                         name="groundtruth"))
fig.add_trace(go.Scatter(x=file_data['elapsedTime'], y=rfc_predictions,
                         mode='lines',
                         name="prediction"))
fig.update_layout(yaxis_title="1: Sitting on bed<br>2: Sitting on chair<br>3: Lying on bed<br>4: Ambulating",
                  xaxis_title="Elapsed Time [sec]",
                  width=1000)
fig.show()

# Junk