## Libraries

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
from influxdb import InfluxDBClient, DataFrameClient

## Load Data

In [4]:
import os

# Connect to the InfluxDB instance that stores the browser sensor recordings.
# Credentials are taken from the environment when set; the fallbacks are the
# values previously hardcoded here, so the cell keeps working unchanged.
influxClient = DataFrameClient(
    host='css20.dmz.teco.edu',
    port=80,
    database='browser',
    username=os.environ.get("INFLUX_USERNAME", "user"),
    password=os.environ.get("INFLUX_PASSWORD", "pass"),
)
# The query result is indexed by measurement name; keep the devicemotion frame
data = influxClient.query("select * FROM devicemotion")["devicemotion"]

In [5]:
# Show which columns the devicemotion query returned
data.columns

Index(['acceleration.x', 'acceleration.y', 'acceleration.z',
       'accelerationIncludingGravity.x', 'accelerationIncludingGravity.y',
       'accelerationIncludingGravity.z', 'label', 'mobile',
       'rotationRate.alpha', 'rotationRate.beta', 'rotationRate.gamma',
       'subject', 'useragent'],
      dtype='object')

In [6]:
# Peek at the first two rows to check the values and the timestamp index
data.head(2)

Unnamed: 0,acceleration.x,acceleration.y,acceleration.z,accelerationIncludingGravity.x,accelerationIncludingGravity.y,accelerationIncludingGravity.z,label,mobile,rotationRate.alpha,rotationRate.beta,rotationRate.gamma,subject,useragent
2020-05-18 15:38:31.330200+00:00,0.0514,-0.4303,1.4837,0.038307,5.439626,9.825804,testing,UnknownPhone,-13.115,-43.004999,0.488,10b5c,Mozilla/5.0 (Linux; Android 7.0; FRD-L19) Appl...
2020-05-18 15:38:31.348400+00:00,0.6009,-0.1249,1.0312,0.172383,5.477933,9.174581,testing,UnknownPhone,-2.623,-6.527,-2.379,10b5c,Mozilla/5.0 (Linux; Android 7.0; FRD-L19) Appl...


In [7]:
# (rows, columns) of the raw query result, before any cleaning
data.shape

(346487, 13)

## Data Cleaning

In [8]:
# Step 1: keep only the rows whose index is on or after 2020-07-01
from_july = data.loc['2020-07-01':]
# Step 2: discard every row that has a missing value in any column
data = from_july.dropna()
data.shape

(179951, 13)

## Preprocessing

In [9]:
# Extract labels and subjects.
# The "testing" label is removed by value: dropping the first entry of unique()
# only works while "testing" happens to be the first label in the data, and
# would silently discard a real label otherwise.
all_labels = data['label'].unique()
labels = all_labels[all_labels != "testing"]
subjects = data['subject'].unique()

# Group data according to subjects and labels:
# grouped_data[subject][label] holds the rows of `data` for that combination
# (an empty frame when the subject never recorded that label).
grouped_data = {}
for subject in subjects:
    subject_data = data[data['subject'] == subject]
    grouped_data[subject] = {
        label: subject_data[subject_data["label"] == label]
        for label in labels
    }

## Windowing

In [10]:
def minmax(data):
    """Return the spread of ``data``: its largest value minus its smallest."""
    highest = np.max(data)
    lowest = np.min(data)
    return highest - lowest

In [11]:
# Prepare aggregation: every float64 column is summarised with the same set of
# statistics (built-in pandas reducers by name, plus the custom `minmax` range)
window_stats = ['max', 'mean', 'min', "std", "var", minmax]
numeric_columns = {
    col: list(window_stats)
    for col, dtype in zip(data.columns, data.dtypes)
    if dtype == 'float64'
}

In [12]:
# aggregatedDict[subject][label] -> per-second feature table for that recording.
# Combinations without any rows get no entry at all.
aggregatedDict = {}
for subject in subjects:
    per_label = {}
    for label in labels:
        recording = grouped_data[subject][label]
        # nothing recorded for this subject/label pair
        if recording.empty:
            continue
        # resample into 1-second windows, compute the statistics per numeric
        # column and drop windows where any statistic is NaN
        windows = recording.resample('1s')
        per_label[label] = windows.agg(numeric_columns).dropna()
    aggregatedDict[subject] = per_label

## Data Transformation

In [13]:
# groups for Leave-One-Subject-Out-CV (one subject id per row of `data`)
groups = []

# Transform data into ungrouped and flat table.
# The per-recording frames are collected and concatenated once:
# DataFrame.append was removed in pandas 2.0 and copied the whole table on
# every call.
frames = []
for subject in subjects:
    for label in labels:
        if label in aggregatedDict[subject]:
            # assign() returns a copy, so the frames stored in aggregatedDict
            # are left untouched and this cell can be re-run safely
            curr_data = aggregatedDict[subject][label].assign(label=label)
            frames.append(curr_data)
            groups.extend(len(curr_data) * [subject])
# pd.concat rejects an empty list; fall back to an empty table in that case
data = pd.concat(frames) if frames else pd.DataFrame()
# append groups
data["group"] = groups

# Train-Test Split

In [14]:
# Seed
np.random.seed(1234)

# Draw random Training-Data-Indices: 80% of the subjects (rounded down) go to training
number_subjects = len(subjects)
size = int(0.8*number_subjects)

# Extract Train/Test-Subjects
train_subjects = np.random.choice(subjects, size=size, replace=False)
# Keep the remaining subjects in their original order. A plain set difference
# yields an order that can change between kernel sessions, which would
# reorder the rows of test_data.
train_subject_set = set(train_subjects)
test_subjects = [s for s in subjects if s not in train_subject_set]

# Setup train and test data: all rows of a subject end up on the same side.
# The per-subject slices are concatenated once, as DataFrame.append was
# removed in pandas 2.0 and is quadratic when called in a loop.
train_data = pd.concat([data.loc[data["group"] == s] for s in train_subjects])
test_data = pd.concat([data.loc[data["group"] == s] for s in test_subjects])

# finalize train / test data
x_train = train_data.drop(["group", "label"], axis="columns")
x_test = test_data.drop(["group", "label"], axis="columns")
y_train = train_data["label"]
y_test = test_data["label"]

## Train-Test Split (Leave-one-Subject-out)

# Exercise 4 Creating a predictor

In [28]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.feature_selection import RFE

## 4.1 10-Fold-CV

In [24]:
# Both estimators are stochastic (weight initialisation / bootstrap sampling).
# Seeding them explicitly makes the reported scores reproducible.
cv_seed = 1234

# MLP
mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=100, random_state=cv_seed)
mlp_res = cross_validate(mlp, x_train, y_train, n_jobs=2, cv=10,
                         return_estimator=False, return_train_score=True)

# RF
rf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=cv_seed)
rf_res = cross_validate(rf, x_train, y_train, n_jobs=2, cv=10,
                         return_estimator=False, return_train_score=True)

# Report the mean accuracy over the 10 held-out folds
print("MLP 10-Fold-CV-Accuracy: ", mlp_res["test_score"].mean())
print("Random Forest 10-Fold-CV-Accuracy: ", rf_res["test_score"].mean())

MLP 10-Fold-CV-Accuracy:  0.7536022893053529
Random Forest 10-Fold-CV-Accuracy:  0.7250084165637976


## 4.2 Leave-One-Subject-Out-CV

In [27]:
# Leave-one-group-out splitter: every subject in the training data is held
# out once. (The former bare get_n_splits() call only computed a number that
# was thrown away, so it is dropped.)
logo = LeaveOneGroupOut()
# subject id for every training row, shared by both cross-validations
train_groups = train_data["group"].values

# Seed the stochastic estimators so the scores are reproducible
cv_seed = 1234

# MLP
mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=100, random_state=cv_seed)
mlp_res = cross_validate(mlp, x_train, y_train, cv=logo, n_jobs=2, groups=train_groups,
                         return_estimator=False, return_train_score=True)

# RF
rf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=cv_seed)
rf_res = cross_validate(rf, x_train, y_train, cv=logo, n_jobs=2, groups=train_groups,
                        return_estimator=False, return_train_score=True)

# Report the mean accuracy over the held-out subjects
print("MLP LOGO-CV-Accuracy: ", mlp_res["test_score"].mean())
print("Random Forest LOGO-CV-Accuracy: ", rf_res["test_score"].mean())

MLP LOGO-CV-Accuracy:  0.6974314865035531
Random Forest LOGO-CV-Accuracy:  0.7129975140246855


## 4.3 Recursive Feature Elimination

In [45]:
# Recursive feature elimination with the random forest `rf`: remove one
# feature per round (step=1) until 10 are left.
# NOTE(review): the features are selected on all of x_train before the
# cross-validation below, so the CV score may be optimistic — confirm whether
# that is acceptable here.
rfe = RFE(rf, n_features_to_select=10, step=1, verbose=0).fit(x_train, y_train)

# evaluate: repeat the 10-fold CV using only the selected columns
selected_features = x_train.columns[rfe.support_]
x_train_new = x_train[selected_features]
rf_res_new = cross_validate(
    rf,
    x_train_new,
    y_train,
    n_jobs=2,
    cv=10,
    return_estimator=False,
    return_train_score=True,
)
rfe_accuracy = rf_res_new["test_score"].mean()
print("Random Forest - RFE -  10-Fold-CV-Accuracy: ", rfe_accuracy)

Random Forest - RFE -  10-Fold-CV-Accuracy:  0.7182246661429694


# Classifier

## 1. MLP

In [56]:
from hyperopt import Trials, STATUS_OK, tpe, fmin, hp
# Seed NumPy's global random generator for this section
np.random.seed(4321)

# Search space for the MLP: hidden layer width between 5 and 50 and iteration
# limit between 10 and 500, both quantised to steps of 1 (the objective casts
# the sampled values to int)
mlp_space = dict(
    hidden_layer_sizes=hp.quniform('hidden_layer_sizes', 5, 50, 1),
    max_iter=hp.quniform("max_iter", 10, 500, 1),
)
# Number of configurations to evaluate (passed to fmin as max_evals)
max_iters = 10

def mlp_opt(params):
    """Hyperopt objective for the MLP classifier.

    Casts the sampled hyperparameters to int, scores an ``MLPClassifier``
    with 10-fold cross-validation on the notebook-level ``x_train_new`` /
    ``y_train`` and reports the negated mean test score as the loss (hyperopt
    minimises).

    Parameters
    ----------
    params : dict
        Sampled configuration with the keys ``"hidden_layer_sizes"`` and
        ``"max_iter"``.

    Returns
    -------
    dict
        ``loss`` (negative mean CV test score), ``status``, ``model`` (always
        ``None``) and ``params`` (the configuration with integer values).
    """
    from sklearn.neural_network import MLPClassifier
    from sklearn.model_selection import cross_validate

    # Work on a copy so the dict handed in by the caller is not modified
    int_params = dict(params)
    int_params["hidden_layer_sizes"] = int(params["hidden_layer_sizes"])
    int_params["max_iter"] = int(params["max_iter"])

    estimator = MLPClassifier(hidden_layer_sizes=int_params["hidden_layer_sizes"],
                              max_iter=int_params["max_iter"])
    # Only the test scores feed the loss, so train scores are not computed
    cv_res = cross_validate(estimator, x_train_new, y_train, cv=10, n_jobs=2,
                            return_estimator=False)

    score = -1 * cv_res["test_score"].mean()
    return {'loss': score, 'status': STATUS_OK, 'model': None, 'params': int_params}

### optimization
# Run the TPE search over mlp_space; every evaluation is recorded in `trials`
trials = Trials()
best_mlp = fmin(
    mlp_opt,
    mlp_space,
    algo=tpe.suggest,
    max_evals=max_iters,
    trials=trials,
    rstate=np.random.RandomState(1234),
)
print('best: ')
### Extract params
# pick the recorded result with the lowest loss (= highest mean CV accuracy)
trial_losses = [result['loss'] for result in trials.results]
best_index = np.argmin(trial_losses)
best_MLP_params = trials.results[best_index]['params']
print(best_MLP_params)
# Extract Model: refit on the complete (feature-selected) training set
MLP_model = MLPClassifier(
    hidden_layer_sizes=best_MLP_params["hidden_layer_sizes"],
    max_iter=best_MLP_params["max_iter"],
)
MLP_model.fit(x_train_new, y_train)

100%|█████████████████████████████████████████████████| 5/5 [01:08<00:00, 14.36s/trial, best loss: -0.7243126472898666]
best: 
{'hidden_layer_sizes': 19, 'max_iter': 438}


MLPClassifier(hidden_layer_sizes=19, max_iter=438)

## 2. RF

In [57]:
from hyperopt import Trials, STATUS_OK, tpe, fmin, hp
# Seed NumPy's global random generator for this section
np.random.seed(4321)

# Search space for the random forest: number of trees between 5 and 200 and
# tree depth between 1 and 4, both quantised to steps of 1 (the objective
# casts the sampled values to int)
rf_space = dict(
    n_estimators=hp.quniform('n_estimators', 5, 200, 1),
    max_depth=hp.quniform("max_depth", 1, 4, 1),
)
# Number of configurations to evaluate (passed to fmin as max_evals)
max_iters = 10

def rf_opt(params):
    """Hyperopt objective for the random forest classifier.

    Casts the sampled hyperparameters to int, scores a
    ``RandomForestClassifier`` with 10-fold cross-validation on the
    notebook-level ``x_train_new`` / ``y_train`` and reports the negated mean
    test score as the loss (hyperopt minimises).

    Parameters
    ----------
    params : dict
        Sampled configuration with the keys ``"n_estimators"`` and
        ``"max_depth"``.

    Returns
    -------
    dict
        ``loss`` (negative mean CV test score), ``status``, ``model`` (always
        ``None``) and ``params`` (the configuration with integer values).
    """
    # Import the estimator this function actually builds; the earlier local
    # imports pulled in MLPClassifier and LeaveOneGroupOut, neither of which
    # is used here.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_validate

    # Work on a copy so the dict handed in by the caller is not modified
    int_params = dict(params)
    int_params["n_estimators"] = int(params["n_estimators"])
    int_params["max_depth"] = int(params["max_depth"])

    estimator = RandomForestClassifier(n_estimators=int_params["n_estimators"],
                                       max_depth=int_params["max_depth"])
    # Only the test scores feed the loss, so train scores are not computed
    cv_res = cross_validate(estimator, x_train_new, y_train, cv=10, n_jobs=2,
                            return_estimator=False)

    score = -1 * cv_res["test_score"].mean()
    return {'loss': score, 'status': STATUS_OK, 'model': None, 'params': int_params}

### optimization
# Run the TPE search over rf_space; every evaluation is recorded in `trials`
trials = Trials()
best_rf = fmin(
    rf_opt,
    rf_space,
    algo=tpe.suggest,
    max_evals=max_iters,
    trials=trials,
    rstate=np.random.RandomState(1234),
)
print('best: ')
### Extract params
# pick the recorded result with the lowest loss (= highest mean CV accuracy)
trial_losses = [result['loss'] for result in trials.results]
best_index = np.argmin(trial_losses)
best_RF_params = trials.results[best_index]['params']
print(best_RF_params)
# Extract Model: refit on the complete (feature-selected) training set
RF_model = RandomForestClassifier(
    n_estimators=best_RF_params["n_estimators"],
    max_depth=best_RF_params["max_depth"],
)
RF_model.fit(x_train_new, y_train)

100%|███████████████████████████████████████████████| 10/10 [00:23<00:00,  1.98s/trial, best loss: -0.7859106722028952]
best: 
{'max_depth': 4, 'n_estimators': 64}


RandomForestClassifier(max_depth=4, n_estimators=64)