## В домашньому завданні до даного модулю ви потренуєтесь робити тестове завдання для влаштування на роботу. За даними акселерометра з мобільного телефону потрібно класифікувати, якою діяльністю займається людина: йде, стоїть, біжить чи йде по сходах. Знайти датасет ви можете за [посиланням](https://drive.google.com/file/d/1nzrtQpfaHL0OgJ_eXzA7VuEj7XotrSWO/view?usp=share_link).

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

## Load data

In [2]:
from pathlib import Path
from urllib.request import urlretrieve
import zipfile

URL = "https://drive.usercontent.google.com/download?id=1nzrtQpfaHL0OgJ_eXzA7VuEj7XotrSWO&export=download"

FILE_NAME = Path('content/data.zip')
FOLDER_NAME = Path('content/data')

In [None]:
urlretrieve(URL, FILE_NAME)

with zipfile.ZipFile(FILE_NAME, 'r') as zipfp:
    zipfp.extractall(FOLDER_NAME.parent)

## Create dataset

In [3]:
def read_csv(path):
    df = pd.read_csv(path, header=0)
    if len(df) == 0:
        print("Error: file not found")
        raise ValueError
    
    desribe_df = df.describe()
    var_df = df.var()

    for col in df.columns:
        df[f'{col}_max'] = desribe_df[col]['max']
        df[f'{col}_mean'] = desribe_df[col]['mean']
        df[f'{col}_min'] = desribe_df[col]['min']
        df[f'{col}_var'] = var_df[col]
        df[f'{col}_std'] = desribe_df[col]['std']
        df[f'{col}_median'] = desribe_df[col]['50%']
        df[f'{col}_iqr'] = desribe_df[col]['75%'] - desribe_df[col]['25%']


    df['activity'] = path.parent.name

    
    return df

In [4]:
df = pd.concat([read_csv(file) for file in FOLDER_NAME.glob('**/*.csv')])

In [5]:
df.isna().sum()

accelerometer_X           0
accelerometer_Y           0
accelerometer_Z           0
accelerometer_X_max       0
accelerometer_X_mean      0
accelerometer_X_min       0
accelerometer_X_var       0
accelerometer_X_std       0
accelerometer_X_median    0
accelerometer_X_iqr       0
accelerometer_Y_max       0
accelerometer_Y_mean      0
accelerometer_Y_min       0
accelerometer_Y_var       0
accelerometer_Y_std       0
accelerometer_Y_median    0
accelerometer_Y_iqr       0
accelerometer_Z_max       0
accelerometer_Z_mean      0
accelerometer_Z_min       0
accelerometer_Z_var       0
accelerometer_Z_std       0
accelerometer_Z_median    0
accelerometer_Z_iqr       0
activity                  0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 193860 entries, 0 to 29
Data columns (total 25 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   accelerometer_X         193860 non-null  float64
 1   accelerometer_Y         193860 non-null  float64
 2   accelerometer_Z         193860 non-null  float64
 3   accelerometer_X_max     193860 non-null  float64
 4   accelerometer_X_mean    193860 non-null  float64
 5   accelerometer_X_min     193860 non-null  float64
 6   accelerometer_X_var     193860 non-null  float64
 7   accelerometer_X_std     193860 non-null  float64
 8   accelerometer_X_median  193860 non-null  float64
 9   accelerometer_X_iqr     193860 non-null  float64
 10  accelerometer_Y_max     193860 non-null  float64
 11  accelerometer_Y_mean    193860 non-null  float64
 12  accelerometer_Y_min     193860 non-null  float64
 13  accelerometer_Y_var     193860 non-null  float64
 14  accelerometer_Y_std     19386

## Використайте алгоритми SVM та випадковий ліс з бібліотеки scikit-learn. Як характеристики можете брати показники з акселерометра, проте щоб покращити результати роботи алгоритмів, спочатку можна підготувати наш датасет і розрахувати часові ознаки (time domain features). Більше ці характеристики описані в даній [статті](https://drive.google.com/file/d/1-18YEmp0YjV3hN9iI8J1i_FWd55HFwOK/view?usp=share_link).

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [8]:
X = df.drop(columns='activity')
y = df['activity']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((155088, 24), (38772, 24))

In [10]:
fit_results = {}
df_metrics_score_results = pd.DataFrame(columns=['model', 'param', 'accuracy', 'precision', 'recall', 'f1'])

In [11]:
def metrics_score(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    return accuracy, precision, recall, f1

## SVM

In [12]:
from itertools import product
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
# c_params = [1., 100., 1000.]
c_params = [1.0, 10.0, 100.0]

for kernel, c in product(kernels, c_params):
    model_svm = SVC(C=c, kernel=kernel)
    model_svm.fit(X_train, y_train)

    y_pred = model_svm.predict(X_test)
    accuracy, precision, recall, f1 = metrics_score(y_test, y_pred)
    
    df_metrics_score_results.loc[len(df_metrics_score_results.index)] = ['SVC', f'C: {c} kernel: {kernel},', accuracy, precision, recall, f1]
    fit_results[f'SVC param - C: {c} kernel: {kernel}'] = y_pred

## Use GridSearchCV

In [44]:
model_svm = SVC()

param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [1., 10., 100.]
}

# grid_search = GridSearchCV(model_svm, param_grid, n_jobs=-1 , cv=5)
grid_search = GridSearchCV(model_svm, param_grid, n_jobs=-1, return_train_score=True)
grid_search.fit(X_train, y_train)

In [26]:
print("Best params:", grid_search.best_params_)

Best params: {'C': 10.0, 'kernel': 'rbf'}


## Random Forest Classifier

In [13]:
n_estimators = [10, 25, 50, 100, 200]

for n_estimator in n_estimators:
    model_rfc = RandomForestClassifier(n_estimators=n_estimator)
    model_rfc.fit(X_train, y_train)

    y_pred = model_rfc.predict(X_test)

    accuracy, precision, recall, f1 = metrics_score(y_test, y_pred)

    df_metrics_score_results.loc[len(df_metrics_score_results.index)] = ['RandomForestClassifier', f'n_estimator: {n_estimator}', accuracy, precision, recall, f1]
    fit_results[f'RandomForestClassifier param - n_estimator: {n_estimator}'] = y_pred

## Use GridSearchCV

In [None]:
model_rfc = RandomForestClassifier()

param_grid = {
    'n_estimators': [10, 25, 50, 100, 200]
}

grid_search = GridSearchCV(model_rfc, param_grid, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
print("Best params:", grid_search.best_params_)

## Порівняйте результати роботи обох алгоритмів на різних фічах та різні моделі між собою.

In [20]:
df_metrics_score_results

Unnamed: 0,model,param,accuracy,precision,recall,f1
0,SVC,C: 1.0 kernel: linear,0.993707,0.993557,0.993707,0.99359
1,SVC,C: 10.0 kernel: linear,0.993887,0.993755,0.993887,0.99379
2,SVC,C: 100.0 kernel: linear,0.99381,0.993689,0.99381,0.993728
3,SVC,C: 1.0 kernel: poly,0.972196,0.973655,0.972196,0.962163
4,SVC,C: 10.0 kernel: poly,0.987646,0.987183,0.987646,0.986384
5,SVC,C: 100.0 kernel: poly,0.991824,0.991534,0.991824,0.991485
6,SVC,C: 1.0 kernel: rbf,0.990457,0.990148,0.990457,0.989828
7,SVC,C: 10.0 kernel: rbf,0.99461,0.994505,0.99461,0.994452
8,SVC,C: 100.0 kernel: rbf,0.999149,0.999147,0.999149,0.999145
9,SVC,C: 1.0 kernel: sigmoid,0.798076,0.799526,0.798076,0.797355


## Порівняйте результати роботи обох алгоритмів на різних фічах та різні моделі між собою. Використайте метод classification report для порівняння.

In [15]:
for k, v in fit_results.items():
    print(k)
    print(classification_report(y_test, v))

SVC param - C: 1.0 kernel: linear,
              precision    recall  f1-score   support

        idle       1.00      1.00      1.00      6203
     running       1.00      1.00      1.00     20485
      stairs       0.91      0.84      0.88      1022
     walking       0.99      0.99      0.99     11062

    accuracy                           0.99     38772
   macro avg       0.97      0.96      0.97     38772
weighted avg       0.99      0.99      0.99     38772

SVC param - C: 10.0 kernel: linear,
              precision    recall  f1-score   support

        idle       1.00      1.00      1.00      6203
     running       1.00      1.00      1.00     20485
      stairs       0.91      0.85      0.88      1022
     walking       0.99      0.99      0.99     11062

    accuracy                           0.99     38772
   macro avg       0.97      0.96      0.97     38772
weighted avg       0.99      0.99      0.99     38772

SVC param - C: 100.0 kernel: linear,
              precisio