# Hw5

In [1]:
import pandas as pd
from pathlib import Path
from scipy.stats import kurtosis, skew
from os import listdir

## Завантаження даних

In [2]:

base_path = Path("data")

# Each sub-directory of the data folder is one activity class. Only real,
# non-hidden directories are kept so that stray files or tool folders
# (e.g. .DS_Store, .ipynb_checkpoints) are not mistaken for activities, and the
# names are sorted so every later step sees the classes in the same order.
activity_folders = sorted(
    entry.name
    for entry in base_path.iterdir()
    if entry.is_dir() and not entry.name.startswith('.')
)

print('Number of records in each activity folder:\n')
for folder in activity_folders:
    print(f'{folder}: {len(listdir(base_path / folder))}')

Number of records in each activity folder:

idle: 1039
running: 3408
stairs: 165
walking: 1850


## Створення датасету

In [3]:

def load_data_from_folder(folder_path, activity_label):
    """Read every CSV file in a folder into one labelled DataFrame.

    Files are read in sorted path order, so the row order of the result does
    not depend on the order in which the filesystem happens to list them.

    Parameters
    ----------
    folder_path : pathlib.Path
        Directory whose ``*.csv`` files are loaded (non-recursive).
    activity_label : object
        Value written to the ``activity`` column of every loaded row.

    Returns
    -------
    pandas.DataFrame
        Rows of all files stacked on top of each other with a fresh
        0..n-1 index and an ``activity`` column.

    Raises
    ------
    ValueError
        If the folder contains no ``*.csv`` files.
    """
    csv_files = sorted(folder_path.glob("*.csv"))
    if not csv_files:
        # pd.concat would also raise ValueError here, but without saying
        # which folder was the empty one.
        raise ValueError(f"No CSV files found in {folder_path}")

    frames = [
        pd.read_csv(csv_file).assign(activity=activity_label)
        for csv_file in csv_files
    ]
    return pd.concat(frames, ignore_index=True)

# One labelled frame per activity folder, using the folder name as the label.
all_data = [
    load_data_from_folder(base_path / activity, activity)
    for activity in activity_folders
]

# Stack the per-activity frames into a single dataset with a fresh 0..n-1 index.
df = pd.concat(all_data, ignore_index=True)
df.head()

Unnamed: 0,accelerometer_X,accelerometer_Y,accelerometer_Z,activity
0,1.000776,4.616021,8.576031,idle
1,0.718261,4.209007,8.446744,idle
2,-0.909797,-0.282516,9.203311,idle
3,5.09965,0.148441,8.418014,idle
4,1.762132,-0.162806,9.251195,idle


## Додавання time domain features

In [4]:

def add_time_domain_features(df):
    """Summarise a block of accelerometer samples as one flat feature vector.

    For each of the ``accelerometer_X``, ``accelerometer_Y`` and
    ``accelerometer_Z`` columns the mean, standard deviation, minimum,
    maximum, median, range (max - min), kurtosis and skewness are computed.
    Kurtosis and skewness come from ``scipy.stats`` with default settings.

    Parameters
    ----------
    df : pandas.DataFrame
        Samples to summarise; must contain the three accelerometer columns.

    Returns
    -------
    pandas.Series
        24 values indexed ``<statistic>_<axis>``, grouped by axis in the
        order X, Y, Z.
    """
    features = {}

    for axis in ('X', 'Y', 'Z'):
        signal = df[f'accelerometer_{axis}']
        lowest, highest = signal.min(), signal.max()
        features.update({
            f'mean_{axis}': signal.mean(),
            f'std_{axis}': signal.std(),
            f'min_{axis}': lowest,
            f'max_{axis}': highest,
            f'median_{axis}': signal.median(),
            f'range_{axis}': highest - lowest,
            f'kurtosis_{axis}': kurtosis(signal),
            f'skewness_{axis}': skew(signal),
        })

    return pd.Series(features)


# Integer-dividing the 0..n-1 row index by 30 gives every run of 30 consecutive
# rows the same window id; grouping on (activity, window id) then yields one
# feature row per window without mixing rows of different activities.
# NOTE(review): this presumes each source file contributes exactly 30 rows - confirm.
window_id = df.index // 30
grouped = (
    df.groupby(['activity', window_id])
    .apply(add_time_domain_features, include_groups=False)
    .reset_index()
)

grouped.head()


Unnamed: 0,activity,level_1,mean_X,std_X,min_X,max_X,median_X,range_X,kurtosis_X,skewness_X,...,kurtosis_Y,skewness_Y,mean_Z,std_Z,min_Z,max_Z,median_Z,range_Z,kurtosis_Z,skewness_Z
0,idle,0,0.178448,1.036361,-0.909797,5.09965,-0.10295,6.009447,15.400041,3.839216,...,10.086172,3.462503,9.605697,0.406903,8.418014,9.80665,9.770737,1.388636,3.481228,-2.245364
1,idle,1,-0.098641,0.125848,-0.320823,0.407014,-0.114922,0.727837,7.508929,2.184006,...,3.235176,0.539408,9.771216,0.025419,9.667787,9.80665,9.77792,0.138863,7.285421,-2.287712
2,idle,2,-0.099918,0.015642,-0.124498,-0.062249,-0.100556,0.062249,-0.072724,0.573616,...,1.269381,-1.208369,9.768503,0.014528,9.739613,9.792285,9.768343,0.052672,-0.75264,-0.092533
3,idle,3,0.400949,0.052889,0.268151,0.560243,0.392649,0.292092,2.061311,0.381131,...,2.331288,-0.814327,9.752541,0.017967,9.725247,9.797073,9.751583,0.071826,-0.150002,0.537257
4,idle,4,0.400949,0.053187,0.268151,0.560243,0.402226,0.292092,1.953499,0.367899,...,2.68506,-0.950313,9.753659,0.018902,9.725247,9.797073,9.751583,0.071826,-0.583786,0.430159


## Розбиття даних на тренувальну та тестову вибірки

In [5]:
from sklearn.model_selection import train_test_split

# 'activity' is the target and 'level_1' is only the window id produced by the
# groupby, so neither belongs in the feature matrix.
non_feature_columns = ['activity', 'level_1']
X = grouped.drop(columns=non_feature_columns)
y = grouped['activity']

# 70/30 split, stratified on the label so each class keeps its share in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)


## SVM модель

In [6]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Linear-kernel SVM trained on the raw (unscaled) window features.
# NOTE(review): the features live on different scales; consider standardising
# them before the SVM and re-checking the report.
svm_model = SVC(kernel='linear').fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

svm_report = classification_report(y_test, y_pred_svm)
print("SVM Classification Report:\n", svm_report)

SVM Classification Report:
               precision    recall  f1-score   support

        idle       1.00      1.00      1.00       312
     running       1.00      1.00      1.00      1023
      stairs       0.73      0.90      0.81        49
     walking       0.99      0.97      0.98       555

    accuracy                           0.99      1939
   macro avg       0.93      0.97      0.95      1939
weighted avg       0.99      0.99      0.99      1939



## RandomForest модель

In [7]:
from sklearn.ensemble import RandomForestClassifier

# 100-tree forest with a fixed seed so the fitted model and its report are repeatable.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

rf_report = classification_report(y_test, y_pred_rf)
print("RandomForest Classification Report:\n", rf_report)


RandomForest Classification Report:
               precision    recall  f1-score   support

        idle       1.00      1.00      1.00       312
     running       1.00      1.00      1.00      1023
      stairs       1.00      0.94      0.97        49
     walking       0.99      1.00      1.00       555

    accuracy                           1.00      1939
   macro avg       1.00      0.98      0.99      1939
weighted avg       1.00      1.00      1.00      1939


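## Висновки

Загалом обидві моделі показали хороші результати при визначенні активностей idle та running (F1 = 1.00 в обох випадках). При визначенні активностей stairs та walking модель RandomForest виявилась ефективнішою: для stairs F1 становить 0.97 проти 0.81 у SVM, для walking — 1.00 проти 0.98. Клас stairs найменш представлений (49 прикладів у тестовій вибірці), тому саме на ньому різниця між моделями найпомітніша.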