In [12]:
import xgboost as xgb
import numpy as np
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.signal import find_peaks
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from xgboost import XGBClassifier, XGBRegressor
import pickle
from sklearn.model_selection import cross_val_score
import random

## Creating the dataset

In [5]:


def feature_extraction_steps(file_path,directory_path):
  df_file = pd.read_csv(os.path.join(directory_path, file_path),header=1,skip_blank_lines=True,skiprows =[0,1,2,3])
  df_file_info = pd.read_csv(os.path.join(directory_path, file_path),header=None,nrows=4,skip_blank_lines=True)
  activity = df_file_info.iloc[2,1].lower()
  activity = 'run' if 'run' in activity else 'walk'
  actucal_steps = df_file_info.iloc[3,1]

  norm = []
  cols = df_file.columns
  for i, row in df_file.iterrows():
    x = float(row[cols[1]])
    y = float(row[cols[2]])
    z = float(row[cols[3]])
    norm.append((x**2 + y**2 + z**2) ** 0.5)
  norm = np.array(norm)
  df_file['norm'] = norm
  features = []
  cols = df_file.columns
  for col_name in [cols[0],cols[3],cols[4]]:
    vals = df_file[col_name].values
    vals_lag = vals-(np.append(vals[1:],0))
    vals_2lag = vals-(np.append(vals[2:],[0,0]))
    vals_3lag = vals-(np.append(vals[3:],[0,0,0]))
    features.append(vals.mean())
    features.append(vals.std())
    features.append(vals.max())
    features.append(vals.min())
    features.append(vals_lag.mean())
    features.append(vals_lag.std())
    features.append(vals_lag.max())
    features.append(vals_lag.min())
    features.append(vals_2lag.mean())
    features.append(vals_2lag.std())
    features.append(vals_2lag.max())
    features.append(vals_2lag.min())
  fft = np.fft.fft(norm)
  time_vals = df_file[cols[0]].values
  dt = time_vals[1]-time_vals[0]
  n = len(norm)
  frequencies = np.fft.fftfreq(n, dt)
  features.append(len(time_vals))
  features.append(frequencies.mean())
  features.append(frequencies.std())
  features.append(frequencies.max())
  features.append(frequencies.min())
  features = np.array(features)
  return features,actucal_steps,activity


def create_steps_dataset(directory_files):
  X_run = []
  y_run = []
  X_walk = []
  y_walk = []
  for path in directory_files:
    try:
      cur_X,cur_y,activity = feature_extraction_steps(path,directory_path)
      if activity == 'run':
        X_run.append(cur_X)
        y_run.append(cur_y)
      else:
        X_walk.append(cur_X)
        y_walk.append(cur_y)
    except Exception as error:
      print(error)
      continue
  X_run = np.stack(X_run)
  y_run = np.array(y_run,dtype=object)
  X_run = np.nan_to_num(X_run, copy=True, nan=0.0, posinf=None, neginf=None)
  X_run = np.array(X_run,dtype=object)

  X_walk = np.stack(X_walk)
  y_walk = np.array(y_walk,dtype=object)
  X_walk = np.nan_to_num(X_walk, copy=True, nan=0.0, posinf=None, neginf=None)
  X_walk = np.array(X_walk,dtype=object)
  return X_run,y_run,X_walk,y_walk



def feature_extraction_activity(file_path,directory_path):
  df_file = pd.read_csv(os.path.join(directory_path, file_path),header=1,skip_blank_lines=True,skiprows =[0,1,2,3])
  df_file_info = pd.read_csv(os.path.join(directory_path, file_path),header=None,nrows=4,skip_blank_lines=True)
  activity = 1 if 'run' in df_file_info.iloc[2,1].lower() else 0

  norm = []
  cols = df_file.columns
  for i, row in df_file.iterrows():
    x = float(row[cols[1]])
    y = float(row[cols[2]])
    z = float(row[cols[3]])
    norm.append((x**2 + y**2 + z**2) ** 0.5)
  norm = np.array(norm)
  df_file['norm'] = norm
  features = []
  cols = df_file.columns
  for col_name in [cols[0],cols[3],cols[4]]:
    vals = df_file[col_name].values
    vals_lag = vals-(np.append(vals[1:],0))
    vals_2lag = vals-(np.append(vals[2:],[0,0]))
    vals_3lag = vals-(np.append(vals[3:],[0,0,0]))
    features.append(vals.mean())
    features.append(vals.std())
    features.append(vals.max())
    features.append(vals.min())
    features.append(vals_lag.mean())
    features.append(vals_lag.std())
    features.append(vals_lag.max())
    features.append(vals_lag.min())
    features.append(vals_2lag.mean())
    features.append(vals_2lag.std())
    features.append(vals_2lag.max())
    features.append(vals_2lag.min())
  fft = np.fft.fft(norm)
  time_vals = df_file[cols[0]].values
  dt = time_vals[1]-time_vals[0]
  n = len(norm)
  frequencies = np.fft.fftfreq(n, dt)
  features.append(len(time_vals))
  features.append(frequencies.mean())
  features.append(frequencies.std())
  features.append(frequencies.max())
  features.append(frequencies.min())
  features = np.array(features)
  return features,activity

def create_activity_dataset(directory_files):
  X = []
  y = []
  for path in directory_files:
    try:
      cur_X,cur_y = feature_extraction_activity(path,directory_path)
      X.append(cur_X)
      y.append(cur_y)
    except Exception as error:
      print(error)
      continue
  X = np.stack(X)
  y = np.array(y)
  X = np.nan_to_num(X, copy=True, nan=0.0, posinf=None, neginf=None)
  return X,y

In [9]:
directory_path = "/content/data"
directory_files = os.listdir(directory_path)

## Choose best Random Forest hyper-parameters and train

In [10]:
X,y = create_activity_dataset(directory_files)

could not convert string to float: 'walk_4_1.csv'


In [16]:
for criterion in ["gini", "entropy", "log_loss"]:
  for max_depth in [20,50,100]:
    for min_samples_split in [2,3,4]:
      clf = RandomForestClassifier(n_estimators=1000, random_state=0,criterion=criterion,max_depth=max_depth,min_samples_split=min_samples_split)
      scores = cross_val_score(clf, X, y, cv=5)
      print(f"Score:{scores.mean()},criterion = {criterion}, max_depth = {max_depth}, min_samples = {min_samples_split}")


Score:0.9486274509803921,criterion = gini, max_depth = 20, min_samples = 2
Score:0.9446274509803922,criterion = gini, max_depth = 20, min_samples = 3
Score:0.9406274509803921,criterion = gini, max_depth = 20, min_samples = 4
Score:0.9486274509803921,criterion = gini, max_depth = 50, min_samples = 2
Score:0.9446274509803922,criterion = gini, max_depth = 50, min_samples = 3
Score:0.9406274509803921,criterion = gini, max_depth = 50, min_samples = 4
Score:0.9486274509803921,criterion = gini, max_depth = 100, min_samples = 2
Score:0.9446274509803922,criterion = gini, max_depth = 100, min_samples = 3
Score:0.9406274509803921,criterion = gini, max_depth = 100, min_samples = 4
Score:0.9485490196078432,criterion = entropy, max_depth = 20, min_samples = 2
Score:0.9485490196078432,criterion = entropy, max_depth = 20, min_samples = 3
Score:0.9485490196078432,criterion = entropy, max_depth = 20, min_samples = 4
Score:0.9485490196078432,criterion = entropy, max_depth = 50, min_samples = 2
Score:0.94

In [17]:
clf = RandomForestClassifier(n_estimators=1000, random_state=0,criterion="gini",max_depth=100,min_samples_split=2)
clf.fit(X, y)
pickle.dump(clf, open('random_forest_model.pkl', 'wb'))
activity_model = pickle.load(open('random_forest_model.pkl', 'rb'))
score = activity_model.score(X,y)
print(score)

1.0


## Choose best XBBRegressor hyper-parameters for walk and train

In [18]:
X_run,y_run,X_walk,y_walk = create_steps_dataset(directory_files)

could not convert string to float: 'walk_4_1.csv'


In [29]:
X,y = X_walk,y_walk
for n_estimators in [1000,1500,2000]:
  for max_depth in [75,100,125,150]:
    for learning_rate in [None,0.004,0.005,0.006,0.007]:
      scores = []
      for i in [1,2,3,4,5]:
        model = XGBRegressor(n_estimators=n_estimators,max_depth= max_depth,learning_rate=learning_rate,booster="gbtree",objective ='reg:squarederror')
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random.randint(3, 15))
        model.fit(X_train, y_train)
        predictions = model.predict(X_test).round()
        scores.append(mean_squared_error(predictions,y_test,squared=False))
      print(f"Score: {np.array(scores).mean()}, n_estimators: {n_estimators},max_depth: {max_depth},learning_rate: {learning_rate}")

Score: 15.306304697557803, n_estimators: 1000,max_depth: 75,learning_rate: None
Score: 17.71984338442169, n_estimators: 1000,max_depth: 75,learning_rate: 0.004
Score: 15.58774958819338, n_estimators: 1000,max_depth: 75,learning_rate: 0.005
Score: 17.46859156950994, n_estimators: 1000,max_depth: 75,learning_rate: 0.006
Score: 15.595789565671675, n_estimators: 1000,max_depth: 75,learning_rate: 0.007
Score: 15.784583654603882, n_estimators: 1000,max_depth: 100,learning_rate: None
Score: 15.245690151119433, n_estimators: 1000,max_depth: 100,learning_rate: 0.004
Score: 17.032198872660743, n_estimators: 1000,max_depth: 100,learning_rate: 0.005
Score: 17.93587178296649, n_estimators: 1000,max_depth: 100,learning_rate: 0.006
Score: 15.656918454553738, n_estimators: 1000,max_depth: 100,learning_rate: 0.007
Score: 18.66464304562607, n_estimators: 1000,max_depth: 125,learning_rate: None
Score: 19.126801346777754, n_estimators: 1000,max_depth: 125,learning_rate: 0.004
Score: 13.821895216576902, n_

In [47]:
# Best score
model = XGBRegressor(n_estimators=1500,max_depth= 100,booster="gbtree",objective ='reg:squarederror')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
model.fit(X_train, y_train)
predictions = model.predict(X_test).round()
print(f"RMSE: {mean_squared_error(predictions,y_test,squared=False)}")

RMSE: 9.97882373213011


In [49]:
X,y = X_walk,y_walk
model = XGBRegressor(n_estimators=1500,max_depth= 100,booster="gbtree",objective ='reg:squarederror')

model.fit(X, y)

pickle.dump(model, open('steps_walk_model.pkl', 'wb'))
walk_model = pickle.load(open('steps_walk_model.pkl', 'rb'))
predictions = walk_model.predict(X).round()
mean_squared_error(predictions,y,squared=False)

0.0

## Choose best XBBRegressor hyper-parameters for run and train

In [50]:
X,y = X_run,y_run
for n_estimators in [1000,1500,2000]:
  for max_depth in [None,75,100,125,150]:
    for learning_rate in [None,0.004,0.005,0.006,0.007]:
      scores = []
      for i in [1,2,3,4,5]:
        model = XGBRegressor(n_estimators=n_estimators,max_depth= max_depth,learning_rate=learning_rate,booster="gbtree",objective ='reg:squarederror')
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random.randint(3, 15))
        model.fit(X_train, y_train)
        predictions = model.predict(X_test).round()
        scores.append(mean_squared_error(predictions,y_test,squared=False))
      print(f"Score: {np.array(scores).mean()}, n_estimators: {n_estimators},max_depth: {max_depth},learning_rate: {learning_rate}")

Score: 13.636031945015912, n_estimators: 1000,max_depth: None,learning_rate: None
Score: 15.595560805482773, n_estimators: 1000,max_depth: None,learning_rate: 0.004
Score: 14.686540113111004, n_estimators: 1000,max_depth: None,learning_rate: 0.005
Score: 14.110869816364783, n_estimators: 1000,max_depth: None,learning_rate: 0.006
Score: 12.475951908765031, n_estimators: 1000,max_depth: None,learning_rate: 0.007
Score: 14.673853523284862, n_estimators: 1000,max_depth: 75,learning_rate: None
Score: 16.269023255360075, n_estimators: 1000,max_depth: 75,learning_rate: 0.004
Score: 12.57642652306238, n_estimators: 1000,max_depth: 75,learning_rate: 0.005
Score: 12.72792696848945, n_estimators: 1000,max_depth: 75,learning_rate: 0.006
Score: 17.092649100140655, n_estimators: 1000,max_depth: 75,learning_rate: 0.007
Score: 16.257272159402035, n_estimators: 1000,max_depth: 100,learning_rate: None
Score: 12.592260715969521, n_estimators: 1000,max_depth: 100,learning_rate: 0.004
Score: 13.73319570406

In [59]:
# Best score
model = XGBRegressor(n_estimators=2000,max_depth= 125,learning_rate=0.005,booster="gbtree",objective ='reg:squarederror')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
model.fit(X_train, y_train)
predictions = model.predict(X_test).round()
print(f"RMSE: {mean_squared_error(predictions,y_test,squared=False)}")

RMSE: 9.772410142846033


In [60]:
X,y = X_run,y_run
model = XGBRegressor(n_estimators=2000,max_depth= 125,booster="gbtree",objective ='reg:squarederror')
model.fit(X, y)
pickle.dump(model, open('steps_run_model.pkl', 'wb'))
run_model = pickle.load(open('steps_run_model.pkl', 'rb'))
predictions = run_model.predict(X).round()
mean_squared_error(predictions,y,squared=False)

0.0