In [3]:
# 1. Complete an exploratory analysis with visualizations of the data to come up with some interesting thoughts/patterns and describe your findings about the data.
# 2. Develop a Machine Learning model to classify the activity based on features present in theneckband. What would be the most optimal model to run on the Neckband?
# 3. Share your code through GitHub or any other preferred way on or before the interview date.
# 4. Please put a summary of your exploration, model development and resulting into a brief presentation to share back with the interviewers.

# Setup and Config.

In [4]:
!pip install -r requirements.txt

Collecting datetime
  Using cached DateTime-4.3-py2.py3-none-any.whl (60 kB)
Collecting zope.interface
  Downloading zope.interface-5.4.0-cp36-cp36m-macosx_10_14_x86_64.whl (208 kB)
[K     |████████████████████████████████| 208 kB 1.1 MB/s 
Installing collected packages: zope.interface, datetime
Successfully installed datetime-4.3 zope.interface-5.4.0
You should consider upgrading via the '/Users/pthirukonda1/anaconda/bin/python -m pip install --upgrade pip' command.[0m


In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()
import datetime
pd.set_option('expand_frame_repr', False)

Exploratory Data Analysis.

In [6]:
df = pd.read_csv("../data/IMU.csv", dtype={'activity' : 'category'}, parse_dates=['UnixTime','gps_unixTime'], date_parser=lambda epoch: pd.to_datetime(float(epoch)/1000))

#print(df.head(5))
print("\n\n\nDataframe Info:\n")
print(df.info)
print("\n\n\n DF DTypes:\n")
print(df.dtypes)




Dataframe Info:

<bound method DataFrame.info of                          UnixTime  cumulativeAudio  cumulativeShock   lat_sMins   lon_wMins               gps_unixTime  heading_hundredths  velocity_cm_s  hdop  satellitesUsed  ...  gyro_dps.x  gyro_dps.y  gyro_dps.z  mag_nT.x  mag_nT.y  mag_nT.z  IMUspeed  IMUfSpeed  pitch  activity
0      1970-01-01 00:00:01.532740               19                 5 -27.788683  152.686050 1970-01-01 00:00:01.532740              184.14              0   0.8              10  ...          -1           0          -1    -11686    -11850    -24150        22         10     13         0
1      1970-01-01 00:00:01.532740               19                 5 -27.788683  152.686050 1970-01-01 00:00:01.532740              184.14              0   0.8              10  ...          -1           0          -1    -11686    -11850    -24150        22         10     13         0
2      1970-01-01 00:00:01.532740               19                 5 -27.788683  152.686050 1

In [36]:
# Pandas Profiling FTW!
from pandas_profiling import ProfileReport
profiling_report = ProfileReport(df.sample(n=250000))
profiling_report.to_file(output_file='../analysis_outputs/profiling_report_output_sampled250000.html')
profiling_report = ProfileReport(df.sample(n=100000), minimal=True)
profiling_report.to_file(output_file='../analysis_outputs/profiling_report_output_minimalTrue.html')

Summarize dataset: 100%|██████████| 38/38 [01:58<00:00,  3.13s/it, Completed]
Generate report structure: 100%|██████████| 1/1 [00:11<00:00, 11.31s/it]
Render HTML: 100%|██████████| 1/1 [00:12<00:00, 12.86s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00,  9.60it/s]
Summarize dataset: 100%|██████████| 33/33 [00:00<00:00, 101.13it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:08<00:00,  8.33s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.32it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 339.43it/s]


Machine Learning Model


AutoML

In [9]:
df.drop(['UnixTime', 'gps_unixTime'], axis = 1)

Unnamed: 0,cumulativeAudio,cumulativeShock,lat_sMins,lon_wMins,heading_hundredths,velocity_cm_s,hdop,satellitesUsed,validPosition,accel_mg.x,...,gyro_dps.x,gyro_dps.y,gyro_dps.z,mag_nT.x,mag_nT.y,mag_nT.z,IMUspeed,IMUfSpeed,pitch,activity
0,19,5,-27.788683,152.686050,184.14,0,0.8,10,1,-242,...,-1,0,-1,-11686,-11850,-24150,22,10,13,0
1,19,5,-27.788683,152.686050,184.14,0,0.8,10,1,-250,...,-1,0,-1,-11686,-11850,-24150,22,10,13,0
2,19,5,-27.788683,152.686050,184.14,0,0.8,10,1,-258,...,-1,0,0,-10036,-12900,-24000,22,10,13,0
3,19,5,-27.788683,152.686050,184.14,0,0.8,10,1,-261,...,-1,0,0,-10036,-12900,-24000,22,10,13,0
4,19,5,-27.788683,152.686050,184.14,0,0.8,10,1,-261,...,-1,-1,0,-10036,-12900,-24000,22,10,13,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558395,19,5,-27.788589,152.689301,29.75,0,0.5,22,1,875,...,-1,0,-2,4200,14700,24436,3,13,-45,1
558396,19,5,-27.788589,152.689301,29.75,0,0.5,22,1,834,...,-1,1,-1,4200,14700,24436,3,13,-45,1
558397,19,5,-27.788589,152.689301,29.75,0,0.5,22,1,806,...,-1,1,-1,4200,14700,24436,3,13,-45,1
558398,19,5,-27.788589,152.689301,29.75,0,0.5,22,1,727,...,0,1,-1,5250,13650,24286,3,13,-45,1


In [13]:
# check tpot version
import tpot
print('tpot: %s' % tpot.__version__)



from sklearn.model_selection import RepeatedStratifiedKFold
from tpot import TPOTClassifier
# define dataset
X, y = df.loc[:, df.columns != 'activity'], df[['activity']]
# define model evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search
model = TPOTClassifier(generations=5, population_size=50, cv=cv, scoring='accuracy', verbosity=2, random_state=1, n_jobs=-1)
# perform the search
model.fit(X, y)
# export the best model
model.export('tpot_best_model.py')

tpot: 0.11.7


TypeError: invalid type promotion