In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [2]:
# Load the training data.
# source: https://www.kaggle.com/mboaglio/simplifiedhuarus
# Classification of activities of daily living using smartphone inertial sensors
df = pd.read_csv(filepath_or_buffer="../data/train.csv")

# Preview 5 randomly sampled rows (sample() draws at random; it does not
# return the first 5 rows, and no seed is set, so the rows differ per run).
df.sample(n=5)

Unnamed: 0,rn,activity,tBodyAcc.mean.X,tBodyAcc.mean.Y,tBodyAcc.mean.Z,tBodyAcc.std.X,tBodyAcc.std.Y,tBodyAcc.std.Z,tBodyAcc.mad.X,tBodyAcc.mad.Y,...,fBodyBodyGyroJerkMag.meanFreq,fBodyBodyGyroJerkMag.skewness,fBodyBodyGyroJerkMag.kurtosis,angle.tBodyAccMean.gravity,angle.tBodyAccJerkMean.gravityMean,angle.tBodyGyroMean.gravityMean,angle.tBodyGyroJerkMean.gravityMean,angle.X.gravityMean,angle.Y.gravityMean,angle.Z.gravityMean
1788,5111,SITTING,0.28,-0.0178,-0.108,-0.996,-0.985,-0.983,-0.996,-0.987,...,0.175,-0.28,-0.57,-0.252,0.368,-0.0631,-0.649,-0.316,-0.274,-0.259
1214,3474,SITTING,0.279,-0.0174,-0.112,-0.998,-0.991,-0.988,-0.998,-0.99,...,0.528,-0.779,-0.937,-0.118,-0.151,-0.24,-0.191,-0.892,0.0207,0.00637
1399,3973,STANDING,0.282,-0.0141,-0.108,-0.995,-0.985,-0.986,-0.996,-0.984,...,0.354,-0.574,-0.838,-0.131,-0.628,-0.897,-0.784,-0.653,0.3,0.15
2452,7021,SITTING,0.276,-0.0146,-0.106,-0.996,-0.978,-0.981,-0.996,-0.982,...,0.169,-0.107,-0.429,0.0681,-0.00418,0.0104,0.261,-0.767,0.131,-0.148
1094,3118,SITTING,0.275,-0.0165,-0.0945,-0.989,-0.968,-0.96,-0.99,-0.965,...,0.235,-0.469,-0.784,-0.00887,0.0285,0.692,-0.632,-0.861,0.139,-0.074


In [3]:
# Separate the features (X) from the target variable (y).
# X is every column except "activity"; y is the "activity" label column.
# NOTE(review): the preview shows an `rn` column that looks like a row
# identifier rather than a sensor feature, and it stays in X here -- confirm
# whether it should be excluded before modelling.
X = df.drop("activity", axis="columns")
y = df["activity"]

# Sanity-check the dimensions: target first, then features.
print(y.shape, X.shape, sep="\n")

(3609,)
(3609, 562)


In [4]:
# Hold out 30% of the rows as a test set and train on the remaining 70%.
# This is a single hold-out split (not k-fold cross-validation); the fixed
# random_state makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)

# Confirm the sizes of the two partitions.
print(X_train.shape, X_test.shape, sep="\n")

(2526, 562)
(1083, 562)


In [5]:
# Use a Pipeline to capture every step needed to reach the trained model;
# the algorithm used here is the random forest.
# Any data preparation or cleaning steps go before the model-building step.
#
# Illustrative example only: mean-impute missing values per column (axis=0),
# then classify. It is kept under its own name so that it is not silently
# overwritten by the pipeline that is actually used.
# The legacy Imputer expects the exact string "NaN" as the placeholder for
# np.nan; the earlier "Nan" spelling is not a recognised value.
# NOTE(review): newer scikit-learn releases replace Imputer with
# sklearn.impute.SimpleImputer -- confirm against the installed version.
pipeline_rf_imputation_example = Pipeline([
    ("imputation", Imputer(missing_values="NaN", strategy="mean", axis=0)),
    ("rf_classifier", RandomForestClassifier()),
])

# In this example no data cleaning steps are required, so the working pipeline
# contains only the classifier.
# NOTE(review): no random_state is passed, so results vary between runs.
pipeline_rf = Pipeline([("rf_classifier", RandomForestClassifier())])

In [6]:
# Train the random-forest pipeline on the training partition.
# fit() is the cell's last expression, so the fitted pipeline is displayed.
pipeline_rf.fit(X=X_train, y=y_train)

Pipeline(memory=None,
     steps=[('rf_classifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [7]:
# Predict activity labels for the held-out test features with the fitted pipeline.
y_pred = pipeline_rf.predict(X=X_test)

In [8]:
# Score the pipeline against the held-out "answer key" (y_test);
# the resulting value is shown as the cell output.
pipeline_rf.score(X=X_test, y=y_test)

0.9455216989843028

In [9]:
# Same workflow as above, this time with gradient boosting.

# Build the pipeline and fit it to the training set in one go
# (fit() returns the pipeline itself, so pipeline_gb is the fitted pipeline).
pipeline_gb = Pipeline(
    steps=[("gb_classifier", GradientBoostingClassifier())]
).fit(X_train, y_train)

# Predict on the test set.
# Note: this reuses the name y_pred, replacing the random-forest predictions.
y_pred = pipeline_gb.predict(X=X_test)

# Score on the held-out test set; shown as the cell output.
pipeline_gb.score(X=X_test, y=y_test)

0.9778393351800554