In [60]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [61]:
# Load the training data.
# Source: https://www.kaggle.com/mboaglio/simplifiedhuarus
# (classification of activities of daily living using smartphone inertial sensors)
train_csv = "../data/train.csv"
df = pd.read_csv(train_csv)

# Preview 5 randomly sampled rows (not the first 5). No random_state is
# passed to sample(), so a re-run may show different rows.
df.sample(5)

Unnamed: 0,rn,activity,tBodyAcc.mean.X,tBodyAcc.mean.Y,tBodyAcc.mean.Z,tBodyAcc.std.X,tBodyAcc.std.Y,tBodyAcc.std.Z,tBodyAcc.mad.X,tBodyAcc.mad.Y,...,fBodyBodyGyroJerkMag.meanFreq,fBodyBodyGyroJerkMag.skewness,fBodyBodyGyroJerkMag.kurtosis,angle.tBodyAccMean.gravity,angle.tBodyAccJerkMean.gravityMean,angle.tBodyGyroMean.gravityMean,angle.tBodyGyroJerkMean.gravityMean,angle.X.gravityMean,angle.Y.gravityMean,angle.Z.gravityMean
2366,6761,WALKING_DOWNSTAIRS,0.316,-0.0481,-0.115,0.124,-0.105,-0.155,0.0759,-0.222,...,-0.0387,-0.484,-0.807,-0.288,0.919,0.938,-0.607,-0.853,0.123,-0.0862
133,409,LAYING,0.289,-0.0169,-0.109,-0.993,-0.976,-0.99,-0.993,-0.977,...,0.252,-0.544,-0.84,-0.0367,-0.0164,0.165,0.595,0.485,-0.531,-0.467
1220,3488,SITTING,0.281,-0.0123,-0.0966,-0.998,-0.982,-0.988,-0.998,-0.981,...,0.446,-0.468,-0.827,0.0105,-0.0268,-0.131,-0.0211,-0.857,0.166,0.0931
1605,4578,SITTING,0.284,-0.0202,-0.113,-0.991,-0.986,-0.969,-0.991,-0.984,...,0.252,-0.455,-0.782,-0.0599,0.121,0.0296,0.0334,-0.516,-0.095,-0.261
515,1456,WALKING_UPSTAIRS,0.33,-0.0361,-0.118,-0.383,-0.413,-0.203,-0.42,-0.432,...,-0.0803,-0.332,-0.734,-0.618,0.519,-0.401,-0.605,-0.478,0.203,0.375


In [62]:
# Target (y) is the activity label; features (X) are all remaining columns.
y = df["activity"]
X = df.drop("activity", axis="columns")
# NOTE(review): only "activity" is dropped, so the `rn` column stays in X.
# It looks like a row identifier rather than a sensor feature - confirm
# whether it should be excluded before modelling.

# Report the dimensions of the target and of the feature matrix, in that order.
for part in (y, X):
    print(part.shape)

(3609,)
(3609, 562)


In [63]:
# Hold out 30% of the rows as a test set and train on the remaining 70%
# (a single hold-out split). random_state is pinned so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)

# Report the dimensions of the training and testing feature sets, in that order.
for features in (X_train, X_test):
    print(features.shape)

(2526, 562)
(1083, 562)


In [64]:
# A pipeline captures every step needed to get from the input features to the
# trained model. The algorithm used here is the random forest.
#
# Any data preparation or cleaning steps go before the model-building step.
# For example, a mean-imputation step would be added like this. The example is
# bound to its own name so that it is not silently overwritten by the pipeline
# that is actually fitted below.
# Imputer expects an integer or the exact string "NaN" as the missing-value
# marker; any other string makes fit() fail.
# NOTE: Imputer belongs to older scikit-learn releases; from 0.20 onwards the
# replacement is sklearn.impute.SimpleImputer.
pipeline_rf_with_imputation = Pipeline([
    ("imputation", Imputer(missing_values="NaN",
                           strategy="mean",
                           axis=0)),
    ("rf_classifier", RandomForestClassifier()),
])

# In this example no data cleaning steps are required, so the pipeline used in
# the following cells contains only the classifier.
pipeline_rf = Pipeline([("rf_classifier", RandomForestClassifier())])

In [65]:
# Fit the pipeline to the training set (X_train and y_train).
# The call is the cell's last expression, so the notebook displays the
# pipeline's repr (its steps and their hyperparameters) as the output.
pipeline_rf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('rf_classifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [66]:
# Use the fitted pipeline to predict labels for the testing set (X_test).
# NOTE(review): in the visible part of the notebook y_pred is never read
# before the gradient boosting cell rebinds it.
y_pred = pipeline_rf.predict(X_test)

In [67]:
# Check accuracy against the "answer key" (y_test).
# score() receives X_test directly, so y_pred is not used here.
# scikit-learn documents a classifier's score() as the mean accuracy.
pipeline_rf.score(X_test, y_test)

0.95106186518928904

In [68]:
# Same workflow as above, this time with a gradient boosting classifier.

# Build a single-step pipeline around the classifier.
gb_steps = [("gb_classifier", GradientBoostingClassifier())]
pipeline_gb = Pipeline(gb_steps)

# Train on the training split.
pipeline_gb.fit(X_train, y_train)

# Predict labels for the testing split. This rebinds y_pred, replacing the
# random forest predictions stored under the same name.
y_pred = pipeline_gb.predict(X_test)

# Accuracy on the testing split; shown as the cell's output.
pipeline_gb.score(X_test, y_test)

0.97691597414589104