In [8]:
# This notebook is an experiment to use the scikit-learn RandomForestClassifier on the 
# "Human Activity Recognition Using Smartphones Data Set" from the UC Irvine Machine Learning Repository.
# See https://archive.ics.uci.edu/ml/datasets/Human+Activity+Recognition+Using+Smartphones .
# The code below is quick and dirty, but the cool thing is that with minimal effort and no parameter
# tuning at all, it was possible to get pretty good initial results. See below.
# To run this notebook, download the dataset from the above URL and expand it. Put the notebook file in
# the top-level directory of the dataset and you're good to go.

In [9]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

In [10]:
# Read feature names into a Dataframe, then extract a Pandas Series
feature_names = pd.read_csv('features.txt', names = ['num', 'name'], delim_whitespace=True)['name']

In [11]:
# Read the training input data into a Dataframe
X_train = pd.read_csv('train/X_train.txt', names = feature_names, delim_whitespace=True)

In [12]:
# Read the training input labels into a Dataframe, then extract values into a NumPy ndarray
y_train = pd.read_table('train/y_train.txt', header=None).values.ravel()

In [13]:
# Create a RandomForestClassifier. Set n_jobs to -1 to use all available cores.
clf = RandomForestClassifier(n_jobs=-1)

In [14]:
# Fit the classifier to the training data
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [15]:
# Read the test input data into a Dataframe
X_test = pd.read_csv('test/X_test.txt', names = feature_names, delim_whitespace=True)

In [16]:
# Read the test input labels into a Dataframe, then extract values into an ndarray
y_test = pd.read_table('test/y_test.txt', header=None).values.ravel()

In [17]:
# Predict results on the test set, as label indices
pred_values = clf.predict(X_test)

In [18]:
# Load the activity labels into a Dataframe, then extract a Pandas Series
activity_labels = pd.read_csv('activity_labels.txt', names = ['num', 'name'], delim_whitespace=True)['name']

In [19]:
# Map the predicted label indices to an ndarray of labels
preds = activity_labels[pred_values].values.ravel()

In [20]:
# Map the known test label indices to an ndarray of labels
y_test_labels = activity_labels[y_test].values.ravel()

In [21]:
# Compare the test predictions with actual known labels, using crosstab
pd.crosstab(y_test_labels, preds, rownames=['actual'], colnames=['preds'])

preds,LAYING,SITTING,STANDING,WALKING_DOWNSTAIRS,WALKING_UPSTAIRS
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LAYING,468,0,64,0,0
SITTING,0,351,0,46,23
STANDING,45,0,446,0,0
WALKING_DOWNSTAIRS,0,11,0,409,51
WALKING_UPSTAIRS,0,12,0,10,474
