Data Preprocessing and Logistic Regression

In [1]:
%matplotlib inline
import matplotlib as mpl;
import matplotlib.pyplot as plt;

<a name="read_user_data"></a>
<b>Reading user data</b>.<br/>
Each user has a separate data file that contains several pieces of the data, mainly features and labels.<br/>
To read the different data pieces for a user, you can use the following functions:

In [68]:
import numpy as np
import gzip
#import StringIO;
import glob
import pandas as pd

def read_all_user_data():
    file_names = glob.glob('*')
    #file_names = os.listdir(".")
    all_files_df = pd.DataFrame()
    for file in file_names: 
        if file != "ExtraSensory_DataProcessing.ipynb":
            df = pd.read_csv(file, compression='gzip')
            all_files_df = all_files_df.append(other=df)
            #break
        
    header_names = all_files_df.columns.values
    timestamps = all_files_df[header_names[1]]
    X = all_files_df[header_names[1:-52]]
    Y = all_files_df[header_names[-52:-1]]
    M = np.isnan(Y); # M is the missing label matrix
    feature_names = header_names[1:52]
    label_names = header_names[-52:-1]
    return X,Y,M,timestamps,feature_names,label_names
                

In [69]:
(X,Y,M,timestamps,feature_names,label_names) = read_all_user_data()

<a name="context_recognition"></a>

Now, we can try the task of <b>context recognition</b>: predicting the context-labels based on the sensor-features.<p/>

We'll use linear models, specifically a logistic-regression classifier, to predict a single binary label.<br/>
We can choose which sensors to use.

In [70]:
#choose sensors to use
def get_sensor_names_from_features(feature_names):
    feat_sensor_names = {'Gyro': [], 'Acc': [], 'Magnet': [], 'WAcc': [], 'Compass': [], 'Loc': [], 'AP': [], 'PS': [], 'LF': []}#np.array([None for feat in feature_names]);
    for (fi,feat) in enumerate(feature_names):
        if feat.startswith('raw_acc'):
            feat_sensor_names['Acc'].append(feat)
            pass;
        elif feat.startswith('proc_gyro'):
            feat_sensor_names['Gyro'].append(feat);
            pass;
        elif feat.startswith('raw_magnet'):
            feat_sensor_names['Magnet'].append(feat);
            pass;
        elif feat.startswith('watch_acceleration'):
            feat_sensor_names['WAcc'].append(feat);
            pass;
        elif feat.startswith('watch_heading'):
            feat_sensor_names['Compass'].append(feat);
            pass;
        elif feat.startswith('location'):
            feat_sensor_names['Loc'].append(feat);
            pass;
        elif feat.startswith('location_quick_features'):
            feat_sensor_names['Loc'].append(feat);
            pass;
        elif feat.startswith('audio_naive'):
            feat_sensor_names['Aud'].append(feat);
            pass;
        elif feat.startswith('audio_properties'):
            feat_sensor_names['AP'].append(feat);
            pass;
        elif feat.startswith('discrete'):
            feat_sensor_names['PS'].append(feat);
            pass;
        elif feat.startswith('lf_measurements'):
            feat_sensor_names['LF'].append(feat);
            pass;
        else:
            raise ValueError("!!! Unsupported feature name: %s" % feat);

        pass;

    return feat_sensor_names;  
feat_sensor_names = get_sensor_names_from_features(feature_names);

In [72]:
#choose sensors to use

In [85]:
import sklearn.linear_model;
import sklearn.metrics as metrics;
from sklearn.model_selection import (train_test_split,KFold)

def get_sensors_to_use(features_to_use, feat_sensor_names):
    sensors_list = []
    for f in features_to_use: 
        sensors_list += feat_sensor_names[f]
    return sensors_list

def train_model(X_train,Y_train,M_train,sensors_to_use, target_label):
    #Logistic Regression
    #normalize features?
    X_train = X_train[sensors_to_use]
    y = Y_train[target_label]
    missing_label_indexes = M_train.index[M_train[target_label] == True].tolist()
    X_train = X_train.drop(index=missing_label_indexes)
    y = y.drop(index=missing_label_indexes)
    
    kf = KFold(n_splits=5, random_state=42)

    for train_index, test_index in kf.split(X_train):
        train_X, test_X = X_train.iloc[train_index], X_train.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]
    
        lr_model = sklearn.linear_model.LogisticRegression(class_weight='balanced')
        lr_model.fit(train_X, train_y)
        y_pred = lr_model.predict(train_X)
        score = metrics.accuracy_score(train_y, y_pred)
        print(f"Accuracy for train data: {score}")
        
        y_pred = lr_model.predict(test_X)
        score = metrics.accuracy_score(test_y, y_pred)
        print(f"Accuracy for test data: {score}")
        
    return lr_model

In [86]:
features = ["Acc", "WAcc"]
sensors_to_use = get_sensors_to_use(features, feat_sensor_names)
target_label = 'label:FIX_walking'
model = train_model(X,Y,M, sensors_to_use, target_label)

Accuracy for train data: 0.8956494325346784
Accuracy for test data: 0.9596977329974811
Accuracy for train data: 0.921840529467381
Accuracy for test data: 0.9104665825977302
Accuracy for train data: 0.8928458871730224
Accuracy for test data: 0.8575031525851198
Accuracy for train data: 0.9234163252442483
Accuracy for test data: 0.8839848675914249
Accuracy for train data: 0.913646391427671
Accuracy for test data: 0.9155107187894073


