In [1]:
import glob
import pandas as pd
import numpy as np
import time
from IPython.display import display, HTML

#from sklearn.metrics import accuracy_score
#from sklearn.decomposition import PCA
#from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB


In [2]:
def clean_label(label):
    return label.lstrip(',').rstrip(',').replace(',,', ',')

In [18]:
INPUT_FOLDER = '../processed_data/'
headers_metadf = ['trajectory_id', 'start_time', 'end_time', 'v_ave', 'v_med', 'a_ave', 'a_med', 'labels']

list_df_metadata = []

for file in glob.glob(INPUT_FOLDER + "*_metadata.csv"):
    df_metadata = pd.read_csv(file, index_col=0)
    list_df_metadata.append(df_metadata)

df_metadata = pd.concat(list_df_metadata)

df_labeled = df_metadata.dropna(subset=['v_ave','v_med','a_ave', 'a_med', 'labels'])
df_labeled.loc[:,'labels'] = df_labeled['labels'].apply(lambda x: clean_label(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [19]:
all_labels = df_labeled['labels'].unique()
single_modality_labels = [elem for elem in all_labels if ',' not in elem]

df_single_modality = df_labeled[df_labeled['labels'].isin(single_modality_labels)]


print("Total number of trajectories: {}".format(len(df_metadata)))
print("Number of labeled trajectories: {}".format(len(df_labeled)))
print("Number of single modality trajectories: {}".format(len(df_single_modality)))

Total number of trajectories: 18670
Number of labeled trajectories: 4468
Number of single modality trajectories: 2812


In [20]:
mask = np.random.rand(len(df_single_modality)) < 0.7
df_train = df_single_modality[mask]
df_test = df_single_modality[~mask]

print(len(df_train))

2007


In [22]:
X_colnames = ['v_ave','v_med','a_ave', 'a_med']
Y_colnames = ['labels']

X_train = df_train[X_colnames].values
Y_train = np.ravel(df_train[Y_colnames].values)
X_test = df_test[X_colnames].values
Y_test = np.ravel(df_test[Y_colnames].values)

In [23]:
dict_classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Nearest Neighbors": KNeighborsClassifier(),
    "Linear SVM": SVC(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators = 18),
    "Neural Net": MLPClassifier(alpha = 1),
    #"AdaBoost": AdaBoostClassifier(),
    "Naive Bayes": GaussianNB(),
    #"QDA": QuadraticDiscriminantAnalysis() 
}

no_classifiers = len(dict_classifiers.keys())

df_results = pd.DataFrame(data=np.zeros(shape=(no_classifiers,4)), columns = ['classifier', 'train_score', 'test_score', 'training_time'])

count = 0
for key, classifier in dict_classifiers.items():
    t_start = time.clock()
    classifier.fit(X_train, Y_train)
    train_score = classifier.score(X_train, Y_train)
    test_score = classifier.score(X_test, Y_test)
    t_end = time.clock()
    t_diff = t_end - t_start
    df_results.loc[count,'classifier'] = key
    df_results.loc[count,'train_score'] = train_score
    df_results.loc[count,'test_score'] = test_score
    df_results.loc[count,'training_time'] = t_diff
    print("trained {c} in {f:.2f} s".format(c=key, f=t_diff))
    count+=1

display(df_results.sort_values(by='test_score', ascending=False))

trained Logistic Regression in 0.11 s
trained Nearest Neighbors in 0.09 s
trained Linear SVM in 0.34 s
trained Gradient Boosting Classifier in 1.63 s
trained Decision Tree in 0.01 s
trained Random Forest in 0.08 s
trained Neural Net in 0.66 s
trained Naive Bayes in 0.01 s


Unnamed: 0,classifier,train_score,test_score,training_time
5,Random Forest,0.99153,0.730435,0.075782
2,Linear SVM,0.72845,0.71677,0.340654
3,Gradient Boosting Classifier,0.904335,0.71677,1.629625
1,Nearest Neighbors,0.765321,0.700621,0.088311
6,Neural Net,0.672646,0.67205,0.660142
0,Logistic Regression,0.630294,0.643478,0.106787
4,Decision Tree,1.0,0.642236,0.0139
7,Naive Bayes,0.249128,0.267081,0.012185
