FINAL PROJECT: **Activities Recognition**.

**Pattern Recognition & Machine Learning.**

**UPC - MUAR**

Authors: **Javier Pedrosa Alias & Óscar Palacín Domínguez.**

# Initialization

In [None]:
# Mount Google Drive at /content/drive; the dataset is later read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Import all the necessary libraries.

In [None]:
from google.colab import drive
import io
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import time
import statistics as st
from sklearn.metrics import confusion_matrix

# Principal Component Analysis.
from sklearn import decomposition      
from sklearn.preprocessing import StandardScaler  

# Feature selection methods.
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

# Clustering.
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn import mixture

# Probabilistic classifiers.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Random forests.
from sklearn.ensemble import RandomForestClassifier

# Gradient boosting.
from sklearn.ensemble import GradientBoostingClassifier

# Performance evaluation.
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Data handling

Load the CSV dataset into a pandas DataFrame and, if there are any NaNs, replace each one with the mean of that feature over the samples of the same class:

In [None]:
t0 = time.time()

complete_ds = pd.read_csv('/content/drive/MyDrive/PRML/PRML - FP/dataset.csv',sep=",", header=0)
ds_shape = complete_ds.shape

# Eliminate the test samples.
# An explicit copy is taken so that every edit below modifies `ds` itself;
# writing into a slice of `complete_ds` is unreliable (SettingWithCopyWarning)
# and is a silent no-op when pandas copy-on-write is enabled.
mask = complete_ds.label != "test"  # Boolean vector, one entry per data row.
ds = complete_ds.loc[mask].copy()

# Samples without any NaN (kept for reference; not used below).
ds_nonnan = ds.dropna()

# Save the target. `unique()` keeps the order of first appearance, so the
# per-class output below is printed in the same order on every run.
target = ds["label"]
labels = list(target.unique())

# Remove the bookkeeping columns (they are not data, they are checksums to debug),
# together with the time and position statistics.
undesired_headers = ["fields_num", "props_num", "#ref", "Time_max", "Time_min", "Time_mean", "Time_q1", "Time_q2", "Time_q3", "LatitudeDegrees_max", "LatitudeDegrees_min", "LatitudeDegrees_mean", "LatitudeDegrees_q1", "LatitudeDegrees_q2", "LatitudeDegrees_q3", "LongitudeDegrees_max", "LongitudeDegrees_min", "LongitudeDegrees_mean", "LongitudeDegrees_q1", "LongitudeDegrees_q2", "LongitudeDegrees_q3"]
ds = ds.drop(columns=undesired_headers)

# Fill the NaN gaps class by class:
#   * some values missing -> replace them with the mean of the class' known values;
#   * all values missing  -> report it and replace them with 0.
feature_headers = [header for header in ds.columns if header != "label"]
for label in labels:
    class_rows = ds["label"] == label
    print(label)

    for header in feature_headers:
        missing = class_rows & ds[header].isna()
        if not missing.any():
            continue  # Nothing to fill for this class/feature pair.

        known_values = ds.loc[class_rows & ~missing, header]
        if known_values.empty:
            print("All 0 for: " + label + " -> " + header)
            fill_value = 0
        else:
            fill_value = known_values.mean()

        # Single .loc assignment: guaranteed to write into `ds`.
        ds.loc[missing, header] = fill_value

ds.shape

# Dimensionality reduction

Several dimensionality reduction approaches are evaluated. The one that shows the best behavior will be selected.
1. First, it is necessary to split the attributes from the class labels.

In [None]:
# Build the attribute column names as "<signal>_<statistic>".
# The four sensor signals carry all six statistics; the two GPS-derived
# signals only contribute min, mean, q2 and q3.
sensor_signals = ['AltitudeMeters', 'HeartRatebpm', 'Speed', 'RunCadence']
sensor_stats = ['max', 'min', 'mean', 'q1', 'q2', 'q3']
gps_signals = ['gps_speed', 'gps_acc']
gps_stats = ['min', 'mean', 'q2', 'q3']

feature_names = [signal + '_' + stat for signal in sensor_signals for stat in sensor_stats]
feature_names += [signal + '_' + stat for signal in gps_signals for stat in gps_stats]

# Attributes (X) and class labels (y).
X = ds[feature_names]
y = ds['label']

## PCA

Scaling data (normalize) before performing PCA.

In [None]:
# Standardize the attributes with scikit-learn's StandardScaler; XS is the scaled array used for PCA.
XS = StandardScaler().fit_transform(X)
XS.shape

PCA to scaled data.

In [None]:
# Keep every available principal component. The count is derived from the data
# (PCA allows at most min(n_samples, n_features) components) instead of being
# hard-coded, so this cell keeps working if the feature set changes.
n_components = min(XS.shape)
pca = decomposition.PCA(n_components=n_components).fit(XS)

# One value per component, separated by spaces (same layout as before, for any component count).
print('eigenvalues = ' + ' '.join(str(value) for value in pca.explained_variance_) + ' ')
print('Explained variance = ' + ' '.join(str(value) for value in pca.explained_variance_ratio_) + ' ')

Principal components and explained variance


In [None]:
# Cumulative explained variance ratio of the components, expressed as a percentage.
100*pca.explained_variance_ratio_.cumsum()

The cumulative sum of the explained variance ratio should exceed 95%. The eight principal components with the largest eigenvalues capture 95.95% of the variance in the data, so eight components are kept.
Project the data in a space of reduced dimensionality:

In [None]:
# Project the scaled data onto the fitted principal components.
XS_pca = pca.transform(XS)
XS_pca.shape

In [None]:
# Pair plot of the first eight principal components, coloured by the true label.
dfpca = pd.DataFrame(XS_pca[:,0:8],columns=['PCA1', 'PCA2','PCA3', 'PCA4','PCA5', 'PCA6','PCA7','PCA8'])

# `.array` drops the index of `y`, so the labels are assigned by position.
dfpca['label'] = y.array

# sns.pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call is made here (it would only add an empty figure to the output).
ax_pca = sns.pairplot(data=dfpca, hue= 'label')
plt.show()

## Feature selection methods

To know which features are the most relevant in the dataset, different feature selection methods are going to be applied.

### Correlation heatmap

In [None]:
# Pairwise correlations between the features of the dataset.
corrmat = X.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(30,30))

# Plot the heat map, reusing the matrix computed above rather than recomputing it.
g=sns.heatmap(corrmat,annot=True,cmap="RdYlGn")

In [None]:
# Helper used to find highly correlated features. For every pair of columns
# whose absolute correlation exceeds the threshold, the column that appears
# LATER in the dataset is flagged, so the earlier one of each pair is kept.

def correlation(dataset, threshold):
    """Return the set of column names flagged as highly correlated.

    A column is flagged when its absolute correlation with at least one
    column placed before it in ``dataset`` is strictly greater than
    ``threshold``. The first column can therefore never be flagged.
    """
    corr_matrix = dataset.corr()
    # Strictly-lower triangle: entry (i, j) with j < i compares column i
    # against each earlier column j exactly once.
    above_threshold = np.tril(corr_matrix.abs().to_numpy() > threshold, k=-1)
    flagged_rows = above_threshold.any(axis=1)
    return set(corr_matrix.columns[flagged_rows])

In [None]:
# Flag every feature whose absolute correlation with an earlier feature exceeds 0.7,
# then display how many features were flagged.
corr_features = correlation(X, 0.7)
len(corr_features)

In [None]:
# Display the names of the features flagged as highly correlated.
corr_features

In [None]:
# Remove the flagged features; the remaining columns form the reduced dataset.
X_corr = X.drop(columns=list(corr_features))
X_corr.columns

In [None]:
# Report the number of features before and after dropping the highly correlated ones,
# followed by the names of the features that were kept.
print("Original dataset dimension is = {} and the obtained with drop out highly correlated values is = {}.".format(X.columns.size,X_corr.columns.size))
print("The less correlated labels are: {}".format(X_corr.columns))

Normalize the data before using it.

In [None]:
# Standardize the reduced feature set with scikit-learn's StandardScaler.
XS_corr = StandardScaler().fit_transform(X_corr)

Plot the dataset obtained.

In [None]:
# Pair plot of the scaled, correlation-reduced features, coloured by the true label.
# The column names are taken from X_corr (the frame XS_corr was computed from),
# so they stay correct if the correlation threshold or the dataset changes.
dfcorr = pd.DataFrame(XS_corr, columns=X_corr.columns)

# `.array` drops the index of `y`, so the labels are assigned by position.
dfcorr['label'] = y.array

# sns.pairplot creates its own figure; a separate plt.figure() call would only
# add an empty figure to the output.
ax_corr = sns.pairplot(data=dfcorr, hue = 'label')
plt.show()

### K best feature selector (NOT APPLIED!)

F_classif ANOVA F-value between label/feature for classification tasks.

In [None]:
# Keep the 20 features with the highest f_classif score against the labels.
X_f_class = SelectKBest(f_classif, k=20).fit_transform(X, y) 
X_f_class.shape

Mutual_info_classif Mutual information for a discrete target.

In [None]:
# Keep the 20 features with the highest mutual_info_classif score against the labels.
# NOTE(review): no random_state is passed to the scorer, so the selection may vary between runs; confirm.
X_f_mutual_info_classif = SelectKBest(mutual_info_classif, k=20).fit_transform(X, y) 
X_f_mutual_info_classif.shape

The chi-squared selector (`chi2`) cannot be used here because X has negative values!

In [None]:
# X_chi = SelectKBest(chi2, k=1).fit_transform(X, y) 

# Supervised techniques

Uncomment the line that you want to use and comment out the other one:
1. Dataset reduced with PCA.
2. Dataset reduced with the correlation filter.

In [None]:
# 70% train / 30% test split with a fixed random_state.
# Exactly one of the two lines below must be active: it decides which reduced dataset the classifiers use.
# Xtrain, Xtest, ytrain, ytest = train_test_split(XS_pca, y, random_state=0,train_size=0.7)     # PCA
Xtrain, Xtest, ytrain, ytest = train_test_split(XS_corr, y, random_state=0,train_size=0.7)  # Correlation

## Probabilistic classifiers

Five different probabilistic classifiers are considered:
1. Linear Discriminant Analysis (LDA).
2. Quadratic Discriminant Analysis (QDA).
3. Naive Bayes (NB).
4. K-Nearest Neighbours (KNN).
5. Decision Trees (DT).

### LDA

In [None]:
# Train a Linear Discriminant Analysis classifier on the training split.
clf_lda = LinearDiscriminantAnalysis(n_components=7,priors=None)
clf_lda_trained = clf_lda.fit(Xtrain, ytrain)

# Make predictions.
ypred_lda = clf_lda.predict(Xtest)

# Performance evaluation.
as_lda = accuracy_score(ytest, ypred_lda)
cm_lda = confusion_matrix(ytest, ypred_lda) # scikit-learn convention: rows -> true labels | columns -> predicted labels.
cr_lda = classification_report(ytest,ypred_lda)


# Print the metrics.
print('Accuracy score: ')
print(as_lda)
print('Confusion matrix: ')
print(cm_lda)
print('Classification report: ')
print(cr_lda)

# Plot the confusion matrix; each cell is divided by the total number of test samples.
sns.heatmap(cm_lda/np.sum(cm_lda), annot=True, linewidths=0.5, linecolor="green", fmt=".2%")
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.title('LDA')
plt.show()

### QDA





In [None]:
# Train a Quadratic Discriminant Analysis classifier on the training split.
clf_qda = QuadraticDiscriminantAnalysis(priors=None,store_covariance=True)
clf_qda_trained = clf_qda.fit(Xtrain, ytrain)

# Make predictions.
ypred_qda = clf_qda.predict(Xtest)

# Performance evaluation.
as_qda = accuracy_score(ytest, ypred_qda)
cm_qda = confusion_matrix(ytest, ypred_qda) # scikit-learn convention: rows -> true labels | columns -> predicted labels.
cr_qda = classification_report(ytest,ypred_qda)


# Print the metrics.
print('Accuracy score: ')
print(as_qda)
print('Confusion matrix: ')
print(cm_qda)
print('Classification report: ')
print(cr_qda)

# Plot the confusion matrix; each cell is divided by the total number of test samples.
sns.heatmap(cm_qda/np.sum(cm_qda), annot=True, linewidths=0.5, linecolor="green", fmt=".2%")
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.title('QDA')
plt.show()

### NB

In [None]:
# Train a Gaussian Naive Bayes classifier on the training split.
clf_gnb = GaussianNB()
ytrained_gnb = clf_gnb.fit(Xtrain, ytrain)

# Make predictions.
ypred_gnb = clf_gnb.predict(Xtest)

# Performance evaluation.
as_gnb = accuracy_score(ytest, ypred_gnb)
cm_gnb  = confusion_matrix(ytest, ypred_gnb) # scikit-learn convention: rows -> true labels | columns -> predicted labels.
cr_gnb  = classification_report(ytest,ypred_gnb)


# Print the metrics.
print('Accuracy score: ')
print(as_gnb)
print('Confusion matrix: ')
print(cm_gnb)
print('Classification report: ')
print(cr_gnb)

# Plot the confusion matrix; each cell is divided by the total number of test samples.
sns.heatmap(cm_gnb/np.sum(cm_gnb), annot=True, linewidths=0.5, linecolor="green", fmt=".2%")
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.title('Naive Bayes')
plt.show()

### KNN

In [None]:
# Train a 3-nearest-neighbours classifier on the training split.
clf_knn = KNeighborsClassifier(n_neighbors=3)
ytrained_knn = clf_knn.fit(Xtrain, ytrain)

# Make predictions.
ypred_knn = clf_knn.predict(Xtest)

# Performance evaluation.
as_knn = accuracy_score(ytest, ypred_knn)
cm_knn  = confusion_matrix(ytest, ypred_knn) # scikit-learn convention: rows -> true labels | columns -> predicted labels.
cr_knn  = classification_report(ytest,ypred_knn)


# Print the metrics.
print('Accuracy score: ')
print(as_knn)
print('Confusion matrix: ')
print(cm_knn)
print('Classification report: ')
print(cr_knn)

# Plot the confusion matrix; each cell is divided by the total number of test samples.
sns.heatmap(cm_knn/np.sum(cm_knn), annot=True, linewidths=0.5, linecolor="green", fmt=".2%")
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.title('KNN')  # Was mislabelled 'Decision Trees' (copy/paste slip).
plt.show()

### DT

In [None]:
# Train a decision tree classifier (default parameters) on the training split.
clf_dt = tree.DecisionTreeClassifier()
ytrained_dt = clf_dt.fit(Xtrain,ytrain)

# Make predictions.
ypred_dt = clf_dt.predict(Xtest)

# Performance evaluation.
as_dt = accuracy_score(ytest, ypred_dt)
cm_dt  = confusion_matrix(ytest, ypred_dt) # scikit-learn convention: rows -> true labels | columns -> predicted labels.
cr_dt  = classification_report(ytest,ypred_dt)


# Print the metrics.
print('Accuracy score: ')
print(as_dt)
print('Confusion matrix: ')
print(cm_dt)
print('Classification report: ')
print(cr_dt)

# Plot the confusion matrix; each cell is divided by the total number of test samples.
sns.heatmap(cm_dt/np.sum(cm_dt), annot=True, linewidths=0.5, linecolor="green", fmt=".2%")
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.title('Decision Trees')
plt.show()

In [None]:
# Draw the fitted decision tree on a 10x10 figure.
plt.figure(figsize=(10,10))
ax = plot_tree(ytrained_dt) 
plt.show()

## Random Forests (RF)

In [None]:
# Train a random forest (unlimited tree depth, fixed seed) on the training split.
clf_rf = RandomForestClassifier(max_depth=None, random_state=0)
ytrained_rf = clf_rf.fit(Xtrain, ytrain)

# Make predictions.
ypred_rf = clf_rf.predict(Xtest)

# Performance evaluation.
as_rf = accuracy_score(ytest, ypred_rf)
cm_rf = confusion_matrix(ytest, ypred_rf) # scikit-learn convention: rows -> true labels | columns -> predicted labels.
cr_rf = classification_report(ytest,ypred_rf)


# Print the metrics.
print('Accuracy score: ')
print(as_rf)
print('Confusion matrix: ')
print(cm_rf)
print('Classification report: ')
print(cr_rf)

# Plot the confusion matrix; each cell is divided by the total number of test samples.
sns.heatmap(cm_rf/np.sum(cm_rf), annot=True, linewidths=0.5, linecolor="green", fmt=".2%")
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.title('Random forests')
plt.show()

## Gradient Boosting (GB)

In [None]:
# Gradient Boosting Classifier parameters.
nestimator = 100
learningrate = 1

# Training the model.
clf_gb = GradientBoostingClassifier(n_estimators=nestimator, learning_rate=learningrate, max_depth=None, random_state=0)
ytrained_gb = clf_gb.fit(Xtrain, ytrain)

# Make predictions.
ypred_gb = clf_gb.predict(Xtest)

# Performance evaluation.
as_gb = accuracy_score(ytest, ypred_gb)
cm_gb = confusion_matrix(ytest, ypred_gb) # scikit-learn convention: rows -> true labels | columns -> predicted labels.
cr_gb = classification_report(ytest,ypred_gb)


# Print the metrics.
print('Accuracy score: ')
print(as_gb)
print('Confusion matrix: ')
print(cm_gb)
print('Classification report: ')
print(cr_gb)

# Plot the confusion matrix; each cell is divided by the total number of test samples.
sns.heatmap(cm_gb/np.sum(cm_gb), annot=True, linewidths=0.5, linecolor="green", fmt=".2%")
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.title('Gradient Boosting')  # Was mislabelled 'Random forests' (copy/paste slip).
plt.show()

# Performance evaluation

In this section, two different performance evaluations are reported for all the classifiers:
1. Confusion matrix.
2. Classification report.

## Confusion matrix

In [None]:
# Print the confusion matrix of every classifier, each preceded by a heading.
named_confusion_matrices = (
    ("LDA", cm_lda),
    ("QDA", cm_qda),
    ("NB", cm_gnb),
    ("KNN", cm_knn),
    ("DT", cm_dt),
    ("RF", cm_rf),
    ("GB", cm_gb),
)
for short_name, matrix in named_confusion_matrices:
    print('Confusion matrix ' + short_name + ': ')
    print(matrix)

In [None]:
# Parallel lists: classifiers[i] is the short name of the model that produced confusion_matrices[i].
classifiers = ["LDA", "QDA", "GNB", "KNN", "DT", "RF", "GB"]
confusion_matrices = [cm_lda, cm_qda, cm_gnb, cm_knn, cm_dt, cm_rf, cm_gb]

In [None]:
# One heat map per classifier, side by side. The grid is sized from the
# `classifiers` list (3 inches per panel) rather than hard-coding 7 columns;
# squeeze=False keeps `axes` a 2-D array even when there is a single classifier.
n_classifiers = len(classifiers)
fig, axes = plt.subplots(nrows=1, ncols=n_classifiers, figsize=(3 * n_classifiers, 3), squeeze=False)

for cls, cfm, ax in zip(classifiers, confusion_matrices, axes.flatten()):
  # Raw counts: rows are true labels, columns are predicted labels.
  sns.heatmap(cfm, annot=True, linewidths=0.5, linecolor="green", fmt=".0f", ax = ax, cmap= "Blues")
  ax.set_xlabel("y_pred")
  ax.set_ylabel("y_true")
  ax.title.set_text(cls)

plt.tight_layout()
plt.show()

## Classification report

In [None]:
# Print the classification report of every classifier, each preceded by a heading.
# The heading was previously misspelled "Classificaiton".
named_reports = (
    ("LDA", cr_lda),
    ("QDA", cr_qda),
    ("NB", cr_gnb),
    ("KNN", cr_knn),
    ("DT", cr_dt),
    ("RF", cr_rf),
    ("GB", cr_gb),
)
for short_name, report in named_reports:
    print('Classification report ' + short_name + ': ')
    print(report)

Looking at the confusion matrices of the different probabilistic classifiers, we can conclude that LDA shows the best performance.

# Experimentation

In this last section, the performance of the best classifiers is compared for both the PCA and the correlation dimensionality reductions (run only the subsection that matches the split chosen in the "Supervised techniques" section).

## PCA

In the case of the PCA dimensionality reduction, the four best classifiers are the following: KNN, DT, RF and GB.

### KNN

In [None]:
# Pair plot of the first eight columns of the test set, coloured by the KNN predictions.
dfknn = pd.DataFrame(Xtest[:,0:8],columns=['PCA1', 'PCA2','PCA3', 'PCA4','PCA5', 'PCA6','PCA7','PCA8'])

dfknn['label'] = ypred_knn

# sns.pairplot creates its own figure; a separate plt.figure() call would only
# add an empty figure to the output.
ax_knn = sns.pairplot(data=dfknn, hue= 'label')
plt.show()

### DT

In [None]:
# Pair plot of the first eight columns of the test set, coloured by the decision tree predictions.
dfdt = pd.DataFrame(Xtest[:,0:8],columns=['PCA1', 'PCA2','PCA3', 'PCA4','PCA5', 'PCA6','PCA7','PCA8'])

dfdt['label'] = ypred_dt

# sns.pairplot creates its own figure; a separate plt.figure() call would only
# add an empty figure to the output.
ax_dt = sns.pairplot(data=dfdt, hue= 'label')
plt.show()

### RF

In [None]:
# Pair plot of the first eight columns of the test set, coloured by the random forest predictions.
dfrf = pd.DataFrame(Xtest[:,0:8],columns=['PCA1', 'PCA2','PCA3', 'PCA4','PCA5', 'PCA6','PCA7','PCA8'])

dfrf['label'] = ypred_rf

# sns.pairplot creates its own figure; a separate plt.figure() call would only
# add an empty figure to the output.
ax_rf = sns.pairplot(data=dfrf, hue= 'label')
plt.show()

### GB

In [None]:
# Pair plot of the first eight columns of the test set, coloured by the gradient boosting predictions.
dfgb = pd.DataFrame(Xtest[:,0:8],columns=['PCA1', 'PCA2','PCA3', 'PCA4','PCA5', 'PCA6','PCA7','PCA8'])

dfgb['label'] = ypred_gb

# sns.pairplot creates its own figure; a separate plt.figure() call would only
# add an empty figure to the output.
ax_gb = sns.pairplot(data=dfgb, hue= 'label')
plt.show()

## Correlation

In the case of the correlation dimensionality reduction, the four best classifiers are the following: KNN, DT, RF and GB.

### Original

In [None]:
# Pair plot of the test set coloured by the TRUE labels (reference for the prediction plots).
# Column names come from X_corr, the frame the correlation-reduced split was built from.
dftrue = pd.DataFrame(Xtest, columns=X_corr.columns)

# `ytest` is a pandas Series that still carries the shuffled row indices of the
# original dataset, while `dftrue` has a fresh 0..n-1 index. Assigning the Series
# directly would align on index and attach labels to the wrong rows (or NaN), so
# the values are assigned by position instead.
dftrue['label'] = ytest.to_numpy()

# sns.pairplot creates its own figure; a separate plt.figure() call would only
# add an empty figure to the output.
sns.pairplot(data=dftrue, hue = 'label')
plt.show()

In [None]:
# Display the test-set frame together with its label column.
dftrue

### KNN

First it is necessary to convert the numpy predicted array to a pandas dataframe.

In [None]:
# Pair plot of the test set coloured by the KNN predictions.
# Column names come from X_corr, the frame the correlation-reduced split was built from.
dfknn = pd.DataFrame(Xtest, columns=X_corr.columns)

dfknn['label'] = ypred_knn

# sns.pairplot creates its own figure; a separate plt.figure() call would only
# add an empty figure to the output.
ax_knn = sns.pairplot(data=dfknn, hue= 'label')
plt.show()

### DT

In [None]:
# Pair plot of the test set coloured by the decision tree predictions.
# Column names come from X_corr, the frame the correlation-reduced split was built from.
dfdt = pd.DataFrame(Xtest, columns=X_corr.columns)

dfdt['label'] = ypred_dt

# sns.pairplot creates its own figure; a separate plt.figure() call would only
# add an empty figure to the output.
ax_dt = sns.pairplot(data=dfdt, hue = 'label')
plt.show()

### RF

In [None]:
# Pair plot of the test set coloured by the random forest predictions.
# Column names come from X_corr, the frame the correlation-reduced split was built from.
dfrf = pd.DataFrame(Xtest, columns=X_corr.columns)

dfrf['label'] = ypred_rf

# sns.pairplot creates its own figure; a separate plt.figure() call would only
# add an empty figure to the output.
ax_rf = sns.pairplot(data=dfrf, hue = 'label')
plt.show()

In [None]:
# Display the labels predicted by the random forest for the test set.
ypred_rf

### GB

In [None]:
# Pair plot of the test set coloured by the gradient boosting predictions.
# Column names come from X_corr, the frame the correlation-reduced split was built from.
dfgb = pd.DataFrame(Xtest, columns=X_corr.columns)

dfgb['label'] = ypred_gb

# sns.pairplot creates its own figure; a separate plt.figure() call would only
# add an empty figure to the output.
ax_gb = sns.pairplot(data=dfgb, hue = 'label')
plt.show()