## Importing Required Modules and Loading the Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from pprint import pprint
import seaborn as sns
# Read the sensor recordings from the CSV next to the notebook,
# then preview the first five rows (head() defaults to n=5).
df = pd.read_csv('./data.csv')
df.head()

#### The dataset has features named like alx, gry etc. These can be read as
a: accelerometer<br>
g: gyroscope<br>
l: left ankle<br>
r: right wrist<br>
x: digital value of the analog movement captured in the x-direction<br>
y: digital value of the analog movement captured in the y-direction<br>
z: digital value of the analog movement captured in the z-direction

#### Subject refers to the person from whom the data was captured. There are a total of 10 subjects in this dataset

###  Activity is a Categorical Feature, following are its corresponding categories
0 indicates None<br>
    1 indicates Standing still<br>
    2 indicates Sitting and relaxing<br>
    3 indicates Lying down<br>
    4 indicates Walking<br>
    5 indicates Climbing stairs<br>
    6 indicates Waist bends forward <br>
    7 indicates Frontal elevation of arms <br>
    8 indicates Knees bending (crouching) <br>
    9 indicates Cycling<br>
    10 indicates Jogging<br>
    11 indicates Running<br>
    12 indicates Jump front & back 

In [None]:
# Print the frame's column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

#### List of Number of NaN/Null values in each column

In [None]:
# Per-column count of missing (NaN/None) entries.
df.isnull().sum()

## Dataset Analysis

### Bar Plot describing the Number of values pertaining to each activity

In [None]:
# Class-frequency bar chart: one bar per Activity label.
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=df, x='Activity', ax=ax)
plt.show()

## Visualising the Target Label - Activity

In [None]:
# Share of rows per Activity label, expressed as a percentage rounded to 2 decimals.
plt.figure(figsize=(12, 8))
activity_share = df["Activity"].value_counts().div(df.shape[0]).mul(100).round(2)
activity_share.plot.pie(autopct='%2.1f%%')

### From the above, it is clearly observed that the "Activity" label is heavily dominated by class 0 (None), i.e. the dataset is strongly imbalanced.

In [None]:
# Absolute number of rows per Activity label, most frequent first.
df['Activity'].value_counts()

In [None]:
# Violin plot of the Activity label values (shows where the label ids concentrate).
sns.violinplot(data=df['Activity'])

### Re-Sampling of Activity Column to bring uniformity for all categories of activities

In [None]:
# Down-sample the dominant class (Activity == 0) and keep every other row.
MAX_ACTIVITY_0_ROWS = 45000  # upper bound on retained Activity-0 rows

activity_0 = df[df['Activity'] == 0]
activity_non_zero = df[df['Activity'] != 0]

# Clamp the sample size: DataFrame.sample raises ValueError when n exceeds the
# number of available rows (sampling without replacement).
activity_0 = activity_0.sample(
    n=min(MAX_ACTIVITY_0_ROWS, len(activity_0)),
    random_state=7,  # fixed seed so the same rows are drawn on every run
)
df = pd.concat([activity_0, activity_non_zero])

In [None]:
# Rows per Activity label in the current `df`, most frequent first.
df['Activity'].value_counts()

In [None]:
# Share of rows per Activity label, expressed as a percentage rounded to 2 decimals.
plt.figure(figsize=(12, 8))
activity_share = df["Activity"].value_counts().div(df.shape[0]).mul(100).round(2)
activity_share.plot.pie(autopct='%2.1f%%')

### As a result of down-sampling Activity 0, the categories of the Activity feature are now much more evenly represented

### Correlation Heatmap of all the features in the dataset

In [None]:
import seaborn as sns
plt.figure(figsize=(15, 15))
# Correlate numeric columns only. On pandas >= 2.0, DataFrame.corr() raises if the
# frame contains a non-numeric column (presumably `subject` holds string ids --
# TODO confirm); selecting numeric dtypes works on every pandas version.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

## Histogram for each column to understand the distribution of values

In [None]:
# One histogram per column, each rendered as its own figure titled with the column name.
for column_name in df.columns:
    ax = sns.histplot(df[column_name])
    ax.set_title(column_name)
    plt.show()

### Plots that visualize the sensor data in a more interactive way

In [None]:
import plotly.graph_objects as go

# Human-readable label for each Activity id (list index == Activity id).
activity = [
    "None",
    "Standing_still",
    "Sitting_and_relaxing",
    "Lying_down",
    "Walking",
    "Climbing_stairs",
    "Waist_bends_forward",
    "Frontal_elevation_of_arms",
    "Knees_bending_(crouching)",
    "Cycling",
    "Jogging",
    "Running",
    "Jump_front_&_back",
]
sensors = ['accelerometer', 'gyro']

# Column names are built as <sensor initial><placement><axis>, e.g. 'alx'.
placement_titles = {'l': 'Left ankle sensor', 'r': 'Right wrist sensor'}
axis_colours = {'x': 'red', 'y': 'green', 'z': 'blue'}

for i in range(1, 13):
    # Filter and re-index once per activity instead of once per trace.
    activity_rows = df[df['Activity'] == i].reset_index(drop=True)
    for s in sensors:
        print(activity[i] + " - " + s)
        # One figure per placement (ankle, then wrist), three axis traces each.
        for placement, title in placement_titles.items():
            fig = go.Figure()
            for axis, colour in axis_colours.items():
                column = s[0] + placement + axis
                fig.add_trace(go.Scatter(
                    x=activity_rows.index,
                    y=activity_rows[column],
                    name=column,
                    line=dict(color=colour, width=2),
                ))
            fig.update_layout(title_text=title)
            fig.show()


#### From the above plots, activities involving high movement of the leg/wrist show spikes and drops in the measurements, which is a clear indication of rigorous movement.

### Understanding the Distribution of Each of the activity data recorded by the sensor using Histograms

In [None]:

import plotly.subplots as sp

# Build one two-panel figure per (activity, sensor): left panel = ankle columns,
# right panel = wrist columns, with x/y/z drawn in red/green/blue.
placement_columns = (('l', 1), ('r', 2))  # (placement letter, subplot column)
axis_colours = (('x', 'red'), ('y', 'green'), ('z', 'blue'))

figs = []
for i in range(1, 13):
    # Filter once per activity; every trace below reads from this subset.
    activity_rows = df[df['Activity'] == i].reset_index(drop=True)
    for s in sensors:
        fig = sp.make_subplots(
            rows=1,
            cols=2,
            subplot_titles=[
                "Left ankle sensor ({})".format(s),
                "Right wrist sensor ({})".format(s),
            ],
            shared_yaxes=True,
        )
        for placement, subplot_col in placement_columns:
            for axis, colour in axis_colours:
                column = s[0] + placement + axis
                fig.add_trace(
                    go.Histogram(
                        x=activity_rows[column],
                        histnorm='probability density',
                        name=column,
                        marker_color=colour,
                    ),
                    row=1, col=subplot_col,
                )
        fig.update_layout(title_text=activity[i])
        figs.append(fig)

# Render after construction so all figures appear together at the end of the cell.
for fig in figs:
    fig.show()


#### Most of the data seems to have a near to normal distribution (bell-curve), which is really advantageous for developing an efficient model to predict the human activity

## Separating Independent (Features) and Dependent (Target) Variables from the DataFrame

In [None]:
# Move the label to the last column under the name `Activity_` and discard the
# `subject` identifier so only sensor features plus the label remain.
# Written so the cell can be re-run: the original in-place drop raised KeyError
# on a second execution because the columns were already gone.
if 'Activity' in df.columns:
    df = df.assign(Activity_=df['Activity']).drop(columns=['Activity'])
df = df.drop(columns=['subject'], errors='ignore')

In [None]:
# Label vector is the `Activity_` column; features are every column except the last one.
y = df['Activity_']
x = df.iloc[:, :-1]

In [None]:
import numpy as np
# Convert both the feature frame and the label series to plain NumPy arrays.
x, y = np.array(x), np.array(y)

### Performing Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=7,
)

### Scaling using Standard Scaler

In [None]:
from sklearn.preprocessing import StandardScaler    
# Learn mean/variance from the training split only, then apply the same
# transformation to both splits so no test-set statistics leak into training.
s = StandardScaler()
s.fit(x_train)
x_train = s.transform(x_train)
x_test = s.transform(x_test)

### Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with library-default hyper-parameters, fitted on the scaled training split.
l = LogisticRegression()
l.fit(X=x_train, y=y_train)

In [None]:
# Mean accuracy of the logistic-regression model on the held-out split.
l.score(x_test,y_test)

In [None]:
# classification_report returns a pre-formatted multi-line string; print() renders
# the table, whereas pprint() would show the string's repr with escaped newlines.
print(classification_report(y_test, l.predict(x_test)))

### KNN Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours with k=5 and Minkowski power p=2, fitted on the scaled training split.
classifier = KNeighborsClassifier(n_neighbors=5, p=2)
classifier.fit(X=x_train, y=y_train)

In [None]:
# Mean accuracy of the KNN model on the held-out split.
classifier.score(x_test,y_test)

In [None]:
from pprint import pprint
# classification_report returns a pre-formatted multi-line string; print() renders
# the table, whereas pprint() would show the string's repr with escaped newlines.
print(classification_report(y_test, classifier.predict(x_test)))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seed the forest so scores and feature importances are reproducible across runs,
# consistent with the random_state=7 used for sampling and the train/test split.
m = RandomForestClassifier(random_state=7)

In [None]:
# Fit the random forest on the scaled training split.
m.fit(x_train,y_train)

In [None]:
# Mean accuracy of the random-forest model on the held-out split.
m.score(x_test,y_test)

In [None]:
# classification_report returns a pre-formatted multi-line string; print() renders
# the table, whereas pprint() would show the string's repr with escaped newlines.
print(classification_report(y_test, m.predict(x_test)))

### Random Forest Feature Importance (impurity-based: mean decrease in impurity contributed by each feature)

In [None]:
# Bar chart of the forest's per-feature importances; the labels are every
# column of `df` except the last one.
feature_names = df.columns[:-1].tolist()
f_imp = abs(m.feature_importances_)
sns.barplot(x=feature_names, y=f_imp)