# Anomaly Detection

This notebook explores outliers in a dataset of gym users.

## Imports

In [129]:
import pandas as pd
import altair as alt
import numpy as np
from scipy.stats import zscore

## Data Sourcing

In [130]:
users = pd.read_csv("gym_unclean.csv")

In [131]:
users.head()

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Avg_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,BMI
0,53,Female,73.5,1.55,153,1.17,806.0,HIIT,30.59
1,47,Female,43.4,1.62,132,0.51,303.0,Cardio,16.54
2,18,Female,64.8,1.53,141,1.98,1396.0,Ykoga,27.68
3,37,Female,55.6,1.57,123,0.97,597.0,Cardio,22.56
4,40,Male,68.9,1.99,149,1.16,951.0,,17.4


## Pre-Work questions

1. What are the oldest and youngest ages in the dataset?

In [132]:
users["Age"].min(), users["Age"].max()

(np.int64(-168), np.int64(248))

2. What is the average weight?

In [133]:
print(users["Weight (kg)"].median())  # Less sensitive to outliers
print(users["Weight (kg)"].mean())
print(users["Weight (kg)"].mode())  # Most common

69.80000000000001
131.1746223564955
0    57.7
Name: Weight (kg), dtype: float64


3. How many rows are in the data?

In [134]:
users.shape[0]

1324

4. What is the second largest BMI in the dataset?

In [135]:
users["BMI"].sort_values(ascending=False).nlargest(2)

1122    49.84
292     48.43
Name: BMI, dtype: float64

## Anomaly Detection

In [136]:
users.describe()

Unnamed: 0,Age,Weight (kg),Height (m),Avg_BPM,Session_Duration (hours),Calories_Burned,BMI
count,1324.0,1324.0,1324.0,1324.0,1324.0,1129.0,1324.0
mean,38.486405,131.174622,1.723165,143.861782,1.332509,908.844996,24.868104
std,22.60253,679.087829,0.129968,14.356799,1.051352,270.07668,6.587098
min,-168.0,40.0,1.5,120.0,0.024549,303.0,12.32
25%,28.0,58.2,1.62,132.0,1.03,723.0,20.19
50%,39.0,69.8,1.71,143.0,1.26,895.0,24.175
75%,50.0,86.2,1.81,157.0,1.47,1081.0,28.49
max,248.0,10790.0,2.0,169.0,26.453492,1783.0,49.84


In [137]:
a = [1, 3.5, 1, 1, 1]
print(np.mean(a))
print(np.std(a))
print(zscore(a))

1.5
1.0
[-0.5  2.  -0.5 -0.5 -0.5]


In [138]:
zscore([1, 1, 2, 1])

array([-0.57735027, -0.57735027,  1.73205081, -0.57735027])

In [139]:
users["Age"].quantile(0.25)

np.float64(28.0)

## Anomaly Detection

In [140]:
users.describe()

Unnamed: 0,Age,Weight (kg),Height (m),Avg_BPM,Session_Duration (hours),Calories_Burned,BMI
count,1324.0,1324.0,1324.0,1324.0,1324.0,1129.0,1324.0
mean,38.486405,131.174622,1.723165,143.861782,1.332509,908.844996,24.868104
std,22.60253,679.087829,0.129968,14.356799,1.051352,270.07668,6.587098
min,-168.0,40.0,1.5,120.0,0.024549,303.0,12.32
25%,28.0,58.2,1.62,132.0,1.03,723.0,20.19
50%,39.0,69.8,1.71,143.0,1.26,895.0,24.175
75%,50.0,86.2,1.81,157.0,1.47,1081.0,28.49
max,248.0,10790.0,2.0,169.0,26.453492,1783.0,49.84


Quantile: a division of the data with an equal number of data points to comparable  
Standard deviation (std) : how far stuff is spread away from the mean

In [141]:
users[(users["Age"] > users["Age"].quantile(0.05)) & (users["Age"] > users["Age"].quantile(0.95))]

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Avg_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,BMI
12,58,Female,56.5,1.70,122,0.74,406.0,,19.55
16,59,Male,85.5,1.67,144,1.99,1418.0,Cradio,30.66
32,58,Male,82.2,1.87,143,1.57,1111.0,HIIT,23.51
61,248,Male,87.5,1.88,134,1.19,789.0,Yoga,24.76
75,58,Female,63.0,1.78,154,1.58,1095.0,HIIT,19.88
...,...,...,...,...,...,...,...,...,...
1214,59,Female,42.1,1.65,132,1.22,725.0,Strength,15.46
1239,58,Male,74.6,1.75,137,1.09,739.0,HIIT,24.36
1250,157,Female,78.0,1.68,135,1.29,871.0,HIIT,27.64
1258,58,Female,65.2,1.63,121,1.12,610.0,HIIT,24.54


In [142]:
users["age_z"] = zscore(users["Age"])

In [143]:
users[users["age_z"] > 3]

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Avg_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,BMI,age_z
61,248,Male,87.5,1.88,134,1.19,789.0,Yoga,24.76,9.272978
407,140,Female,46.6,1.5,165,1.38,1025.0,Strength,20.71,4.492946
539,226,Male,64.5,1.79,130,1.17,753.0,Yoga,20.13,8.299268
595,204,Female,69.6,1.58,135,1.49,1006.0,Cardio,27.88,7.325558
604,172,Female,59.1,1.72,148,1.07,792.0,Cardio,19.98,5.909252
676,203,Male,82.5,1.79,139,1.69,1163.0,Cardio,25.75,7.281298
856,110,Male,87.6,1.93,139,1.72,1315.0,HIIT,23.52,3.16516
1034,211,Male,47.1,1.78,149,1.26,929.0,Yoga,14.87,7.635375
1250,157,Female,78.0,1.68,135,1.29,871.0,HIIT,27.64,5.245359


## Visual

In [144]:
alt.Chart(users).mark_boxplot().encode(
    x="Age",
    y="Gender",
    tooltip=["Age", "Gender"]
).properties(
    height=200,
    width=500
)

In [145]:
alt.Chart(users).mark_point().encode(
    x=alt.X("Height (m)").scale(zero=False),
    y="Weight (kg)"
)

## Cleaning

In [146]:
users = users[(users["Age"] > 0) & (users["Age"] < 117)]
users = users[zscore(users["Weight (kg)"]) < 3]
users = users[zscore(users["Height (m)"]) < 3]

In [147]:
users

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Avg_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,BMI,age_z
0,53,Female,73.5,1.55,153,1.17,806.0,HIIT,30.59,0.642365
1,47,Female,43.4,1.62,132,0.51,303.0,Cardio,16.54,0.376808
2,18,Female,64.8,1.53,141,1.98,1396.0,Ykoga,27.68,-0.906719
3,37,Female,55.6,1.57,123,0.97,597.0,Cardio,22.56,-0.065788
4,40,Male,68.9,1.99,149,1.16,951.0,,17.40,0.066991
...,...,...,...,...,...,...,...,...,...,...
1319,20,Male,55.0,1.60,168,1.12,1035.0,,21.48,-0.818200
1320,31,Male,86.6,1.76,151,1.29,1071.0,Strength,27.96,-0.331345
1321,18,Female,72.7,1.55,125,0.84,525.0,,30.26,-0.906719
1322,59,Female,70.7,1.68,121,1.17,637.0,Cardio,25.05,0.907923


In [148]:
alt.Chart(users).mark_point().encode(
    x=alt.X("Height (m)").scale(zero=False),
    y="Weight (kg)"
)

In [151]:
alt.Chart(users).mark_boxplot().encode(
    x="Calories_Burned",
    y="Gender",
).properties(
    height=200,
    width=500
)

In [155]:
alt.Chart(users).mark_point().encode(
    x="Calories_Burned",
    y="Session_Duration (hours)",
    color="Workout_Type",
    tooltip=["Calories_Burned", "Session_Duration (hours)", "Workout_Type"]
)