In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('df_download.csv')

In [3]:
display(df)

Unnamed: 0,Session_ID,Date,Activity,Heart_Rate_BPM,Duration_Min,Calories
0,500,15-Jan-2026,Gym,69.0,42.0,466.0
1,501,2026/01/15,Yoga,160.0,33.0,334.0
2,502,2026-01-15,Swim,75.0,37.0,516.0
3,503,2026-01-15,Run,124.0,38.0,437.0
4,504,2026-01-15,Cycle,173.0,26.0,564.0
5,505,2026-01-15,Ggym,88.0,43.0,141.0
6,506,2026-01-15,YOGA,149.0,89.0,190.0
7,507,2026-01-15,swim,153.0,24.0,368.0
8,508,2026-01-15,Run,172.0,30.0,598.0
9,509,2026-01-15,cykle,89.0,33.0,386.0


In [4]:
display(df.head())

Unnamed: 0,Session_ID,Date,Activity,Heart_Rate_BPM,Duration_Min,Calories
0,500,15-Jan-2026,Gym,69.0,42.0,466.0
1,501,2026/01/15,Yoga,160.0,33.0,334.0
2,502,2026-01-15,Swim,75.0,37.0,516.0
3,503,2026-01-15,Run,124.0,38.0,437.0
4,504,2026-01-15,Cycle,173.0,26.0,564.0


In [5]:
display(df.tail())

Unnamed: 0,Session_ID,Date,Activity,Heart_Rate_BPM,Duration_Min,Calories
48,548,2026-01-15,Run,117.0,65.0,-150.0
49,549,2026-01-15,Cycle,96.0,68.0,640.0
50,502,2026-01-15,Swim,75.0,37.0,516.0
51,502,2026-01-15,Swim,75.0,37.0,516.0
52,549,2026-01-15,Cycle,96.0,68.0,640.0


In [6]:
# list columns
print(list(df.columns))

['Session_ID', 'Date', 'Activity', 'Heart_Rate_BPM', 'Duration_Min', 'Calories']


In [7]:
# data type information
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Session_ID      53 non-null     int64  
 1   Date            53 non-null     str    
 2   Activity        53 non-null     str    
 3   Heart_Rate_BPM  48 non-null     float64
 4   Duration_Min    53 non-null     float64
 5   Calories        50 non-null     float64
dtypes: float64(3), int64(1), str(2)
memory usage: 2.6 KB


In [8]:
# show data and identify inconsistencies
df.describe()

Unnamed: 0,Session_ID,Heart_Rate_BPM,Duration_Min,Calories
count,53.0,48.0,53.0,50.0
mean,524.113208,151.708333,84.150943,385.66
std,15.20205,172.692687,192.312086,199.759407
min,500.0,0.0,15.0,-250.0
25%,511.0,89.75,30.0,232.0
50%,524.0,115.5,43.0,450.5
75%,537.0,154.75,65.0,521.75
max,549.0,999.0,1080.0,683.0


In [9]:
# identify unique entries for normalization / cleaning
print(df['Activity'].unique())

<StringArray>
['Gym', 'Yoga', 'Swim', 'Run', 'Cycle', 'Ggym', 'YOGA', ' swim ', ' cykle ']
Length: 9, dtype: str


In [11]:
num_duplicates = df.duplicated().sum()
print(f"Number of Duplicated Rows: {num_duplicates}")

Number of Duplicated Rows: 3


In [15]:
display(df[df.duplicated(keep=False)])

Unnamed: 0,Session_ID,Date,Activity,Heart_Rate_BPM,Duration_Min,Calories
2,502,2026-01-15,Swim,75.0,37.0,516.0
49,549,2026-01-15,Cycle,96.0,68.0,640.0
50,502,2026-01-15,Swim,75.0,37.0,516.0
51,502,2026-01-15,Swim,75.0,37.0,516.0
52,549,2026-01-15,Cycle,96.0,68.0,640.0


In [18]:
# check missing values
df.isnull()

Unnamed: 0,Session_ID,Date,Activity,Heart_Rate_BPM,Duration_Min,Calories
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,False,False,False,False,False,False
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False


In [19]:
# check number of missing values
df.isnull().sum()

Session_ID        0
Date              0
Activity          0
Heart_Rate_BPM    5
Duration_Min      0
Calories          3
dtype: int64

In [21]:
# remove the duplicates
df.drop_duplicates(inplace=True)

In [22]:
# check number of duplicate rows again
num_duplicates = df.duplicated().sum()
print(f"Number of Duplicated Rows: {num_duplicates}")

Number of Duplicated Rows: 0


In [23]:
# strip() removes leading/trailing spaces, title() capitalizes only the first letter
df['Activity'] = df['Activity'].str.strip().str.title()

In [24]:
print(df['Activity'].unique())

<StringArray>
['Gym', 'Yoga', 'Swim', 'Run', 'Cycle', 'Ggym', 'Cykle']
Length: 7, dtype: str


In [25]:
# Using a dictionary to map specific types to correct categories
corrections = {'Ggym': 'Gym', 'Cykle': 'Cycle'}
df['Activity'] = df['Activity'].replace(corrections)

In [26]:
print(df['Activity'].unique())

<StringArray>
['Gym', 'Yoga', 'Swim', 'Run', 'Cycle']
Length: 5, dtype: str


In [29]:
# get Standardized Date
df['Date'] = pd.to_datetime(df['Date'], format='mixed')

In [30]:
df['Date']

0    2026-01-15
1    2026-01-15
2    2026-01-15
3    2026-01-15
4    2026-01-15
5    2026-01-15
6    2026-01-15
7    2026-01-15
8    2026-01-15
9    2026-01-15
10   2026-01-15
11   2026-01-15
12   2026-01-15
13   2026-01-15
14   2026-01-15
15   2026-01-15
16   2026-01-15
17   2026-01-15
18   2026-01-15
19   2026-01-15
20   2026-01-15
21   2026-01-15
22   2026-01-15
23   2026-01-15
24   2026-01-15
25   2026-01-15
26   2026-01-15
27   2026-01-15
28   2026-01-15
29   2026-01-15
30   2026-01-15
31   2026-01-15
32   2026-01-15
33   2026-01-15
34   2026-01-15
35   2026-01-15
36   2026-01-15
37   2026-01-15
38   2026-01-15
39   2026-01-15
40   2026-01-15
41   2026-01-15
42   2026-01-15
43   2026-01-15
44   2026-01-15
45   2026-01-15
46   2026-01-15
47   2026-01-15
48   2026-01-15
49   2026-01-15
Name: Date, dtype: datetime64[us]

In [32]:
# identifying outliers
# IQR method uses middle 50% of data and best for skewed distributions
# Z score method measures how many standard dev a value is from the mean, best for normal distributions

# IQR implementation
Q1 = df['Heart_Rate_BPM'].quantile(0.25)
Q3 = df['Heart_Rate_BPM'].quantile(0.75)
IQR = Q3 - Q1

# define bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# detect outliers
iqr_outliers = df[(df['Heart_Rate_BPM'] < lower_bound) | (df['Heart_Rate_BPM'] > upper_bound)]
print("IQR Outliers:", iqr_outliers['Heart_Rate_BPM'].tolist())

IQR Outliers: [899.0, 999.0]


In [33]:
df.isnull()

Unnamed: 0,Session_ID,Date,Activity,Heart_Rate_BPM,Duration_Min,Calories
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,False,False,False,False,False,False
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False
