## 1. Import Libraries and Load raw data

In [10]:
import pandas as pd
import numpy as np
import re

# Load the raw obesity-rates CSV (relative path) into `df`.
df = pd.read_csv('../../data/raw/Obesity-rates.csv')

# First rows, to eyeball the column layout and sample values.
print("🔹 Preview of Dataset:")
display(df.head())

# Column labels exactly as they appear in the file.
print("\n🔹 Column Names:")
display(df.columns)

# Count of fully duplicated rows.
print("\n🔹 Number of Duplicate Rows:")
display(df.duplicated().sum())

# Missing-value count for every column.
print("\n🔹 Missing Values in Each Column:")
display(df.isnull().sum())

# Dtypes, non-null counts and memory usage.
# df.info() prints directly and returns None, so it is not wrapped in display().
print("\n🔹 Dataset Info:")
df.info()

# Summary statistics for the numeric columns.
print("\n🔹 Statistical Summary:")
display(df.describe())


🔹 Preview of Dataset:


Unnamed: 0,Domain Code,Domain,Area Code (M49),Area,Element Code,Element,Item Code,Item,Year Code,Year,Unit,Value,Flag,Flag Description,Note
0,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,21042,Prevalence of obesity in the adult population ...,2000,2000,%,4.3,X,Figure from external organization,
1,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,21042,Prevalence of obesity in the adult population ...,2001,2001,%,4.6,X,Figure from external organization,
2,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,21042,Prevalence of obesity in the adult population ...,2002,2002,%,5.0,X,Figure from external organization,
3,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,21042,Prevalence of obesity in the adult population ...,2003,2003,%,5.4,X,Figure from external organization,
4,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,21042,Prevalence of obesity in the adult population ...,2004,2004,%,5.8,X,Figure from external organization,



🔹 Column Names:


Index(['Domain Code', 'Domain', 'Area Code (M49)', 'Area', 'Element Code',
       'Element', 'Item Code', 'Item', 'Year Code', 'Year', 'Unit', 'Value',
       'Flag', 'Flag Description', 'Note'],
      dtype='object')


🔹 Number of Duplicate Rows:


np.int64(0)


🔹 Missing Values in Each Column:


Domain Code            0
Domain                 0
Area Code (M49)        0
Area                   0
Element Code           0
Element                0
Item Code              0
Item                   0
Year Code              0
Year                   0
Unit                   0
Value                  0
Flag                   0
Flag Description       0
Note                4554
dtype: int64


🔹 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4554 entries, 0 to 4553
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Domain Code       4554 non-null   object 
 1   Domain            4554 non-null   object 
 2   Area Code (M49)   4554 non-null   int64  
 3   Area              4554 non-null   object 
 4   Element Code      4554 non-null   int64  
 5   Element           4554 non-null   object 
 6   Item Code         4554 non-null   int64  
 7   Item              4554 non-null   object 
 8   Year Code         4554 non-null   int64  
 9   Year              4554 non-null   int64  
 10  Unit              4554 non-null   object 
 11  Value             4554 non-null   float64
 12  Flag              4554 non-null   object 
 13  Flag Description  4554 non-null   object 
 14  Note              0 non-null      float64
dtypes: float64(2), int64(5), object(8)
memory usage: 533.8+ KB

🔹 Statistical Summary:

Unnamed: 0,Area Code (M49),Element Code,Item Code,Year Code,Year,Value,Note
count,4554.0,4554.0,4554.0,4554.0,4554.0,4554.0,0.0
mean,426.934343,6121.0,21042.0,2011.0,2011.0,19.567479,
std,255.896239,0.0,0.0,6.633978,6.633978,13.52651,
min,4.0,6121.0,21042.0,2000.0,2000.0,0.3,
25%,204.0,6121.0,21042.0,2005.0,2005.0,9.5,
50%,420.0,6121.0,21042.0,2011.0,2011.0,18.2,
75%,646.0,6121.0,21042.0,2017.0,2017.0,25.5,
max,894.0,6121.0,21042.0,2022.0,2022.0,75.2,


## 2. Make a working copy

In [22]:
# Work on an independent (deep) copy so later cleaning steps
# never modify the raw frame held in `df`.
df_cleaned = df.copy(deep=True)

print("Working copy created successfully!")
display(df_cleaned.head(5))

Working copy created successfully!


Unnamed: 0,Domain Code,Domain,Area Code (M49),Area,Element Code,Element,Item Code,Item,Year Code,Year,Unit,Value,Flag,Flag Description,Note
0,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,21042,Prevalence of obesity in the adult population ...,2000,2000,%,4.3,X,Figure from external organization,
1,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,21042,Prevalence of obesity in the adult population ...,2001,2001,%,4.6,X,Figure from external organization,
2,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,21042,Prevalence of obesity in the adult population ...,2002,2002,%,5.0,X,Figure from external organization,
3,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,21042,Prevalence of obesity in the adult population ...,2003,2003,%,5.4,X,Figure from external organization,
4,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,21042,Prevalence of obesity in the adult population ...,2004,2004,%,5.8,X,Figure from external organization,


## 3. Keep only the relevant columns

In [23]:
# Keep only the three columns the later steps work with. For the 15-column raw
# schema this yields the same frame as dropping the other twelve by name, but
# selecting makes the cell safe to re-run: a second drop() would raise KeyError
# because the columns are already gone. A missing required column still fails
# loudly with a KeyError here.
df_cleaned = df_cleaned[['Area', 'Year', 'Value']].copy()
df_cleaned.head()

Unnamed: 0,Area,Year,Value
0,Afghanistan,2000,4.3
1,Afghanistan,2001,4.6
2,Afghanistan,2002,5.0
3,Afghanistan,2003,5.4
4,Afghanistan,2004,5.8


## 4. Handle outliers

In [24]:
# Outlier handling for the obesity 'Value' column using 1.5 * IQR fences.
obesity_values = df_cleaned['Value']

# Quartiles and interquartile range
Q1 = obesity_values.quantile(0.25)
Q3 = obesity_values.quantile(0.75)
IQR = Q3 - Q1

# Fences sit 1.5 * IQR below Q1 and above Q3
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(f"Lower Bound: {lower_bound}")
print(f"Upper Bound: {upper_bound}")

# Rows whose value lies strictly outside the fences (show at most 100 of them)
is_outlier = obesity_values.lt(lower_bound) | obesity_values.gt(upper_bound)
outliers = df_cleaned[is_outlier]
print("\n Detected Outliers:")
display(outliers.head(100))

# Cap instead of dropping: values beyond a fence are replaced by that fence.
# NOTE(review): the rows flagged here look like genuine country values rather
# than data errors -- confirm that capping them is the intended treatment.
df_cleaned['Value'] = obesity_values.clip(lower=lower_bound, upper=upper_bound)

print("\n Outliers handled by capping extreme values.")


Lower Bound: -14.5
Upper Bound: 49.5

 Detected Outliers:


Unnamed: 0,Area,Year,Value
69,American Samoa,2000,68.4
70,American Samoa,2001,68.8
71,American Samoa,2002,69.1
72,American Samoa,2003,69.4
73,American Samoa,2004,69.7
...,...,...,...
3503,Samoa,2007,52.0
3504,Samoa,2008,52.8
3505,Samoa,2009,53.5
3506,Samoa,2010,54.3



 Outliers handled by capping extreme values.


## 10. Sort & organize data

## 11. Save cleaned data

In [11]:
# Persist the cleaned frame. This must be `df_cleaned`, not the raw `df`:
# writing `df` would save the untouched input under the "cleaned" file name
# and discard every cleaning step above.
output_file_path = '../../data/processed/cleaned-obesity-rates.csv'
# index=False keeps the RangeIndex out of the CSV.
df_cleaned.to_csv(output_file_path, index=False)
print(f"Cleaned data saved to {output_file_path}")

Cleaned data saved to ../../data/processed/cleaned-obesity-rates.csv
