In [None]:
# Step	Task
# 1	Load raw data
# 2	Make a working copy
# 3	Standardize column names
# 4	Fix data types
# 5	Handle missing values
# 6	Remove duplicates
# 7	Fix inconsistent values
# 8	Handle outliers
# 9	Rename columns consistently
# 10 Sort & organize data
# 11 Save cleaned data

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Load the raw diabetes-prevalence CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../../data/raw/Diabetes-prevalence.csv")
display(df.head(n=5))

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() only appends a stray "None" to the output.
df.info()

In [None]:
# Summary statistics (count, mean, std, min/max, quartiles) for the numeric
# columns of the raw data.
raw_summary = df.describe()
display(raw_summary)

In [None]:
# Work on an independent deep copy so the cleaning steps below do not alter
# the raw frame `df`.
df_clean = df.copy(deep=True)

In [None]:
# Report the row count, drop fully duplicated rows (if any), then report the
# row count again.
initial_rows = len(df_clean)
print(f"Number of rows before removing duplicates: {initial_rows}")

duplicates_count = df_clean.duplicated().sum()
print(f"Number of duplicate rows found: {duplicates_count}")

if duplicates_count == 0:
    print("No duplicate rows found.")
else:
    # Keeps the first occurrence of each duplicated row.
    df_clean = df_clean.drop_duplicates()
    print("Duplicate rows removed successfully.")

final_rows = len(df_clean)
print(f"Number of rows after removing duplicates: {final_rows}")

In [None]:
# Cardinality check: number of distinct values in every object-dtype column,
# shown next to the total row count.
object_cols = df_clean.select_dtypes(include='object').columns
unique_counts = df_clean[object_cols].nunique()
total_rows = len(df_clean)
for col, unique_count in unique_counts.items():
    print(f"Column '{col}': {unique_count} unique values (out of {total_rows} rows)")

In [None]:
# Store 'Sex' as a categorical dtype. Column labels are case-sensitive and this
# dataset's column is 'Sex' (capitalised); a lowercase 'sex' key raises KeyError.
df_clean['Sex'] = df_clean['Sex'].astype('category')

In [36]:
# Count missing values per column of the working copy. The label names
# df_clean because that is the frame being inspected here.
missing_values = df_clean.isnull().sum()
print("Number of missing values in df_clean: ")
print(missing_values)

Number of missing values in df_cleaned: 
Country/Region/World              0
ISO                               0
Sex                               0
Year                              0
Crude diabetes prevalence         0
Lower 95% uncertainty interval    0
Upper 95% uncertainty interval    0
dtype: int64


In [37]:
# Drop the columns that are not used further: the ISO column and the two
# uncertainty-interval bounds.
df_cleaned = df_clean.drop(
    columns=[
        'ISO',
        'Lower 95% uncertainty interval',
        'Upper 95% uncertainty interval',
    ]
)

In [41]:
# Reshape from long to wide: one row per (country, year) with one prevalence
# column per value of 'Sex'. pivot_table aggregates with the mean, so any
# repeated (country, year, sex) combination is averaged rather than rejected.
# NOTE: the input is overwritten with the wide result, so this cell cannot be
# re-run on its own output (the 'Sex' column no longer exists afterwards).
df_cleaned = (
    df_cleaned
    .pivot_table(
        index=["Country/Region/World", "Year"],
        columns="Sex",
        values="Crude diabetes prevalence",
        aggfunc="mean",
    )
    # The pivot leaves the columns axis labelled 'Sex'; clear it so that label
    # does not sit above 'Country/Region/World' and 'Year' after reset_index.
    .rename_axis(columns=None)
    .reset_index()
)

In [43]:
# Check the structure of the pivoted frame: columns, non-null counts and dtypes.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6825 entries, 0 to 6824
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Country/Region/World  6825 non-null   object 
 1   Year                  6825 non-null   int64  
 2   Men                   6825 non-null   float64
 3   Women                 6825 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 213.4+ KB


In [47]:
# Preview the first five rows of the wide-format table.
df_cleaned.head(n=5)

Sex,Country/Region/World,Year,Men,Women
0,Afghanistan,1980,0.028291,0.036917
1,Afghanistan,1981,0.028931,0.037609
2,Afghanistan,1982,0.029537,0.038284
3,Afghanistan,1983,0.030114,0.038955
4,Afghanistan,1984,0.030696,0.039642


In [53]:
# Shorten the location column's name, then show the resulting column labels.
df_cleaned = df_cleaned.rename(columns={"Country/Region/World": "Country"})
print("Column names updated. Displaying new column names:")
print(df_cleaned.columns)

Column names updated. Displaying new column names:
Index(['Country', 'Year', 'Men', 'Women'], dtype='object')


In [54]:
# Write the cleaned table to the processed-data folder, without the row index.
output_file_path = '../../data/processed/cleaned-diabetes-prevalence.csv'
# to_csv does not create missing folders, so make sure the target directory
# exists first (a no-op when it is already there).
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)
df_cleaned.to_csv(output_file_path, index=False)
print(f"Cleaned data saved to {output_file_path}")

Cleaned data saved to ../../data/processed/cleaned-diabetes-prevalence.csv
