## 1. Load Raw Data

In [7]:
import pandas as pd

# Load the diabetes prevalence dataset
df = pd.read_csv("../../data/raw/Diabetes-prevalence.csv")

# Preview the first few rows
print("Preview of Dataset:")
display(df.head())

# Dataset info (columns, types, memory)
print("\nDataset Info:")
df.info()

# Summary statistics for numerical columns
print("\nSummary Statistics:")
display(df.describe())


Preview of Dataset:


Unnamed: 0,Country/Region/World,ISO,Sex,Year,Crude diabetes prevalence,Lower 95% uncertainty interval,Upper 95% uncertainty interval
0,Afghanistan,AFG,Men,1980,0.028291,0.008573,0.064234
1,Afghanistan,AFG,Men,1981,0.028931,0.009486,0.062902
2,Afghanistan,AFG,Men,1982,0.029537,0.010453,0.062017
3,Afghanistan,AFG,Men,1983,0.030114,0.011449,0.061258
4,Afghanistan,AFG,Men,1984,0.030696,0.01239,0.060015



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13650 entries, 0 to 13649
Data columns (total 7 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Country/Region/World            13650 non-null  object 
 1   ISO                             13650 non-null  object 
 2   Sex                             13650 non-null  object 
 3   Year                            13650 non-null  int64  
 4   Crude diabetes prevalence       13650 non-null  float64
 5   Lower 95% uncertainty interval  13650 non-null  float64
 6   Upper 95% uncertainty interval  13650 non-null  float64
dtypes: float64(3), int64(1), object(3)
memory usage: 746.6+ KB

Summary Statistics:


Unnamed: 0,Year,Crude diabetes prevalence,Lower 95% uncertainty interval,Upper 95% uncertainty interval
count,13650.0,13650.0,13650.0,13650.0
mean,1997.0,0.064005,0.037929,0.099751
std,10.099875,0.035022,0.025618,0.046277
min,1980.0,0.005387,0.000793,0.017785
25%,1988.0,0.041179,0.020971,0.070211
50%,1997.0,0.059015,0.033705,0.092593
75%,2006.0,0.077626,0.048545,0.117512
max,2014.0,0.297842,0.20604,0.43633


## 2. Make a Working Copy

In [8]:
df_clean = df.copy()

print("Working copy created successfully!")
display(df_clean.head())


Working copy created successfully!


Unnamed: 0,Country/Region/World,ISO,Sex,Year,Crude diabetes prevalence,Lower 95% uncertainty interval,Upper 95% uncertainty interval
0,Afghanistan,AFG,Men,1980,0.028291,0.008573,0.064234
1,Afghanistan,AFG,Men,1981,0.028931,0.009486,0.062902
2,Afghanistan,AFG,Men,1982,0.029537,0.010453,0.062017
3,Afghanistan,AFG,Men,1983,0.030114,0.011449,0.061258
4,Afghanistan,AFG,Men,1984,0.030696,0.01239,0.060015


## 3: Remove Duplicate Rows

In [10]:
initial_rows = df_clean.shape[0]
print(f"Number of rows before removing duplicates: {initial_rows}")

duplicates_count = df_clean.duplicated().sum()
print(f"Number of duplicate rows found: {duplicates_count}")

if duplicates_count > 0:
    df_clean.drop_duplicates(inplace=True)
    print("Duplicate rows removed successfully.")
else:
    print("No duplicate rows found.")

final_rows = df_clean.shape[0]
print(f"Number of rows after removing duplicates: {final_rows}")
object_cols = df_clean.select_dtypes(include='object').columns
for col in object_cols:
    unique_count = df_clean[col].nunique()
    print(f"Column '{col}': {unique_count} unique values (out of {len(df_clean)} rows)")
# Convert 'sex' to categorical type if it has a small number of unique values
df_clean['sex'] = df_clean['Sex'].astype('category')
missing_values = df_clean.isnull().sum()
print("Number of missing values in df_cleaned: ")
print(missing_values)
# Droping unessessary columns
df_cleaned = df_clean.drop(['Lower 95% uncertainty interval', 'Upper 95% uncertainty interval', 'ISO'], axis=1)
df_cleaned = df_cleaned.pivot_table(
    index=["Country/Region/World", "Year"],
    columns="Sex",
    values="Crude diabetes prevalence"
).reset_index()
df_cleaned.info()
df_cleaned.head()
df_cleaned.rename(columns={"Country/Region/World": "Country"}, inplace=True)
print("Column names updated. Displaying new column names:")
print(df_cleaned.columns)
output_file_path = '../../data/processed/cleaned-diabetes-prevalence.csv'
df_cleaned.to_csv(output_file_path, index=False)
print(f"Cleaned data saved to {output_file_path}")

Number of rows before removing duplicates: 13650
Number of duplicate rows found: 0
No duplicate rows found.
Number of rows after removing duplicates: 13650
Column 'Country/Region/World': 195 unique values (out of 13650 rows)
Column 'ISO': 195 unique values (out of 13650 rows)
Column 'Sex': 2 unique values (out of 13650 rows)
Number of missing values in df_cleaned: 
Country/Region/World              0
ISO                               0
Sex                               0
Year                              0
Crude diabetes prevalence         0
Lower 95% uncertainty interval    0
Upper 95% uncertainty interval    0
sex                               0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6825 entries, 0 to 6824
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Country/Region/World  6825 non-null   object 
 1   Year                  6825 non-null   int64  
 2   Men             

## 4. Inspect Categorical (Object) Columns

In [11]:
object_cols = df_clean.select_dtypes(include='object').columns

print("Categorical / object columns and their unique counts:")
for col in object_cols:
    unique_count = df_clean[col].nunique()
    print(f"Column '{col}': {unique_count} unique values (out of {len(df_clean)} rows)")

Categorical / object columns and their unique counts:
Column 'Country/Region/World': 195 unique values (out of 13650 rows)
Column 'ISO': 195 unique values (out of 13650 rows)
Column 'Sex': 2 unique values (out of 13650 rows)


## 5. Convert 'Sex' to Categorical Type

In [12]:
if 'Sex' in df_clean.columns:
    df_clean['Sex'] = df_clean['Sex'].astype('category')
    print("Converted 'Sex' column to categorical type.")
else:
    print("Column 'Sex' not found in df_clean.")

Converted 'Sex' column to categorical type.


## 6. Check Missing Values

In [13]:
missing_values = df_clean.isnull().sum()
print("Number of missing values in df_clean:")
print(missing_values)

Number of missing values in df_clean:
Country/Region/World              0
ISO                               0
Sex                               0
Year                              0
Crude diabetes prevalence         0
Lower 95% uncertainty interval    0
Upper 95% uncertainty interval    0
sex                               0
dtype: int64


## 7. Drop Unnecessary Columns

In [14]:
cols_to_drop = ['Lower 95% uncertainty interval', 'Upper 95% uncertainty interval', 'ISO']

df_cleaned = df_clean.drop(cols_to_drop, axis=1)

print("Dropped unnecessary columns:")
print(cols_to_drop)
print("Remaining columns:")
print(df_cleaned.columns.tolist())


Dropped unnecessary columns:
['Lower 95% uncertainty interval', 'Upper 95% uncertainty interval', 'ISO']
Remaining columns:
['Country/Region/World', 'Sex', 'Year', 'Crude diabetes prevalence', 'sex']


## 8. Reshape Data (Pivot Wider by Sex)

In [15]:
df_cleaned = df_cleaned.pivot_table(
    index=["Country/Region/World", "Year"],
    columns="Sex",
    values="Crude diabetes prevalence"
).reset_index()

print("Data reshaped using pivot (Country/Region/World + Year as index, Sex as columns).")
df_cleaned.info()
display(df_cleaned.head())

Data reshaped using pivot (Country/Region/World + Year as index, Sex as columns).
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6825 entries, 0 to 6824
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Country/Region/World  6825 non-null   object 
 1   Year                  6825 non-null   int64  
 2   Men                   6825 non-null   float64
 3   Women                 6825 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 213.4+ KB


  df_cleaned = df_cleaned.pivot_table(


Sex,Country/Region/World,Year,Men,Women
0,Afghanistan,1980,0.028291,0.036917
1,Afghanistan,1981,0.028931,0.037609
2,Afghanistan,1982,0.029537,0.038284
3,Afghanistan,1983,0.030114,0.038955
4,Afghanistan,1984,0.030696,0.039642


## 9. Rename Columns Consistently

In [16]:
df_cleaned.rename(columns={"Country/Region/World": "Country"}, inplace=True)

print("Column names updated. New column names:")
print(df_cleaned.columns.tolist())

Column names updated. New column names:
['Country', 'Year', 'Men', 'Women']


## 10. Save Cleaned Data

In [17]:
output_file_path = "../../data/processed/cleaned-diabetes-prevalence.csv"
df_cleaned.to_csv(output_file_path, index=False)

print(f"Cleaned data saved to: {output_file_path}")

Cleaned data saved to: ../../data/processed/cleaned-diabetes-prevalence.csv
