# Data Preprocessing Steps

In [1]:

# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from scipy.stats import zscore

# Load dataset
file_path = 'life_expectancy.csv'  # Replace with the actual file path
df = pd.read_csv(file_path)

# Display the first few rows of the dataset
df.head()


Unnamed: 0,Country Name,Country Code,Region,IncomeGroup,Year,Life Expectancy World Bank,Prevelance of Undernourishment,CO2,Health Expenditure %,Education Expenditure %,Unemployment,Corruption,Sanitation,Injuries,Communicable,NonCommunicable
0,Afghanistan,AFG,South Asia,Low income,2001,56.308,47.8,730.0,,,10.809,,,2179727.1,9689193.7,5795426.38
1,Angola,AGO,Sub-Saharan Africa,Lower middle income,2001,47.059,67.5,15960.0,4.483516,,4.004,,,1392080.71,11190210.53,2663516.34
2,Albania,ALB,Europe & Central Asia,Upper middle income,2001,74.288,4.9,3230.0,7.139524,3.4587,18.575001,,40.520895,117081.67,140894.78,532324.75
3,Andorra,AND,Europe & Central Asia,High income,2001,,,520.0,5.865939,,,,21.78866,1697.99,695.56,13636.64
4,United Arab Emirates,ARE,Middle East & North Africa,High income,2001,74.544,2.8,97200.0,2.48437,,2.493,,,144678.14,65271.91,481740.7


## Step 1: Identify Numerical and Categorical Columns

In [2]:

# Identify numerical and categorical columns
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

print("Numerical Columns:", numerical_cols)
print("Categorical Columns:", categorical_cols)


Numerical Columns: ['Year', 'Life Expectancy World Bank', 'Prevelance of Undernourishment', 'CO2', 'Health Expenditure %', 'Education Expenditure %', 'Unemployment', 'Corruption', 'Sanitation', 'Injuries', 'Communicable', 'NonCommunicable']
Categorical Columns: ['Country Name', 'Country Code', 'Region', 'IncomeGroup']


## Step 2: Handle Missing Values

In [3]:

# Handle missing values
imputer = SimpleImputer(strategy='mean')
df[numerical_cols] = imputer.fit_transform(df[numerical_cols])

print("Missing values handled.")
df.head()


Missing values handled.


Unnamed: 0,Country Name,Country Code,Region,IncomeGroup,Year,Life Expectancy World Bank,Prevelance of Undernourishment,CO2,Health Expenditure %,Education Expenditure %,Unemployment,Corruption,Sanitation,Injuries,Communicable,NonCommunicable
0,Afghanistan,AFG,South Asia,Low income,2001.0,56.308,47.8,730.0,6.364059,4.589014,10.809,2.860513,52.738785,2179727.1,9689193.7,5795426.38
1,Angola,AGO,Sub-Saharan Africa,Lower middle income,2001.0,47.059,67.5,15960.0,4.483516,4.589014,4.004,2.860513,52.738785,1392080.71,11190210.53,2663516.34
2,Albania,ALB,Europe & Central Asia,Upper middle income,2001.0,74.288,4.9,3230.0,7.139524,3.4587,18.575001,2.860513,40.520895,117081.67,140894.78,532324.75
3,Andorra,AND,Europe & Central Asia,High income,2001.0,69.748362,10.663654,520.0,5.865939,4.589014,7.89076,2.860513,21.78866,1697.99,695.56,13636.64
4,United Arab Emirates,ARE,Middle East & North Africa,High income,2001.0,74.544,2.8,97200.0,2.48437,4.589014,2.493,2.860513,52.738785,144678.14,65271.91,481740.7


## Step 3: Identify and Remove Outliers

In [14]:
def cap_outliers(df, method='zscore', threshold=3):
    """
    Cap outliers to their respective min/max threshold based on the specified method.
    
    Parameters:
        df (pd.DataFrame): The dataset containing numerical columns.
        method (str): The method to identify outliers ('zscore' or 'iqr').
        threshold (float): The threshold for identifying outliers (default 3 for Z-score).
    
    Returns:
        pd.DataFrame: The dataframe with capped outliers.
    """
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    
    if method == 'zscore':
        # Calculate Z-scores for numerical columns
        z_scores = zscore(df[numerical_cols])
        print((z_scores > 3).any(axis=1).sum())
        for col, z in zip(numerical_cols, z_scores.T):
            # Compute thresholds
            lower_limit = df[col].mean() - threshold * df[col].std()
            upper_limit = df[col].mean() + threshold * df[col].std()
            # Cap outliers
            df[col] = np.clip(df[col], lower_limit, upper_limit)
    
    elif method == 'iqr':
        # Calculate IQR (Interquartile Range) for numerical columns
        for col in numerical_cols:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_limit = Q1 - threshold * IQR
            upper_limit = Q3 + threshold * IQR
            # Cap outliers
            df[col] = np.clip(df[col], lower_limit, upper_limit)
    
    else:
        raise ValueError("Invalid method. Choose 'zscore' or 'iqr'.")
    
    return df

# Example usage
df_cleaned = cap_outliers(df, method='zscore', threshold=3)
print("Outliers capped at thresholds.")
df_cleaned.head()


492
Outliers capped at thresholds.


Unnamed: 0,Country Name,Country Code,Region,IncomeGroup,Year,Life Expectancy World Bank,Prevelance of Undernourishment,CO2,Health Expenditure %,Education Expenditure %,Unemployment,Corruption,Sanitation,Injuries,Communicable,NonCommunicable
0,Afghanistan,AFG,South Asia,Low income,2001.0,56.308,37.346893,730.0,6.364059,4.589014,10.809,2.860513,52.738785,2179727.1,9689193.7,5795426.38
1,Angola,AGO,Sub-Saharan Africa,Lower middle income,2001.0,47.059,37.346893,15960.0,4.483516,4.589014,4.004,2.860513,52.738785,1392080.71,11190210.53,2663516.34
2,Albania,ALB,Europe & Central Asia,Upper middle income,2001.0,74.288,4.9,3230.0,7.139524,3.4587,18.575001,2.860513,40.520895,117081.67,140894.78,532324.75
3,Andorra,AND,Europe & Central Asia,High income,2001.0,69.748362,10.663654,520.0,5.865939,4.589014,7.89076,2.860513,21.78866,1697.99,695.56,13636.64
4,United Arab Emirates,ARE,Middle East & North Africa,High income,2001.0,74.544,2.8,97200.0,2.48437,4.589014,2.493,2.860513,52.738785,144678.14,65271.91,481740.7


## Step 4: Encode Categorical Features

In [11]:

# One-hot encoding for categorical features
encoder = OneHotEncoder()
encoded_features = pd.DataFrame(encoder.fit_transform(df_cleaned[categorical_cols]).toarray(), 
                                columns=encoder.get_feature_names_out())

# Combine the encoded features with the original dataset
df_encoded = pd.concat([df_cleaned.drop(columns=categorical_cols), encoded_features], axis=1)
print("Categorical features encoded.")
df_encoded.head()


Categorical features encoded.


Unnamed: 0,Year,Life Expectancy World Bank,Prevelance of Undernourishment,CO2,Health Expenditure %,Education Expenditure %,Unemployment,Corruption,Sanitation,Injuries,...,Region_Europe & Central Asia,Region_Latin America & Caribbean,Region_Middle East & North Africa,Region_North America,Region_South Asia,Region_Sub-Saharan Africa,IncomeGroup_High income,IncomeGroup_Low income,IncomeGroup_Lower middle income,IncomeGroup_Upper middle income
0,2001.0,56.308,40.814889,730.0,6.364059,4.589014,10.809,2.860513,52.738785,2179727.1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,2001.0,47.059,40.814889,15960.0,4.483516,4.589014,4.004,2.860513,52.738785,1392080.71,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,2001.0,74.288,4.9,3230.0,7.139524,3.4587,18.575001,2.860513,40.520895,117081.67,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2001.0,69.748362,10.663654,520.0,5.865939,4.589014,7.89076,2.860513,21.78866,1697.99,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,2001.0,74.544,2.8,97200.0,2.48437,4.589014,2.493,2.860513,52.738785,144678.14,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Step 5: Scale Features

In [12]:

# Scale numerical features
scaler = StandardScaler()
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

print("Features scaled.")
df_encoded.head()


Features scaled.


Unnamed: 0,Year,Life Expectancy World Bank,Prevelance of Undernourishment,CO2,Health Expenditure %,Education Expenditure %,Unemployment,Corruption,Sanitation,Injuries,...,Region_Europe & Central Asia,Region_Latin America & Caribbean,Region_Middle East & North Africa,Region_North America,Region_South Asia,Region_Sub-Saharan Africa,IncomeGroup_High income,IncomeGroup_Low income,IncomeGroup_Lower middle income,IncomeGroup_Upper middle income
0,-1.643168,-1.471799,3.386751,-0.344193,0.014618,0.031768,0.528706,-0.001615,0.0,0.531623,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,-1.643168,-2.484552,3.386751,-0.296055,-0.705128,0.031768,-0.66994,-0.001615,0.0,0.185247,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,-1.643168,0.496987,-0.613774,-0.336292,0.311414,-0.724474,1.896624,-0.001615,-0.514011,-0.375447,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-1.643168,-9.7e-05,0.028234,-0.344857,-0.176029,0.031768,0.014681,-0.001615,-1.302083,-0.426188,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-1.643168,0.525019,-0.847691,-0.039272,-1.470268,0.031768,-0.93609,-0.001615,0.0,-0.363311,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Step 6: Apply PCA

In [13]:

# Apply PCA for dimensionality reduction
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df_encoded[numerical_cols])

# Add PCA components to the dataset
df_pca = pd.DataFrame(pca_result, columns=['PC1', 'PC2'])
df_final = pd.concat([df_encoded.drop(columns=numerical_cols), df_pca], axis=1)

print("PCA applied.")
df_final.head()


PCA applied.


Unnamed: 0,Country Name_Afghanistan,Country Name_Albania,Country Name_Algeria,Country Name_American Samoa,Country Name_Andorra,Country Name_Angola,Country Name_Antigua and Barbuda,Country Name_Argentina,Country Name_Armenia,Country Name_Australia,...,Region_Middle East & North Africa,Region_North America,Region_South Asia,Region_Sub-Saharan Africa,IncomeGroup_High income,IncomeGroup_Low income,IncomeGroup_Lower middle income,IncomeGroup_Upper middle income,PC1,PC2
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.680432,-2.793234
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.720406,-3.667412
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.856724,-0.031983
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.632555,-0.969022
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.420789,-0.097566
