In [1]:
# Load the uploaded final dataset for inspection and preparation
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
import pandas as pd
import numpy as np

In [2]:
# Utility Functions
def drop_highly_correlated_features(df, threshold=0.9):
    """Remove features that are highly correlated with each other."""
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    print(f"Dropping {len(to_drop)} highly correlated features: {to_drop}")
    return df.drop(columns=to_drop)

def remove_low_variance_features(df, threshold=0.01):
    """Remove features with very low variance."""
    selector = VarianceThreshold(threshold=threshold)
    numeric_df = df.select_dtypes(include=[np.number])
    filtered = selector.fit_transform(numeric_df)
    retained_columns = numeric_df.columns[selector.get_support()]
    print(f"Retained {len(retained_columns)} features after low-variance filter.")
    return pd.DataFrame(filtered, columns=retained_columns)

def scale_numeric_features(df):
    """Scale numeric features using StandardScaler."""
    scaler = StandardScaler()
    numeric_df = df.select_dtypes(include=[np.number])
    scaled = scaler.fit_transform(numeric_df)
    return pd.DataFrame(scaled, columns=numeric_df.columns)

In [5]:
# Load dataset
df = pd.read_csv("final.csv")
print("Original dataset shape:", df.shape)
print(df.info())

Original dataset shape: (9928, 22)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9928 entries, 0 to 9927
Data columns (total 22 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Country                                  9928 non-null   object 
 1   Year                                     9928 non-null   int64  
 2   Gender                                   9928 non-null   object 
 3   Life expectancy                          9928 non-null   float64
 4   Unemployment                             9928 non-null   float64
 5   Infant Mortality                         9928 non-null   float64
 6   GDP                                      9928 non-null   float64
 7   GNI                                      9928 non-null   float64
 8   Clean fuels and cooking technologies     9928 non-null   float64
 9   Per Capita                               9928 non-null   float64
 10  Mortality cau

In [6]:
# Drop non-numeric columns before transformation
df_numeric = df.drop(columns=["Country", "Gender"])


In [7]:

# Drop highly correlated features
df_pruned = drop_highly_correlated_features(df_numeric, threshold=0.9)


Dropping 3 highly correlated features: ['GNI', 'Measles Immunization', 'Rural population']


In [8]:

# Remove low-variance features
df_pruned = remove_low_variance_features(df_pruned, threshold=0.01)


Retained 17 features after low-variance filter.


In [9]:

# Scale numeric features
df_scaled = scale_numeric_features(df_pruned)

In [10]:
# Combine with original non-scaled metadata
df_final = pd.concat([
    df[["Country", "Year", "Gender"]].reset_index(drop=True),
    df_scaled
], axis=1)


In [12]:
# Just display the first few rows or save the file locally
print(df_final.head())

# OR save to CSV for further use
df_final.to_csv("modified.csv", index=False)


       Country  Year  Gender      Year  Life expectancy  Unemployment  \
0  Afghanistan  2000  Female -1.648134        -1.366900      0.908136   
1  Afghanistan  2001  Female -1.474562        -1.315723      0.908624   
2  Afghanistan  2002  Female -1.300989        -1.263686      0.990831   
3  Afghanistan  2003  Female -1.127417        -1.210144      0.969223   
4  Afghanistan  2004  Female -0.953845        -1.154774      0.940792   

   Infant Mortality       GDP  Clean fuels and cooking technologies  \
0          2.151070 -0.278547                             -1.466177   
1          2.060731 -0.278547                             -1.444533   
2          1.970391 -0.279875                             -1.417478   
3          1.868268 -0.279811                             -1.385013   
4          1.770073 -0.279713                             -1.357958   

   Per Capita  Mortality caused by road traffic injury  \
0   -0.629113                                -0.429232   
1   -0.629113     