In [6]:
#Import required packages

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler

In [7]:

# Load the cleaned dataset into the working DataFrame `df_data`.
# The path is relative, so it is resolved against the kernel's working directory.
df_data = pd.read_csv('data/cleaned_data.csv')

In [4]:
# Function to check the distribution of the target variable
def data_distribution(df, target):
    """
    Compute the percentage share of each outcome class in the target column.

    The denominator is the total number of rows in ``df``: rows whose target is
    missing or holds some other label still count towards the total, so the three
    percentages do not have to add up to 100.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    target (str): The name of the target column.

    Returns:
    tuple: Percentages of 'Graduate', 'Dropout' and 'Enrolled' rows, in that order,
        each rounded to two decimals. An empty DataFrame yields (0.0, 0.0, 0.0).

    Raises:
    KeyError: If ``target`` is not a column of ``df``.
    """
    # Look the column up before the empty check so a wrong column name still raises.
    labels = df[target]
    total = len(df)

    if total == 0:
        # No rows at all: report zero shares instead of dividing by zero.
        return 0.0, 0.0, 0.0

    def share(label):
        # int() turns the numpy count into a Python int so the result is a plain float.
        matches = int((labels == label).sum())
        return round(matches / total * 100, 2)

    return share("Graduate"), share("Dropout"), share("Enrolled")

# Inspect the class balance of the target variable.
# The call below passes the DataFrame and the name of the target column and returns
# the percentage of 'Graduate', 'Dropout' and 'Enrolled' rows. Knowing the split
# matters because class imbalance can affect the performance of machine learning
# models.
data_distribution(df_data, "Target")


(49.93, 32.12, 17.95)

### Feature Engineering

In [5]:
# Step 13: Create New Features
# New column: age at enrollment divided element-wise by the admission grade.
df_data['age_admission_ratio'] = df_data['Age_at_enrollment'].div(df_data['Admission_grade'])

In [41]:
# Step 14: Normalize/Standardize Data
# The three columns are overwritten in place with the scaler's output; the fitted
# scaler stays available as `scaler`.
# Note: 'Age_at_enrollment' is also listed in `quantitative_cols` further down, so
# cells executed after this one read the scaled values for that column.
scaled_features = ['Admission_grade', 'Age_at_enrollment', 'age_admission_ratio']
scaler = StandardScaler()
df_data[scaled_features] = scaler.fit_transform(df_data[scaled_features])

In [43]:
# Encode the 'Target' column into numerical values.
# The categorical target (e.g., 'Graduate', 'Dropout', 'Enrolled') is converted into
# numerical values, which are required for correlation analysis and machine
# learning models. The result is stored next to the original as 'Target_encoded'.
label_encoder = LabelEncoder()
df_data['Target_encoded'] = label_encoder.fit_transform(df_data['Target'])

# Compute the correlation matrix excluding the original 'Target' column
correlation_matrix = (
    df_data
    .drop(columns=['Target'])
    .corr()
)

### Normalise quantitative columns having high skewness

In [8]:
# Quantitative columns (names as they appear after the column renaming).
quantitative_cols = [
    # First-semester curricular units
    'Curricular_units_1st_sem_credited',
    'Curricular_units_1st_sem_enrolled',
    'Curricular_units_1st_sem_evaluations',
    'Curricular_units_1st_sem_approved',
    'Curricular_units_1st_sem_grade',
    'Curricular_units_1st_sem_without_evaluations',
    # Second-semester curricular units
    'Curricular_units_2nd_sem_credited',
    'Curricular_units_2nd_sem_enrolled',
    'Curricular_units_2nd_sem_evaluations',
    'Curricular_units_2nd_sem_approved',
    'Curricular_units_2nd_sem_grade',
    'Curricular_units_2nd_sem_without_evaluations',
    # Student age and macro-economic indicators
    'Age_at_enrollment',
    'Inflation_rate',
    'GDP',
    'Unemployment_rate',
]


In [9]:

# Numerical summaries for the quantitative variables: the describe() rows listed
# below plus one extra row holding each column's skewness.
stat_rows = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']
quantitative_data = df_data[quantitative_cols]

summary_df = quantitative_data.describe().loc[stat_rows]
summary_df.loc['skew'] = quantitative_data.skew()

summary_df

Unnamed: 0,Curricular_units_1st_sem_credited,Curricular_units_1st_sem_enrolled,Curricular_units_1st_sem_evaluations,Curricular_units_1st_sem_approved,Curricular_units_1st_sem_grade,Curricular_units_1st_sem_without_evaluations,Curricular_units_2nd_sem_credited,Curricular_units_2nd_sem_enrolled,Curricular_units_2nd_sem_evaluations,Curricular_units_2nd_sem_approved,Curricular_units_2nd_sem_grade,Curricular_units_2nd_sem_without_evaluations,Age_at_enrollment,Inflation_rate,GDP,Unemployment_rate
mean,0.709991,6.27057,8.299051,4.7066,10.640822,0.137658,0.541817,6.232143,8.063291,4.435805,10.230206,0.150316,23.265145,1.228029,0.001969,11.566139
std,2.360507,2.480178,4.179106,3.094238,4.843663,0.69088,1.918546,2.195951,3.947951,3.014764,5.210808,0.753774,7.587816,1.382711,2.269935,2.66385
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,-0.8,-4.06,7.6
25%,0.0,5.0,6.0,3.0,11.0,0.0,0.0,5.0,6.0,2.0,10.75,0.0,19.0,0.3,-1.7,9.4
50%,0.0,6.0,8.0,5.0,12.285714,0.0,0.0,6.0,8.0,5.0,12.2,0.0,20.0,1.4,0.32,11.1
75%,0.0,7.0,10.0,6.0,13.4,0.0,0.0,7.0,10.0,6.0,13.333333,0.0,25.0,2.6,1.79,13.9
max,20.0,26.0,45.0,26.0,18.875,12.0,19.0,23.0,33.0,20.0,18.571429,12.0,70.0,3.7,3.51,16.2
skew,4.169049,1.619041,0.976637,0.766262,-1.568146,8.207403,4.63482,0.788114,0.336497,0.306279,-1.31365,7.267701,2.054988,0.252375,-0.394068,0.212051


In [16]:
def normalize_and_summarize(df, quantitative_cols):
    """
    Reduce the skew of highly skewed columns and summarize the data before and after.

    A column counts as highly skewed when the absolute value of its skewness is
    greater than 1. Such columns are replaced by their square root in a copy of the
    data; ``df`` itself is never modified. A highly skewed column that contains
    negative values is left untransformed, because the square root is undefined
    there and would silently turn those entries into NaN.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    quantitative_cols (list): List of quantitative column names.

    Returns:
    pd.DataFrame: The summary of the original data, a one-row '----' separator and
        the summary of the transformed data, stacked vertically. The outer index
        level labels the three parts ('Original Summary', '----',
        'Normalized Summary'). Each summary holds the rows mean, std, min, 25%,
        50%, 75%, max and skew.
    """
    summary_df = _summary_with_skew(df[quantitative_cols])

    # High skew in either direction, restricted to columns where sqrt is defined
    # for every value (minimum >= 0).
    skewness = summary_df.loc['skew']
    minimum = summary_df.loc['min']
    high_skew_cols = summary_df.columns[(skewness.abs() > 1) & (minimum >= 0)]

    # Work on a copy so the caller's DataFrame stays untouched.
    df_normalized = df.copy()

    # Square-root transformation of the selected columns.
    # (np.log1p is a possible alternative; plain np.log is not defined at 0.)
    if len(high_skew_cols) > 0:
        df_normalized[high_skew_cols] = np.sqrt(df_normalized[high_skew_cols])

    summary_df_normalized = _summary_with_skew(df_normalized[quantitative_cols])

    # Stack the two summaries vertically with a separator row in between.
    separator = pd.DataFrame([['----'] * len(quantitative_cols)], columns=quantitative_cols)
    stacked_summary = pd.concat(
        [summary_df, separator, summary_df_normalized],
        keys=['Original Summary', '----', 'Normalized Summary'],
    )

    return stacked_summary


def _summary_with_skew(frame):
    """Return the mean/std/min/quartile/max rows of describe() plus a 'skew' row."""
    stats = frame.describe().loc[['mean', 'std', 'min', '25%', '50%', '75%', 'max']]
    stats.loc['skew'] = frame.skew()
    return stats

# Example usage
# Assumes df_data and quantitative_cols are already defined by earlier cells
stacked_summary = normalize_and_summarize(df_data, quantitative_cols)
display(stacked_summary)

Unnamed: 0,Unnamed: 1,Curricular_units_1st_sem_credited,Curricular_units_1st_sem_enrolled,Curricular_units_1st_sem_evaluations,Curricular_units_1st_sem_approved,Curricular_units_1st_sem_grade,Curricular_units_1st_sem_without_evaluations,Curricular_units_2nd_sem_credited,Curricular_units_2nd_sem_enrolled,Curricular_units_2nd_sem_evaluations,Curricular_units_2nd_sem_approved,Curricular_units_2nd_sem_grade,Curricular_units_2nd_sem_without_evaluations,Age_at_enrollment,Inflation_rate,GDP,Unemployment_rate
Original Summary,mean,0.709991,6.27057,8.299051,4.7066,10.640822,0.137658,0.541817,6.232143,8.063291,4.435805,10.230206,0.150316,23.265145,1.228029,0.001969,11.566139
Original Summary,std,2.360507,2.480178,4.179106,3.094238,4.843663,0.69088,1.918546,2.195951,3.947951,3.014764,5.210808,0.753774,7.587816,1.382711,2.269935,2.66385
Original Summary,min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,-0.8,-4.06,7.6
Original Summary,25%,0.0,5.0,6.0,3.0,11.0,0.0,0.0,5.0,6.0,2.0,10.75,0.0,19.0,0.3,-1.7,9.4
Original Summary,50%,0.0,6.0,8.0,5.0,12.285714,0.0,0.0,6.0,8.0,5.0,12.2,0.0,20.0,1.4,0.32,11.1
Original Summary,75%,0.0,7.0,10.0,6.0,13.4,0.0,0.0,7.0,10.0,6.0,13.333333,0.0,25.0,2.6,1.79,13.9
Original Summary,max,20.0,26.0,45.0,26.0,18.875,12.0,19.0,23.0,33.0,20.0,18.571429,12.0,70.0,3.7,3.51,16.2
Original Summary,skew,4.169049,1.619041,0.976637,0.766262,-1.568146,8.207403,4.63482,0.788114,0.336497,0.306279,-1.31365,7.267701,2.054988,0.252375,-0.394068,0.212051
----,0,----,----,----,----,----,----,----,----,----,----,----,----,----,----,----,----
Normalized Summary,mean,0.282992,2.427641,8.299051,4.7066,2.981449,0.0899,0.236649,6.232143,8.063291,4.435805,2.862609,0.091601,4.772031,1.228029,0.001969,11.566139


In [17]:
# Save the cleaned and prepared data
# The prepared dataset lives in df_data. stacked_summary is only the table of
# descriptive statistics (and writing it with index=False would also drop its row
# labels), so it is not what a downstream ML step should load from prep_data.csv.
# NOTE(review): the square-root-transformed copy built inside
# normalize_and_summarize is not returned by that function, so it is not part of
# this file - confirm whether the transformed columns should be persisted too.
df_data.to_csv('data/prep_data.csv', index=False)
print("Prepared for ML data has been saved as 'prep_data.csv'.")

Prepared for ML data has been saved as 'prep_data.csv'.
