In [145]:
#Import required packages

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler

In [146]:

# Read the cleaned dataset (CSV, path relative to the notebook) into a DataFrame
df_data = pd.read_csv(filepath_or_buffer='data/cleaned_data.csv')

In [147]:
# Function to check the distribution of the target variable
def data_distribution(df, target):
    """
    Calculate the percentage share of each outcome category in the target column.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    target (str): The name of the target column.

    Returns:
    tuple: Percentages (rounded to 2 decimals) of rows labelled 'Graduate', 'Dropout'
           and 'Enrolled', in that order. Each percentage is relative to the total
           number of rows in df, so rows holding any other (or missing) value make
           the three figures sum to less than 100. An empty DataFrame yields
           (0.0, 0.0, 0.0) instead of raising ZeroDivisionError.

    Raises:
    KeyError: If the target column is not present in df.
    """
    # Look the column up first so a wrong column name still fails loudly,
    # even when the frame has no rows.
    labels = df[target]

    total_rows = len(df)
    if total_rows == 0:
        return 0.0, 0.0, 0.0

    def _share(label):
        # int() keeps the result a plain Python float rather than a numpy scalar
        matches = int((labels == label).sum())
        return round(matches / total_rows * 100, 2)

    return _share("Graduate"), _share("Dropout"), _share("Enrolled")

"""
This step calls the data_distribution function, passing the DataFrame and the name of the target column.
It calculates and returns the percentage distribution of 'Graduate', 'Dropout', and 'Enrolled' categories
in the target variable. This is useful to understand the class imbalance in the dataset, which can affect
the performance of machine learning models.
"""

# Show the class balance of the raw target as (Graduate %, Dropout %, Enrolled %)
data_distribution(df=df_data, target="Target")


(49.93, 32.12, 17.95)

### Feature Engineering

In [148]:
# Derived feature: age at enrollment divided by the admission grade
df_data['age_admission_ratio'] = df_data['Age_at_enrollment'].div(df_data['Admission_grade'])

In [149]:
# Standardize the grade, age and ratio features in place
# NOTE(review): the scaler is fit on every row, including the 'Enrolled' rows that a
# later cell removes, so the stored mean/std describe the unfiltered data.
# NOTE(review): 'Age_at_enrollment' is also listed in quantitative_cols further down;
# after standardization it contains negative values, which the square-root based
# skewness step cannot handle. Confirm the intended order of these two steps.
cols_to_scale = ['Admission_grade', 'Age_at_enrollment', 'age_admission_ratio']
scaler = StandardScaler()
df_data[cols_to_scale] = scaler.fit_transform(df_data[cols_to_scale])

In [150]:
"""
This step converts the categorical target variable (e.g., 'Graduate', 'Dropout', 'Enrolled')
into numerical values, which are required for correlation analysis and machine learning models.
"""

# Remove the 'Nationality' and 'International' columns, then keep only rows whose
# outcome is already known (everything except 'Enrolled')
df_data = (
    df_data
    .drop(columns=['Nationality', 'International'])
    .loc[lambda frame: frame['Target'] != 'Enrolled']
)

# Turn the remaining 'Target' labels into integer codes and replace the text column
# with the encoded one.
# NOTE(review): presumably 0 = 'Dropout' and 1 = 'Graduate' once 'Enrolled' is gone;
# confirm with label_encoder.classes_.
label_encoder = LabelEncoder()
df_data = (
    df_data
    .assign(Target_encoded=label_encoder.fit_transform(df_data['Target']))
    .drop(columns=['Target'])
)

display(df_data.head())

Unnamed: 0,Marital_Status,Application_mode,Application_order,Course,Daytime_evening_attendance,Previous_qualification,Previous_qualification_(grade),Mothers_qualification,Fathers_qualification,Mothers_occupation,...,Curricular_units_2nd_sem_enrolled,Curricular_units_2nd_sem_evaluations,Curricular_units_2nd_sem_approved,Curricular_units_2nd_sem_grade,Curricular_units_2nd_sem_without_evaluations,Unemployment_rate,Inflation_rate,GDP,age_admission_ratio,Target_encoded
0,1,17,5,171,1,1,122.0,19,12,5,...,0,0,0,0.0,0,10.8,1.4,1.74,-0.432698,0
1,1,15,1,9254,1,1,160.0,1,3,3,...,6,6,6,13.666667,0,13.9,-0.3,0.79,-0.788651,1
2,1,1,5,9070,1,1,122.0,37,37,9,...,6,0,0,0.0,0,10.8,1.4,1.74,-0.505542,0
3,1,17,2,9773,1,1,122.0,38,37,5,...,6,10,5,12.4,0,9.4,-0.8,-3.12,-0.281266,1
4,2,39,1,8014,0,1,100.0,37,38,9,...,6,6,6,13.0,0,13.9,-0.3,0.79,1.976349,1


### Normalise quantitative columns having high skewness

In [152]:
# Quantitative columns (using the renamed column names): the six per-semester
# curricular-unit metrics for both semesters, followed by age and the macro indicators
quantitative_cols = [
    f'Curricular_units_{semester}_sem_{metric}'
    for semester in ('1st', '2nd')
    for metric in ('credited', 'enrolled', 'evaluations',
                   'approved', 'grade', 'without_evaluations')
] + ['Age_at_enrollment', 'Inflation_rate', 'GDP', 'Unemployment_rate']


In [153]:
def normalize_and_summarize(df, quantitative_cols):
    """
    Reduce skewness in highly skewed quantitative columns and summarize before/after.

    A column counts as highly skewed when its sample skewness is above 1 or below -1.
    Such columns are replaced by their square root in a copy of the input; the input
    DataFrame itself is not modified. Highly skewed columns that contain negative
    values are left unchanged, because the square root is undefined there and would
    silently turn those entries into NaN.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    quantitative_cols (list): List of quantitative column names.

    Returns:
    tuple: (df_normalized, stacked_summary, summary_df_normalized) where
        df_normalized (pd.DataFrame): Copy of df with the square-root transform applied.
        stacked_summary (pd.DataFrame): Original and normalized summaries stacked
            vertically with a '----' separator row between them.
        summary_df_normalized (pd.DataFrame): Summary statistics (including skewness)
            of the quantitative columns after the transform.
    """
    stat_rows = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']

    def _summarize(frame):
        # describe() statistics plus a 'skew' row for the quantitative columns
        summary = frame[quantitative_cols].describe().loc[stat_rows]
        summary.loc['skew'] = frame[quantitative_cols].skew()
        return summary

    # Summarize the frame that was passed in (not a notebook-level global)
    summary_df = _summarize(df)

    # Identify columns with high skewness
    skewness = summary_df.loc['skew']
    high_skew_cols = summary_df.columns[(skewness > 1) | (skewness < -1)]

    # Only non-negative columns can take a square root without producing NaN
    sqrt_cols = [col for col in high_skew_cols if df[col].min() >= 0]

    # Work on a copy so the caller's DataFrame stays untouched
    df_normalized = df.copy()

    # Reduce skewness with a square-root transformation
    if sqrt_cols:
        df_normalized[sqrt_cols] = np.sqrt(df_normalized[sqrt_cols])

    # Numerical summaries for the transformed quantitative variables
    summary_df_normalized = _summarize(df_normalized)

    # Stack the two summaries vertically with a separator row
    separator = pd.DataFrame([['----'] * len(quantitative_cols)], columns=quantitative_cols)
    stacked_summary = pd.concat(
        [summary_df, separator, summary_df_normalized],
        keys=['Original Summary', '----', 'Normalized Summary'],
    )

    return df_normalized, stacked_summary, summary_df_normalized

# Apply the skewness normalisation to the prepared data
# Requires df_data and quantitative_cols from the cells above
(
    df_normalized,
    stacked_summary,
    summary_df_normalized,
) = normalize_and_summarize(df=df_data, quantitative_cols=quantitative_cols)

display(df_normalized)

len(df_normalized)

  result = func(self.values, **kwargs)


Unnamed: 0,Marital_Status,Application_mode,Application_order,Course,Daytime_evening_attendance,Previous_qualification,Previous_qualification_(grade),Mothers_qualification,Fathers_qualification,Mothers_occupation,...,Curricular_units_2nd_sem_enrolled,Curricular_units_2nd_sem_evaluations,Curricular_units_2nd_sem_approved,Curricular_units_2nd_sem_grade,Curricular_units_2nd_sem_without_evaluations,Unemployment_rate,Inflation_rate,GDP,age_admission_ratio,Target_encoded
0,1,17,5,171,1,1,122.0,19,12,5,...,0,0,0,0.000000,0.0,10.8,1.4,1.74,-0.432698,0
1,1,15,1,9254,1,1,160.0,1,3,3,...,6,6,6,3.696846,0.0,13.9,-0.3,0.79,-0.788651,1
2,1,1,5,9070,1,1,122.0,37,37,9,...,6,0,0,0.000000,0.0,10.8,1.4,1.74,-0.505542,0
3,1,17,2,9773,1,1,122.0,38,37,5,...,6,10,5,3.521363,0.0,9.4,-0.8,-3.12,-0.281266,1
4,2,39,1,8014,0,1,100.0,37,38,9,...,6,6,6,3.605551,0.0,13.9,-0.3,0.79,1.976349,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,9773,1,1,125.0,1,1,5,...,6,8,5,3.559026,0.0,15.5,2.8,-4.06,-0.457047,1
4420,1,1,2,9773,1,1,120.0,1,1,9,...,6,6,2,3.316625,0.0,11.1,0.6,2.02,-0.520260,0
4421,1,1,1,9500,1,1,154.0,37,37,9,...,8,9,1,3.674235,0.0,13.9,-0.3,0.79,0.219443,0
4422,1,1,1,9147,1,1,180.0,37,37,7,...,5,6,5,3.464102,0.0,9.4,-0.8,-3.12,-0.837971,1


3630

In [154]:
# Persist the prepared frame as CSV (row index omitted) and confirm on stdout
df_normalized.to_csv(path_or_buf='data/prep_data.csv', index=False)
print("Prepared for ML data has been saved as 'prep_data.csv'.")

Prepared for ML data has been saved as 'prep_data.csv'.
