In [34]:
#Import required packages

from ucimlrepo import fetch_ucirepo

import os
import pandas as pd
import pprint

from sklearn.preprocessing import LabelEncoder, StandardScaler


In [35]:

# Read the cleaned dataset (path is relative to the notebook's working
# directory) into the DataFrame used by every later cell.
df_data = pd.read_csv(filepath_or_buffer='data/cleaned_data.csv')

In [37]:
# Function to check the distribution of the target variable
def data_distribution(df, target):
    """
    Calculate the percentage share of each outcome category in the target column.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    target (str): The name of the target column.

    Returns:
    tuple: Percentages (rounded to 2 decimals) of rows whose target value is
        'Graduate', 'Dropout', and 'Enrolled', in that order. The denominator
        is the total number of rows in `df`, so rows with a missing or any
        other target value still count toward the total. An empty DataFrame
        yields (0.0, 0.0, 0.0) instead of raising ZeroDivisionError.

    Raises:
    KeyError: If `target` is not a column of `df`.
    """
    # Count every label in a single pass instead of filtering the frame
    # once per category. Accessing df[target] first also makes a wrong
    # column name fail loudly, even for an empty frame.
    label_counts = df[target].value_counts()

    total_rows = len(df)
    if total_rows == 0:
        # Nothing to distribute; avoid dividing by zero.
        return 0.0, 0.0, 0.0

    def _percentage(label):
        # Labels that never occur fall back to a count of 0.
        occurrences = int(label_counts.get(label, 0))
        return round(occurrences / total_rows * 100, 2)

    return _percentage("Graduate"), _percentage("Dropout"), _percentage("Enrolled")

"""
This step calls the data_distribution function, passing the DataFrame and the name of the target column.
It calculates and returns the percentage distribution of 'Graduate', 'Dropout', and 'Enrolled' categories
in the target variable. This is useful to understand the class imbalance in the dataset, which can affect
the performance of machine learning models.
"""

# Show the (Graduate, Dropout, Enrolled) percentage split of the 'Target' column
data_distribution(df=df_data, target="Target")


(49.93, 32.12, 17.95)

### Feature Engineering

In [40]:
# Step 13: Create New Features
# Derived feature: enrollment age divided element-wise by the admission grade.
df_data['age_admission_ratio'] = (
    df_data['Age_at_enrollment'].div(df_data['Admission_grade'])
)

In [41]:
# Step 14: Normalize/Standardize Data
# Standardize the two raw numeric columns and the derived ratio in place.
# NOTE(review): the scaler is fit on the whole frame; if a train/test split
# happens later, confirm that fitting here does not leak test statistics.
columns_to_scale = ['Admission_grade', 'Age_at_enrollment', 'age_admission_ratio']
scaler = StandardScaler()
df_data[columns_to_scale] = scaler.fit_transform(df_data[columns_to_scale])

In [43]:
# Map each 'Target' label to an integer code and keep it in a new column,
# leaving the original label column untouched.
label_encoder = LabelEncoder()
target_codes = label_encoder.fit_transform(df_data['Target'])
df_data['Target_encoded'] = target_codes
"""
This step converts the categorical target variable (e.g., 'Graduate', 'Dropout', 'Enrolled') 
into numerical values, which are required for correlation analysis and machine learning models.
"""

# Correlate every remaining column (including 'Target_encoded') after
# dropping the raw string 'Target' column.
columns_for_correlation = df_data.drop('Target', axis=1)
correlation_matrix = columns_for_correlation.corr()

In [17]:
# Persist the engineered, scaled and encoded frame for the modelling stage.
# The row index is not written to the file.
prepared_data_path = 'data/prep_data.csv'
df_data.to_csv(prepared_data_path, index=False)
print("Prepared for ML data has been saved as 'prep_data.csv'.")

Prepared for ML data has been saved as 'prep_data.csv'.
