In [None]:

import pandas as pd

# Read the cleaned student records; the file is decoded as ISO-8859-1
# (Latin-1) rather than pandas' default encoding.
student_data_df = pd.read_csv(
    'datacleaned.csv',
    encoding='ISO-8859-1',
)


In [None]:

from scipy.stats import zscore
import numpy as np
import pandas as pd

def remove_outliers(df, threshold=3):
    """Drop rows that hold an outlier in any numeric column, judged by Z-score.

    A row is kept only when every numeric cell in it has an absolute Z-score
    (computed per column, population standard deviation) strictly below
    ``threshold``. Non-numeric columns are ignored for the decision but are
    preserved in the result.

    Cells whose Z-score is undefined are *not* treated as outliers:
      * a zero-variance (constant) column yields NaN for every row, and
      * a missing value yields NaN for that cell.
    Without this rule a single constant column or a single NaN would make
    every comparison False and silently discard the whole dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    threshold : float, default 3
        Absolute Z-score at or above which a value counts as an outlier.

    Returns
    -------
    pandas.DataFrame
        The retained rows of ``df`` with their original index labels.
    """
    numeric = df.select_dtypes(include=[np.number])
    if numeric.empty:
        # No numeric columns (or no rows at all): nothing can be flagged.
        return df.copy()

    # Work on a plain float array so nullable dtypes (pd.NA) become NaN.
    values = numeric.to_numpy(dtype=float, na_value=np.nan)
    with np.errstate(divide='ignore', invalid='ignore'):
        # 'omit' keeps one missing cell from turning its whole column into NaN.
        z_scores = np.abs(zscore(values, axis=0, nan_policy='omit'))

    # Undefined scores (NaN) carry no evidence of being an outlier.
    acceptable = (z_scores < threshold) | np.isnan(z_scores)
    return df[acceptable.all(axis=1)]
    

In [None]:

# Filter the raw frame with remove_outliers at its default threshold.
student_data_cleaned = remove_outliers(df=student_data_df)
    

In [None]:

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Separate the label from the features first, so the column lists below are
# derived from the features only. This avoids patching 'Target' out of a list
# afterwards, which raises ValueError when 'Target' is not a non-numeric column.
X = student_data_cleaned.drop('Target', axis=1)
y = student_data_cleaned['Target']

numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

# Numeric features are standardised; every other feature is one-hot encoded
# with the first level dropped.
transformers = [
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(drop='first'), categorical_cols)
]

preprocessor = ColumnTransformer(transformers, remainder='passthrough')

# NOTE(review): the preprocessor is fitted on every row, before any
# train/test split, so rows later used for testing also influence the
# scaling statistics and the encoder categories.
X_transformed = preprocessor.fit_transform(X)
    

In [None]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the transformed rows for evaluation; the fixed seed makes
# the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed,
    y,
    test_size=0.2,
    random_state=42,
)
    

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

# Logistic regression on the original (non-resampled) training split.
# fit() returns the estimator itself, so construction and fitting are chained.
logreg = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Class predictions for the held-out rows.
y_pred = logreg.predict(X_test)
    

In [None]:

from sklearn.utils import resample

# Split BEFORE resampling. Upsampling with replacement duplicates rows; if the
# split happened afterwards, copies of the same row would land in both the
# training and the test partition and inflate every test metric.
# The split uses the same test_size and random_state as the baseline split.
train_df, test_df = train_test_split(student_data_cleaned, test_size=0.2, random_state=42)

# Per-class views of the TRAINING partition only.
dropout = train_df[train_df['Target'] == 'Dropout']
enrolled = train_df[train_df['Target'] == 'Enrolled']
graduate = train_df[train_df['Target'] == 'Graduate']

# Resample 'Dropout' and 'Enrolled' with replacement up to the size of 'Graduate'.
dropout_upsampled = resample(dropout, replace=True, n_samples=len(graduate), random_state=42)
enrolled_upsampled = resample(enrolled, replace=True, n_samples=len(graduate), random_state=42)

upsampled = pd.concat([dropout_upsampled, enrolled_upsampled, graduate])

X_upsampled = upsampled.drop('Target', axis=1)
y_upsampled = upsampled['Target']

# Reuse the already-fitted preprocessor so both partitions share one feature space.
X_upsampled_transformed = preprocessor.transform(X_upsampled)

# Train on the balanced data; evaluate on untouched rows that keep the
# original class distribution.
X_train_up, y_train_up = X_upsampled_transformed, y_upsampled
X_test_up = preprocessor.transform(test_df.drop('Target', axis=1))
y_test_up = test_df['Target']
    

In [None]:

# Same logistic-regression configuration as before, fitted on the upsampled
# training split. fit() returns the estimator, so the calls are chained.
logreg_up = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_up, y_train_up)

# Class predictions for the upsampled experiment's test split.
y_pred_up = logreg_up.predict(X_test_up)
    

In [None]:

from sklearn.ensemble import RandomForestClassifier

# Random forest with library-default hyperparameters and a fixed seed,
# fitted on the upsampled training split (fit() returns the estimator).
rf = RandomForestClassifier(random_state=42).fit(X_train_up, y_train_up)

# Class predictions for the upsampled experiment's test split.
y_pred_rf = rf.predict(X_test_up)
    