In [11]:
# ==================================
# 1. Import necessary libraries
# ==================================
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import sys

In [2]:
# ==================================
# 2. Load the data
# ==================================
# Relative path: resolved against the notebook's working directory.
file_path = "data/dropoutgraduate.csv"

# The file is semicolon-delimited, so the separator has to be given explicitly.
df = pd.read_csv(file_path, sep=";")
# Header cells may carry stray padding; strip it so that later lookups by
# exact column name (e.g. "Target") behave predictably.
df.columns = df.columns.str.strip()
# Status marker repaired: it was stored as mis-decoded UTF-8 bytes.
print("✅ Data columns:", df.columns.tolist())

✅ Data columns: ['Marital status', 'Application mode', 'Application order', 'Course', 'Daytime/evening attendance', 'Previous qualification', 'Previous qualification (grade)', 'Nacionality', "Mother's qualification", "Father's qualification", "Mother's occupation", "Father's occupation", 'Admission grade', 'Displaced', 'Educational special needs', 'Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder', 'Age at enrollment', 'International', 'Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)', 'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 1st sem (without evaluations)', 'Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Curricular units 2nd sem (without evaluations)', 'Unemployment rate', 'Inflation rate', 'GDP', 'Targ

In [3]:
# ==================================
# 3. Define Features (X) and Target (y)
# ==================================
# Fail fast with a real exception. Calling sys.exit() inside a notebook only
# raises SystemExit, which IPython reports as a confusing "To exit: use
# 'exit', 'quit', or Ctrl-D" warning rather than a readable error.
if "Target" not in df.columns:
    raise ValueError(
        "❌ Error: 'Target' column not found in the data. Please check the CSV file!"
    )

X = df.drop(columns="Target")   # Features: every column except the label
y = df["Target"]                # Target: the label to predict

# One-hot encode any categorical (object/string/category dtype) columns;
# numeric columns pass through unchanged. drop_first=True drops one level per
# encoded column so the dummies are not perfectly collinear.
X = pd.get_dummies(X, drop_first=True)

In [4]:
# ==================================
# 4. Split the data (80% train, 20% test)
# ==================================
# Hold out one fifth of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# ==================================
# 5. Feature Scaling (Standardization)
# ==================================
# Learn the per-feature mean and standard deviation from the training rows
# only, then apply that same transformation to both partitions so no
# statistics from the test set leak into the fit.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [6]:
# ==================================
# 6. Train the Logistic Regression Model
# ==================================
# `multi_class` is intentionally not passed: it is deprecated since
# scikit-learn 1.5 (it emits a FutureWarning and is slated for removal).
# With the default solver, the default setting already uses a multinomial fit
# for three or more classes and a regular binary logistic fit for two classes.
#
# class_weight="balanced" reweights samples inversely to class frequency so
# the less frequent class is not neglected during training.
model = LogisticRegression(max_iter=1000, class_weight="balanced")
# Left as the last expression so the fitted estimator is displayed as the
# cell output.
model.fit(X_train, y_train)



0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [12]:
# ==================================
# 7. Make Predictions on the Test Set
# ==================================
# X_test was scaled with the scaler fitted on the training data, so it is in
# the same feature space the model was trained on.
y_pred = model.predict(X_test)

# ==================================
# 8. Evaluate the Model
# ==================================
# Header emoji repaired: it was stored as mis-decoded UTF-8 bytes.
print("\n📊 Model Evaluation Results")
print("Accuracy :", accuracy_score(y_test, y_pred))
# average="weighted" averages the per-class scores weighted by class support.
# Note that support-weighted recall is mathematically equal to accuracy, so
# those two lines always print the same number.
print("Precision:", precision_score(y_test, y_pred, average="weighted"))
print("Recall   :", recall_score(y_test, y_pred, average="weighted"))
print("F1 Score :", f1_score(y_test, y_pred, average="weighted"))
# Per-class breakdown, followed by the raw counts (rows = true class,
# columns = predicted class).
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))


📊 Model Evaluation Results
Accuracy : 0.9049586776859504
Precision: 0.9046870582090331
Recall   : 0.9049586776859504
F1 Score : 0.9046042614333164

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.86      0.88       284
           1       0.91      0.93      0.92       442

    accuracy                           0.90       726
   macro avg       0.90      0.90      0.90       726
weighted avg       0.90      0.90      0.90       726


Confusion matrix:
 [[244  40]
 [ 29 413]]
