# Predicting Student Stats in a College

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import accuracy_score
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset; the CSV is parsed with ';' as the field separator.
path = 'data/Student.csv'
df = pd.read_csv(path, sep=';')

# Preview only the first rows rather than rendering the whole frame.
df.head()

## Check for target variable balance

The target classes are imbalanced, so this **needs to be handled later** (for example, with class weights when training the model).

In [None]:
# Bar chart of how many rows fall into each Target class.
# A skewed chart means the imbalance must be dealt with before/while training.
ax = sns.countplot(data=df, y='Target')
ax.set_title('Distribution of target variable')
plt.show()

## Check for null values & Correlation between features

In [None]:
# Number of missing values per column.
print(df.isnull().sum())
print('===============================')
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called on its own; wrapping it in print() would append a stray "None".
df.info()

In [None]:
# Rank every feature by its mutual information with the target.
# (This is an information-gain style score, not a linear correlation.)

# temporarily label-encode any text columns so the estimator receives numbers only
df_temp = df.drop(columns=['Target']).copy()
for col in df_temp.select_dtypes(include=['object']):
    df_temp[col] = LabelEncoder().fit_transform(df_temp[col])

# calculate info gain
# random_state pins the estimator's internal randomness so the ranking is
# identical on every Restart & Run All.
importances = mutual_info_classif(
    df_temp,
    LabelEncoder().fit_transform(df['Target']),
    random_state=42,
)

# display
feature_importances = pd.Series(importances, index=df_temp.columns)
# Label the Axes returned by the plot call instead of going through pyplot's
# "current axes" before anything is drawn, so the label is always attached to
# the chart that is actually shown.
ax = feature_importances.sort_values().plot(kind='barh', figsize=(10, 8))
ax.set_xlabel('Coefficient Value')
plt.show()

Features with high coefficient values (here, a high mutual information score with the target) are often a trap because they are too closely tied to our target. The model will practically use them as a single "God" feature and ignore the others. It will have a high accuracy, but it's not really predicting; it's just describing the present.

## Select features and target

In [None]:
# select features based on importance
features = [
    'Marital status',
    'Course',
    'Previous qualification',
    'Age at enrollment',
    'Scholarship holder',
    'Application mode',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'GDP']
target = 'Target'

# extract X and y
# X is modified later in the notebook (categorical columns are cast to str),
# so take an explicit copy: the cast then never touches `df` and pandas does
# not raise a SettingWithCopyWarning for assigning into a slice.
X = df[features].copy()
y = df[target]

In [None]:
# Show a handful of rows of the selected features.
# random_state keeps the displayed rows identical between runs.
df[features].sample(5, random_state=42)

In [None]:
# Partition the selected columns into categorical and numerical groups;
# the two groups are encoded / handled differently further down.

# Columns treated as categorical.
cat_features = [
    'Marital status',
    'Course',
    'Previous qualification',
    'Application mode',
    'Scholarship holder',
]
# Cast the categorical columns to str so the encoder sees them as labels.
X[cat_features] = X[cat_features].astype(str) # ensure categorical features are string type so we encode them properly

# Every remaining column of X is considered numerical.
num_features = [name for name in X.columns if name not in cat_features]

# Split and Feature Engineering

In [None]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## Encode **Categorical** features

In [None]:
# One-hot encode the categorical columns into a dense array.
# NOTE(review): with drop='first' together with handle_unknown='ignore', a
# category unseen during fit is presumably encoded as all zeros, i.e. the same
# as the dropped first category - confirm this is acceptable.
encoder = OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
    sparse_output=False,
)

# Learn the categories from the training split only, then encode it.
X_train_encoded = encoder.fit_transform(X_train[cat_features])
# Re-use the fitted categories for the test split (no re-fitting).
X_test_encoded = encoder.transform(X_test[cat_features])

## Encode target (optional in DT)

In [None]:
# Map the target labels to integers; the mapping is learned from the
# training labels and then applied unchanged to both splits.
le = LabelEncoder().fit(y_train)
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)


## Scale Numerical Features (NOT needed in DT)

## Concat Categorical and Numerical Features

In [None]:
# Sanity check before stacking: the encoded block must have exactly one row
# per training sample, otherwise hstack cannot line the two pieces up.
print(X_train.shape[0] == X_train_encoded.shape[0])

In [None]:
# Build the final design matrices by placing the numerical columns first and
# the one-hot encoded categorical columns after them (column-wise join).
X_train_final = np.concatenate(
    [X_train[num_features].to_numpy(), X_train_encoded],
    axis=1,
)
X_test_final = np.concatenate(
    [X_test[num_features].to_numpy(), X_test_encoded],
    axis=1,
)

In [None]:
print(f"Final shape of training data: {X_train_final.shape}")

# Train our model

In [None]:
# Shallow entropy-based tree; class_weight='balanced' re-weights the classes
# to compensate for the imbalanced target, and the seed keeps the fit repeatable.
decisionTree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=4,
    class_weight='balanced',
    random_state=42,
)
decisionTree.fit(X_train_final, y_train_encoded)

# Predict the encoded class for every row of the held-out test set.
tree_pred = decisionTree.predict(X_test_final)

In [None]:
# Share of test rows whose predicted class matches the true class.
print("Decision Trees's Accuracy: ", accuracy_score(y_true=y_test_encoded, y_pred=tree_pred))

## Classification report

In [None]:

# Per-class precision / recall / F1 on the held-out test set. With an
# imbalanced target, overall accuracy alone hides how each class is handled.
class_names = [str(label) for label in le.classes_]
print(metrics.classification_report(y_test_encoded, tree_pred, target_names=class_names))

# visualize tree
# Column names follow the order used to build the training matrix:
# numerical features first, then the one-hot encoded categorical columns.
feature_names = list(num_features) + list(encoder.get_feature_names_out())

# A large canvas plus feature/class names makes the nodes readable instead of
# showing anonymous X[i] indices on a default-sized figure.
fig, ax = plt.subplots(figsize=(24, 10))
plot_tree(
    decisionTree,
    feature_names=feature_names,
    class_names=class_names,
    filled=True,
    fontsize=8,
    ax=ax,
)
plt.show()
