# Singapore University Acceptance Classification Model


In [1]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Load Data

In [2]:
df = pd.read_csv("data/2023-NUS.csv", header=1)
df.sample(10)

Unnamed: 0,Nationality,Education Track,Results (GPA),A-level Results,IB Score,University,Course Name,Results
81,Singaporean,JC,,83.75,,NUS,College Of Humanities And Sciences (Quantitati...,Accepted by direct entry/discovery day
26,Singaporean,JC,,86.25,,NUS,Business Analytics,Accepted by direct entry/discovery day
70,Singaporean,JC,,77.5 + 2.5 (1st choice),,NUS,College Of Humanities And Sciences (Life Science),Rejected
186,Singaporean,JC,,87.5,,NUS,Information Security,Rejected
31,Singaporean,JC,,87.5,,NUS,Business Analytics,Accepted by direct entry/discovery day
117,Singaporean,JC,,87.5,,NUS,Computer Science,Rejected
231,Singaporean,JC,,88.75,,NUS,Pharmaceutical Sciences,Accepted by direct entry/discovery day
123,Singaporean,JC,,88.875,,NUS,Computer Science + Nusc,Accepted by direct entry/discovery day
116,International,JC,,87.5 + H3 Distinction,,NUS,Computer Science,Accepted by ABA
88,International,JC,,86.25,,NUS,Computer Engineering,Accepted by direct entry/discovery day


## Preprocessing

In [3]:
# Rename columns
df.columns = [
    "nationality",
    "education",
    "gpa",
    "alevel",
    "ibscore",
    "university",
    "course",
    "accepted",
]

# Change column data to lowercase.
columns_to_lower = ["nationality", "education", "course", "accepted"]
for column in columns_to_lower:
    df[column] = df[column].str.lower()

# Applying one-hot encoding to 'nationality' and 'education
df = pd.get_dummies(df, columns=["nationality", "education"])

# Encode results with 0 or 1.
df["accepted"] = df["accepted"].apply(lambda x: True if "accepted" in x else False)

# Drop 'university' for now because all NUS.
df = df.drop("university", axis=1)

# Also drop 'course' for now.
df = df.drop("course", axis=1)

# Remove alphabetical characters from alevel column.
pattern = r"\d{2}.?\d*"
df["alevel"] = df["alevel"].apply(
    lambda x: re.search(pattern, x).group() if isinstance(x, str) else np.nan
)

display(df)

Unnamed: 0,gpa,alevel,ibscore,accepted,nationality_international,nationality_pr,nationality_singaporean,education_ib,education_jc,education_poly
0,3.94,,,True,False,False,True,False,False,True
1,3.00,,,False,False,False,True,False,False,True
2,3.69,,,True,False,False,True,False,False,True
3,,76.25,,True,False,False,True,False,True,False
4,,78.75,,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...
240,,87.5,,False,False,False,True,False,True,False
241,,78.75,,True,False,False,True,False,True,False
242,,75,,True,False,False,True,False,True,False
243,,83.75,,True,False,False,True,False,True,False


### Scaling Grades

In [4]:
grade_columns = ["gpa", "alevel", "ibscore"]
grade_scalers = []

for column in grade_columns:
    scaler = StandardScaler()
    df[column] = scaler.fit_transform(df[column].to_numpy().reshape(-1, 1))
    grade_scalers.append(scaler)

display(df[grade_columns])

Unnamed: 0,gpa,alevel,ibscore
0,0.791853,,
1,-3.972073,,
2,-0.475149,,
3,,-1.681346,
4,,-1.190893,
...,...,...,...
240,,0.525692,
241,,-1.190893,
242,,-1.926573,
243,,-0.209987,


### Merging Grades

After scaling `gpa`, `alevel`, `ibscore`, merge them into a single `grades` column.

In [5]:
df["grades"] = df["alevel"].fillna(df["gpa"]).fillna(df["ibscore"])
df = df.drop(grade_columns, axis=1)
display(df)

Unnamed: 0,accepted,nationality_international,nationality_pr,nationality_singaporean,education_ib,education_jc,education_poly,grades
0,True,False,False,True,False,False,True,0.791853
1,False,False,False,True,False,False,True,-3.972073
2,True,False,False,True,False,False,True,-0.475149
3,True,False,False,True,False,True,False,-1.681346
4,False,False,False,True,False,True,False,-1.190893
...,...,...,...,...,...,...,...,...
240,False,False,False,True,False,True,False,0.525692
241,True,False,False,True,False,True,False,-1.190893
242,True,False,False,True,False,True,False,-1.926573
243,True,False,False,True,False,True,False,-0.209987


### Splitting Data

In [6]:
X = df.drop("accepted", axis=1)
y = df["accepted"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(196, 7) (196,)
(49, 7) (49,)


## Classification Models

### Logistic Regression

In [7]:
clf = LogisticRegression()
clf.fit(X_train, y_train)
train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)

print(f"Train Score: {accuracy_score(train_pred, y_train)}")
print(f"Test Score: {accuracy_score(test_pred, y_test)}")

Train Score: 0.8724489795918368
Test Score: 0.7959183673469388


In [8]:
print(clf.coef_)

[[ 0.17211598 -0.05522869 -0.11501834  0.50146528 -0.11658072 -0.38301561
   0.31491495]]
