# Singapore University Acceptance Classification Model


In [1]:
import re

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Load Data


In [2]:
df_2023 = pd.read_csv("data/2023-NUS.csv", header=1)
df_2022 = pd.read_csv("data/2022-NUS.csv", header=1)
df = pd.concat([df_2023, df_2022])
df.sample(10)

Unnamed: 0,Nationality,Education Track,Results (GPA),A-level Results,IB Score,University,Course Name,Results
102,International,JC,,88.875,,NUS,Computer Science,Accepted by direct entry/discovery day
55,International,JC,,83.75,,NUS,COLLEGE OF HUMANITIES AND SCIENCES,Accepted by direct entry/discovery day
159,Singaporean,JC,,72.5 (first choice),,NUS,Engineering,Rejected
36,Singaporean,JC,,90 + H3 pass,,NUS,BUSINESS ANALYTICS,Accepted by direct entry/discovery day
185,Singaporean,JC,,90,,NUS,MEDICINE,Accepted by direct entry/discovery day
59,Singaporean,JC,,83.75 RP + H3 Distinction,,NUS,COLLEGE OF HUMANITIES AND SCIENCES,Accepted by direct entry/discovery day
237,Singaporean,JC,,87.5,,NUS,Pharmacy,Accepted by direct entry/discovery day
128,Singaporean,JC,,87.75 + H3 Distinction,,NUS,Computer Science With A Minor In Mathematics +...,Accepted by direct entry/discovery day
190,Singaporean,Poly,3.78,,,NUS,Information Systems,Rejected
19,Singaporean,JC,,87.5,,NUS,Business Administration And Computer Science,Rejected


## Preprocessing


In [3]:
# Rename columns
df.columns = [
    "nationality",
    "education",
    "gpa",
    "alevel",
    "ibscore",
    "university",
    "course",
    "accepted",
]

# Change column data to lowercase.
columns_to_lower = ["nationality", "education", "course", "accepted"]
for column in columns_to_lower:
    df[column] = df[column].str.lower()

# Applying one-hot encoding to 'nationality' and 'education
df = pd.get_dummies(df, columns=["nationality", "education"])

# Encode results with 0 or 1.
df["accepted"] = df["accepted"].apply(lambda x: True if "accepted" in x else False)

# Drop 'university' for now because all NUS.
df = df.drop("university", axis=1)

# Also drop 'course' for now.
df = df.drop("course", axis=1)

# Remove alphabetical characters from alevel column.
pattern = r"\d{2}(\.?\d*)?"
df["alevel"] = df["alevel"].apply(
    lambda x: re.search(pattern, x).group() if isinstance(x, str) else np.nan
)

display(df)

Unnamed: 0,gpa,alevel,ibscore,accepted,nationality_international,nationality_pr,nationality_singaporean,education_ib,education_jc,education_poly
0,3.94,,,True,False,False,True,False,False,True
1,3.00,,,False,False,False,True,False,False,True
2,3.69,,,True,False,False,True,False,False,True
3,,76.25,,True,False,False,True,False,True,False
4,,78.75,,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...
221,,86.625,,True,False,False,True,False,True,False
222,3.54,,,True,False,False,True,False,False,True
223,,81,,True,False,False,True,False,True,False
224,,72.5,,True,False,False,True,False,True,False


### Scaling Grades


In [4]:
grade_columns = ["gpa", "alevel", "ibscore"]
grade_scalers = []

for column in grade_columns:
    scaler = StandardScaler()
    df[column] = scaler.fit_transform(df[column].to_numpy().reshape(-1, 1))
    grade_scalers.append(scaler)

display(df[grade_columns])

Unnamed: 0,gpa,alevel,ibscore
0,0.869019,,
1,-4.436400,,
2,-0.541997,,
3,,-1.393636,
4,,-0.941540,
...,...,...,...
221,,0.482562,
222,-1.388606,,
223,,-0.534654,
224,,-2.071780,


### Merging Grades

After scaling `gpa`, `alevel`, `ibscore`, merge them into a single `grades` column.


In [5]:
df["grades"] = df["alevel"].fillna(df["gpa"]).fillna(df["ibscore"])
df = df.drop(grade_columns, axis=1)
display(df)

Unnamed: 0,accepted,nationality_international,nationality_pr,nationality_singaporean,education_ib,education_jc,education_poly,grades
0,True,False,False,True,False,False,True,0.869019
1,False,False,False,True,False,False,True,-4.436400
2,True,False,False,True,False,False,True,-0.541997
3,True,False,False,True,False,True,False,-1.393636
4,False,False,False,True,False,True,False,-0.941540
...,...,...,...,...,...,...,...,...
221,True,False,False,True,False,True,False,0.482562
222,True,False,False,True,False,False,True,-1.388606
223,True,False,False,True,False,True,False,-0.534654
224,True,False,False,True,False,True,False,-2.071780


### Splitting Data


In [6]:
X = df.drop("accepted", axis=1)
y = df["accepted"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(376, 7) (376,)
(95, 7) (95,)


## Classification Models

### Logistic Regression


In [7]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)

print(f"Train Score: {accuracy_score(train_pred, y_train)}")
print(f"Test Score: {accuracy_score(test_pred, y_test)}")

Train Score: 0.8563829787234043
Test Score: 0.8526315789473684
