# Singapore University Acceptance Classification Model


In [1]:
import re

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

## Load Data

In [2]:
# Load the 2023 NUS dataset. header=1 tells pandas to take the column names
# from the second line of the file and to skip the line before it.
df = pd.read_csv("data/2023-NUS.csv", header=1)

# Preview a random handful of rows. A fixed random_state keeps the preview
# identical across "Restart & Run All"; min() guards against files that hold
# fewer than 10 rows (sample() raises if asked for more rows than exist).
df.sample(min(10, len(df)), random_state=42)

Unnamed: 0,Nationality,Education Track,Results (GPA),A-level Results,IB Score,University,Course Name,Results
212,Singaporean,JC,,90 + h3 merit,,NUS,Medicine,Shortlisted for interview/written test but rej...
116,International,JC,,87.5 + H3 Distinction,,NUS,Computer Science,Accepted by ABA
50,Singaporean,JC,,88.75,,NUS,Chs Undeclared Major,Accepted by direct entry/discovery day
23,Singaporean,IB,,,43.0,NUS,Business Analytics,Accepted by direct entry/discovery day
165,PR,Poly,3.6319,,,NUS,Engineering With Preferred Major In Electrical,Accepted by direct entry/discovery day
177,Singaporean,Poly,3.73,,,NUS,Humanities And Sciences (Data Science And Anal...,Accepted by ABA
20,Singaporean,IB,,,42.0,NUS,Business Administration With Psychology Double...,Accepted by direct entry/discovery day
227,Singaporean,JC,,87.5,,NUS,Pharmaceutical,Rejected
40,Singaporean,JC,,88.75 +H3 Merit,,NUS,Chemistry,Accepted by direct entry/discovery day
139,PR,JC,,87.5,,NUS,Data Science And Economics,Shortlisted for interview/written test but rej...


## Preprocessing

In [3]:
# Rename columns to short identifiers.
# NOTE: the assignment is positional, so it relies on the loaded frame having
# exactly these eight columns in this order.
df.columns = ['nationality', 'education', 'gpa', 'alevel', 'ibscore', 'university', 'course', 'accepted']

# Lowercase the free-text columns so later string matching is case-insensitive.
columns_to_lower = ['nationality', 'education', 'course', 'accepted']
for column in columns_to_lower:
    df[column] = df[column].str.lower()

# Apply one-hot encoding to 'nationality' and 'education'.
df = pd.get_dummies(df, columns=['nationality', 'education'])

# Encode the outcome as a boolean: True when the (lowercased) result text
# contains "accepted", False otherwise (e.g. "rejected", "shortlisted ... but
# rejected"). regex=False makes this a plain substring test; na=False maps a
# missing outcome to False instead of raising.
df['accepted'] = df['accepted'].str.contains('accepted', regex=False, na=False)

# Drop 'university' for now because all NUS.
df = df.drop(columns='university')

# Pull the numeric score out of the free-text A-level column, discarding
# trailing annotations such as "+ H3 Distinction".
# The pattern requires at least two digits (so the "3" in "h3" is never picked
# up when it precedes the score) followed by an optional decimal part. The dot
# is escaped so it only matches a literal decimal point, not any character.
pattern = r"\d{2,}(?:\.\d+)?"


def parse_alevel_score(value):
    """Return the A-level score contained in ``value`` as a float.

    Args:
        value: A raw cell from the 'alevel' column. Usually a string such as
            "87.5 + H3 Distinction", or NaN when the applicant has no A-level
            result.

    Returns:
        The first number with two or more integer digits found in the string,
        as a float; the value itself (as a float) if it is already numeric;
        ``np.nan`` for missing values or strings that contain no score.
    """
    if isinstance(value, str):
        match = re.search(pattern, value)
        # A string with no recognisable score is treated as missing rather
        # than raising AttributeError on match.group().
        return float(match.group()) if match else np.nan
    if isinstance(value, (int, float, np.number)):
        # Already numeric (this also covers NaN, which stays NaN).
        return float(value)
    return np.nan


df['alevel'] = df['alevel'].apply(parse_alevel_score)

display(df)

Unnamed: 0,gpa,alevel,ibscore,course,accepted,nationality_international,nationality_pr,nationality_singaporean,education_ib,education_jc,education_poly
0,3.94,,,accountancy,True,False,False,True,False,False,True
1,3.00,,,applied ai and analytics,False,False,False,True,False,False,True
2,3.69,,,architecture,True,False,False,True,False,False,True
3,,76.25,,architecture,True,False,False,True,False,True,False
4,,78.75,,bba,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...
240,,87.5,,pharmacy,False,False,False,True,False,True,False
241,,78.75,,real estate,True,False,False,True,False,True,False
242,,75,,real estate,True,False,False,True,False,True,False
243,,83.75,,real estate,True,False,False,True,False,True,False


### Scaling Grades


In [4]:
# Standardise each grade column independently with its own scaler.
# grade_scalers[i] is the scaler fitted on grade_columns[i], kept so the same
# transformation can be reapplied (or inverted) later.
grade_columns = ["gpa", "alevel", "ibscore"]
grade_scalers = [StandardScaler() for _ in grade_columns]

for column, scaler in zip(grade_columns, grade_scalers):
    # The scaler expects a 2-D (n_samples, 1) array, hence the reshape.
    # Missing grades stay NaN in the scaled output, as the table below shows.
    values_2d = df[column].to_numpy().reshape(-1, 1)
    df[column] = scaler.fit_transform(values_2d)

display(df[grade_columns])

Unnamed: 0,gpa,alevel,ibscore
0,0.791853,,
1,-3.972073,,
2,-0.475149,,
3,,-1.681346,
4,,-1.190893,
...,...,...,...
240,,0.525692,
241,,-1.190893,
242,,-1.926573,
243,,-0.209987,
