# Singapore University Acceptance Classification Model


In [1]:
import re

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

## Load Data


In [2]:
df = pd.read_csv("data/2023-NUS.csv", header=1)
df.sample(10)

Unnamed: 0,Nationality,Education Track,Results (GPA),A-level Results,IB Score,University,Course Name,Results
113,Singaporean,Poly,3.94,,,NUS,Computer Science,Accepted by direct entry/discovery day
75,Singaporean,JC,,70,,NUS,College Of Humanities And Sciences (Mt-Related),Accepted by direct entry/discovery day
37,PR,JC,,87.5,,NUS,Business With Psychology (Double Major),Accepted by direct entry/discovery day
150,Singaporean,JC,,75.375 + 2.5 (fcbp),,NUS,Economics (Second Major In Mtl),Accepted by ABA
230,Singaporean,JC,,88.875 + H3 Merit,,NUS,Pharmaceutical Science,Accepted by direct entry/discovery day
38,Singaporean,IB,,,44.0,NUS,Chemical Engineering (Industry 4.0 Specialisat...,Accepted by direct entry/discovery day
168,Singaporean,JC,,87.5,,NUS,Engineering With Second Major In Management An...,Accepted by direct entry/discovery day
144,Singaporean,JC,,88.75 + H3 Merit,,NUS,Dentistry,Accepted by direct entry/discovery day
22,Singaporean,JC,,87.5 (First choice) + H3 Merit,,NUS,Business Administration With Second Major In P...,Accepted by direct entry/discovery day
94,PR,JC,,78.75,,NUS,Computer Engineering,Accepted by direct entry/discovery day


## Data Preprocessing

### Data Overview

- **Features**
  - Nationality
  - Education
  - Results (GPA / A-Level / IB)
  - Course
- **Target**: Results

### Preprocessing Steps

1. One-hot-encoding nationality.
2. One-hot-encoding education.
3. Grades???
4. Categorize course?
5. Convert results to 0 / 1.


In [3]:
# Rename columns
df.columns = ['nationality', 'education', 'gpa', 'alevel', 'ibscore', 'university', 'course', 'accepted']

# Change column data to lowercase.
columns_to_lower = ['nationality', 'education', 'course', 'accepted']
for column in columns_to_lower:
    df[column] = df[column].str.lower()

# Applying one-hot encoding to 'nationality' and 'education
df = pd.get_dummies(df, columns=['nationality', 'education'])

# Encode results with 0 or 1.
df['accepted'] = df['accepted'].apply(lambda x: True if 'accepted' in x else False)

# Drop 'university' for now because all NUS.
df = df.drop('university', axis=1)

# Remove alphabetical characters from alevel column.
pattern = r'\b\d+(\.\d+)?\b'
df['alevel'] = df['alevel'].apply(lambda x: x if  re.sub(pattern, '', str(x)))

display(df)

Unnamed: 0,gpa,alevel,ibscore,course,accepted,nationality_international,nationality_pr,nationality_singaporean,education_ib,education_jc,education_poly
0,3.94,,,accountancy,True,False,False,True,False,False,True
1,3.00,,,applied ai and analytics,False,False,False,True,False,False,True
2,3.69,,,architecture,True,False,False,True,False,False,True
3,,,,architecture,True,False,False,True,False,True,False
4,,,,bba,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...
240,,,,pharmacy,False,False,False,True,False,True,False
241,,,,real estate,True,False,False,True,False,True,False
242,,,,real estate,True,False,False,True,False,True,False
243,,,,real estate,True,False,False,True,False,True,False


### Scaling Grades

In [4]:
grade_columns = ['gpa', 'alevel', 'ibscore']
grade_scalers = []

for column in grade_columns:
    scaler = StandardScaler()
    df[column] = scaler.fit_transform(df[column].to_numpy().reshape(-1, 1))
    grade_scalers.append(scaler)

ValueError: could not convert string to float: ''