# Singapore University Acceptance Classification Model


In [1]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Load Data


In [2]:
df = pd.DataFrame()
for file in Path("./data").iterdir():
    df = pd.concat([df, pd.read_csv(file, header=1)])
df.sample(10)

Unnamed: 0,Nationality,Education Track,Results (GPA),A-level Results,IB Score,University,Course Name,Results
23,Singaporean,Poly,3.69,,,NUS,BUSINESS ADMINISTRATION,Accepted by ABA
201,Singaporean,JC,,90,,NUS,PHARMACEUTICAL SCIENCE,Accepted by direct entry/discovery day
234,Singaporean,JC,,88.75 H3 distinction,,NTU,Medicine,Accepted by direct entry/discovery day
74,Singaporean,JC,,75.3,,NTU,Business With Minor In International Trading,Accepted by direct entry/discovery day
195,Singaporean,JC,,88.75,,NTU,Environmental Earth Systems Science And Public...,Accepted by direct entry/discovery day
66,Singaporean,JC,,78.75,,SMU,INFORMATION SYSTEMS WITH A SECOND MAJOR,Accepted by direct entry/discovery day
149,Singaporean,JC,,90,,SMU,Law,Accepted by direct entry/discovery day
183,Singaporean,JC,,80,,NTU,Economics With A Second Major In Business,Accepted by direct entry/discovery day
149,Singaporean,Poly,3.58,,,NUS,ENGINEERING,Accepted by direct entry/discovery day
47,Singaporean,JC,,78.75,,NUS,COLLEGE IF HUMANITIES AND SCIENCES (PREFFERED ...,Accepted by direct entry/discovery day


## Preprocessing

### Rename Columns

First, we rename all the columns and convert everything to lowercase strings.

In [3]:
def rename_columns(df):
    df.columns = [
        "nationality",
        "education",
        "gpa",
        "alevel",
        "ibscore",
        "university",
        "course",
        "accepted",
    ]

    # Change column data to lowercase.
    columns_to_lower = ["nationality", "education", "course", "accepted"]
    for column in columns_to_lower:
        df[column] = df[column].str.lower()
    return df


df = rename_columns(df)
display(df)

Unnamed: 0,nationality,education,gpa,alevel,ibscore,university,course,accepted
0,singaporean,poly,3.94,,,NUS,accountancy,accepted by direct entry/discovery day
1,singaporean,poly,3.0,,,NUS,applied ai and analytics,rejected
2,singaporean,poly,3.69,,,NUS,architecture,accepted by direct entry/discovery day
3,singaporean,jc,,76.25,,NUS,architecture,accepted by aba
4,singaporean,jc,,78.75,,NUS,bba,rejected
...,...,...,...,...,...,...,...,...
221,singaporean,jc,,86.625,,NUS,psychology with second major in management,accepted by direct entry/discovery day
222,singaporean,poly,3.54,,,NUS,real estate,accepted by aba
223,singaporean,jc,,81,,NUS,real estate,accepted by direct entry/discovery day
224,singaporean,jc,,72.5,,NUS,real estate,accepted by aba


### Apply One-Hot Encoding

Next, we apply one-hot encoding to `nationality`, `education`, and `university` fields. We also replace the `results` column with an `accepted` column which is our *target* label.

In [4]:
def one_hot_encoding(df):
    # Applying one-hot encoding to 'nationality' and 'education
    df = pd.get_dummies(df, columns=["nationality", "education", "university"])

    # Encode results with 0 or 1.
    df["accepted"] = df["accepted"].apply(lambda x: True if "accepted" in x else False)
    return df


df = one_hot_encoding(df)
display(df)

Unnamed: 0,gpa,alevel,ibscore,course,accepted,nationality_international,nationality_pr,nationality_singaporean,education_ib,education_jc,education_poly,university_NTU,university_NUS,university_SMU
0,3.94,,,accountancy,True,False,False,True,False,False,True,False,True,False
1,3.0,,,applied ai and analytics,False,False,False,True,False,False,True,False,True,False
2,3.69,,,architecture,True,False,False,True,False,False,True,False,True,False
3,,76.25,,architecture,True,False,False,True,False,True,False,False,True,False
4,,78.75,,bba,False,False,False,True,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221,,86.625,,psychology with second major in management,True,False,False,True,False,True,False,False,True,False
222,3.54,,,real estate,True,False,False,True,False,False,True,False,True,False
223,,81,,real estate,True,False,False,True,False,True,False,False,True,False
224,,72.5,,real estate,True,False,False,True,False,True,False,False,True,False


### Process Grades

In [5]:
def process_grades(df):
    # Remove alphabetical characters from alevel column.
    grade_columns = ["gpa", "alevel", "ibscore"]

    def extract_match(x):
        pattern = r"\d+(\.?\d*)"
        if isinstance(x, str):
            match = re.search(pattern, x)
            if match:
                return match.group()
        elif isinstance(x, float) or isinstance(x, int):
            return x
        return np.nan

    for column in grade_columns:
        df[column] = df[column].apply(extract_match)

    # Scale grades.
    for column in grade_columns:
        scaler = StandardScaler()
        df[column] = scaler.fit_transform(df[column].to_numpy().reshape(-1, 1))

    # Merge grades.
    df["grades"] = df["alevel"].fillna(df["gpa"]).fillna(df["ibscore"])
    df = df.drop(grade_columns, axis=1)
    return df


df = process_grades(df)
display(df)

Unnamed: 0,course,accepted,nationality_international,nationality_pr,nationality_singaporean,education_ib,education_jc,education_poly,university_NTU,university_NUS,university_SMU,grades
0,accountancy,True,False,False,True,False,False,True,False,True,False,1.077826
1,applied ai and analytics,False,False,False,True,False,False,True,False,True,False,-5.010917
2,architecture,True,False,False,True,False,False,True,False,True,False,-0.541520
3,architecture,True,False,False,True,False,True,False,False,True,False,-1.121802
4,bba,False,False,False,True,False,True,False,False,True,False,-0.683968
...,...,...,...,...,...,...,...,...,...,...,...,...
221,psychology with second major in management,True,False,False,True,False,True,False,False,True,False,0.695210
222,real estate,True,False,False,True,False,False,True,False,True,False,-1.513128
223,real estate,True,False,False,True,False,True,False,False,True,False,-0.289917
224,real estate,True,False,False,True,False,True,False,False,True,False,-1.778553


### Process Courses

I choose to divide courses into competitive and non-competitive. Competitive courses include: computer science/engineering, dentistry, law, and medicine.

In [6]:
def process_courses(df):
    pattern = r"computer|dentistry|law|medicine"
    df["competitive"] = df["course"].str.contains(pattern)
    df = df.drop("course", axis=1)
    return df


df = process_courses(df)
display(df)

Unnamed: 0,accepted,nationality_international,nationality_pr,nationality_singaporean,education_ib,education_jc,education_poly,university_NTU,university_NUS,university_SMU,grades,competitive
0,True,False,False,True,False,False,True,False,True,False,1.077826,False
1,False,False,False,True,False,False,True,False,True,False,-5.010917,False
2,True,False,False,True,False,False,True,False,True,False,-0.541520,False
3,True,False,False,True,False,True,False,False,True,False,-1.121802,False
4,False,False,False,True,False,True,False,False,True,False,-0.683968,False
...,...,...,...,...,...,...,...,...,...,...,...,...
221,True,False,False,True,False,True,False,False,True,False,0.695210,False
222,True,False,False,True,False,False,True,False,True,False,-1.513128,False
223,True,False,False,True,False,True,False,False,True,False,-0.289917,False
224,True,False,False,True,False,True,False,False,True,False,-1.778553,False


### Drop Rows

I also want to drop all rows that don't have grades.

In [7]:
df = df.dropna(subset=["grades"])

### Splitting Data


In [8]:
X = df.drop("accepted", axis=1)
y = df["accepted"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1036, 11) (1036,)
(260, 11) (260,)


## Classification Models

### Logistic Regression


In [9]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)

print(f"Train Score: {accuracy_score(train_pred, y_train)}")
print(f"Test Score: {accuracy_score(test_pred, y_test)}")
print(f"Recall: {recall_score(y_test, test_pred)}")
print(f"Precision: {precision_score(y_test, test_pred)}")
print(f"F1 Score: {f1_score(y_test, test_pred)}")

Train Score: 0.9092664092664092
Test Score: 0.9384615384615385
Recall: 1.0
Precision: 0.9384615384615385
F1 Score: 0.9682539682539683
