In [58]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn import preprocessing
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import scale
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import scipy as sp
from sklearn.metrics import r2_score
from sklearn.metrics import log_loss
from mlxtend.plotting import scatterplotmatrix
from mlxtend.plotting import heatmap
from mlxtend.plotting import plot_decision_regions
from sklearn.metrics import classification_report, confusion_matrix
import itertools

In [59]:
# Load the raw admissions data; the path is relative to the notebook's working directory.
df = pd.read_csv('Admission_Predict.csv')

In [60]:
# Preview the raw frame as loaded.
df

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit,Race,SES Percentage
0,1,337.0,118.0,4,4.5,4.5,9.65,1,0.92,Asian,12
1,2,324.0,107.0,4,4.0,4.5,8.87,1,0.76,latinx,11
2,3,316.0,104.0,3,3.0,3.5,8.00,1,0.72,latinx,78
3,4,322.0,110.0,3,3.5,2.5,8.67,1,0.80,white,77
4,5,314.0,103.0,2,2.0,3.0,8.21,0,0.65,african american,1
...,...,...,...,...,...,...,...,...,...,...,...
395,396,324.0,110.0,3,3.5,3.5,9.04,1,0.82,white,60
396,397,325.0,107.0,3,3.0,3.5,9.11,1,0.84,latinx,30
397,398,330.0,116.0,4,5.0,4.5,9.45,1,0.91,white,99
398,399,312.0,103.0,3,3.5,4.0,8.78,0,0.67,african american,12


In [61]:
# Keep the column labels for the per-column missing-value check that follows.
# Note: the displayed labels show that 'LOR ' and 'Chance of Admit ' end with a space.
columns = df.columns
columns

Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
       'LOR ', 'CGPA', 'Research', 'Chance of Admit ', 'Race',
       'SES Percentage'],
      dtype='object')

In [62]:
# Print the number of missing values in each column, one print call per column.
for c in columns:
    missing_in_column = df[[c]].isna().sum()
    print(missing_in_column)

Serial No.    0
dtype: int64
GRE Score    4
dtype: int64
TOEFL Score    4
dtype: int64
University Rating    0
dtype: int64
SOP    1
dtype: int64
LOR     1
dtype: int64
CGPA    4
dtype: int64
Research    0
dtype: int64
Chance of Admit     2
dtype: int64
Race    29
dtype: int64
SES Percentage    0
dtype: int64


After looking through the data, we found that only a small number of entries had null values. Given the size of the dataset,
we decided that the best approach would be to simply drop those samples. Many of the NaN values
were in the Race column. To be sure we were not disproportionately eliminating a particular race, we checked that the
resulting proportions of each race among the remaining students were not badly skewed. That analysis appears further down in this notebook.

In [63]:
# Drop every row that has at least one missing value.
# `axis` is passed by keyword: pandas 2.0 made dropna's arguments keyword-only,
# so the positional form `dropna(0, ...)` raises TypeError there.
df.dropna(axis=0, inplace=True)

In [64]:
# Inspect the frame after the rows with missing values were removed.
df

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit,Race,SES Percentage
0,1,337.0,118.0,4,4.5,4.5,9.65,1,0.92,Asian,12
1,2,324.0,107.0,4,4.0,4.5,8.87,1,0.76,latinx,11
2,3,316.0,104.0,3,3.0,3.5,8.00,1,0.72,latinx,78
3,4,322.0,110.0,3,3.5,2.5,8.67,1,0.80,white,77
4,5,314.0,103.0,2,2.0,3.0,8.21,0,0.65,african american,1
...,...,...,...,...,...,...,...,...,...,...,...
395,396,324.0,110.0,3,3.5,3.5,9.04,1,0.82,white,60
396,397,325.0,107.0,3,3.0,3.5,9.11,1,0.84,latinx,30
397,398,330.0,116.0,4,5.0,4.5,9.45,1,0.91,white,99
398,399,312.0,103.0,3,3.5,4.0,8.78,0,0.67,african american,12


We do not need the serial number column.

In [65]:
# Remove the serial-number column, which is not needed for the analysis.
df.drop(columns=['Serial No.'], inplace=True)


In [66]:
# Renumber the rows 0..n-1 now that rows have been dropped. Without drop=True the
# old row labels are kept as a new 'index' column, which a later cell removes.
df.reset_index(inplace = True)

Instead of using an ordinal encoding for race, we use a one-hot encoded vector to represent each student's race,
so that the classifier does not treat the categories as ordered or place a weight on one's race.

In [67]:
# One-hot encode the race labels, one indicator column per distinct value.
# dtype=int pins the indicators to 0/1: pandas 2.0 changed the default to bool,
# which would otherwise put True/False into the frame and the saved CSV.
race = pd.get_dummies(df['Race'], dtype=int)

In [68]:
# Inspect the indicator columns.
race

Unnamed: 0,Asian,african american,latinx,white
0,1,0,0,0
1,0,0,1,0
2,0,0,1,0
3,0,0,0,1
4,0,1,0,0
...,...,...,...,...
351,0,0,0,1
352,0,0,1,0
353,0,0,0,1
354,0,1,0,0


Since we have the dummies, we do not need the race column anymore.

In [69]:
# The indicator columns in `race` now carry this information, so the text column is removed.
df.drop(columns=['Race'], inplace=True)

We concatenate the DataFrame of races to the end of the data, essentially creating four new features

In [70]:
# Append the indicator columns to the right of the existing features (rows are aligned on the index).
df = pd.concat(objs=[df, race], axis='columns')

In [71]:
# Inspect the combined frame.
df

Unnamed: 0,index,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit,SES Percentage,Asian,african american,latinx,white
0,0,337.0,118.0,4,4.5,4.5,9.65,1,0.92,12,1,0,0,0
1,1,324.0,107.0,4,4.0,4.5,8.87,1,0.76,11,0,0,1,0
2,2,316.0,104.0,3,3.0,3.5,8.00,1,0.72,78,0,0,1,0
3,3,322.0,110.0,3,3.5,2.5,8.67,1,0.80,77,0,0,0,1
4,4,314.0,103.0,2,2.0,3.0,8.21,0,0.65,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,395,324.0,110.0,3,3.5,3.5,9.04,1,0.82,60,0,0,0,1
352,396,325.0,107.0,3,3.0,3.5,9.11,1,0.84,30,0,0,1,0
353,397,330.0,116.0,4,5.0,4.5,9.45,1,0.91,99,0,0,0,1
354,398,312.0,103.0,3,3.5,4.0,8.78,0,0.67,12,0,1,0,0


Analyzing the proportions of each race

In [72]:
# Column-wise totals of the one-hot frame give the head-count for each race.
race_totals = race.sum()
white_count = race_totals["white"]
latinx_count = race_totals["latinx"]
asian_count = race_totals["Asian"]
african_american_count = race_totals["african american"]

# Report the counts in the same order and with the same labels as before.
for label, count in (
    ("White count:", white_count),
    ("LatinX count:", latinx_count),
    ("Asian count:", asian_count),
    ("African american count:", african_american_count),
):
    print(label, count)

White count: 185
LatinX count: 67
Asian count: 33
African american count: 71


Moving the y vector "Chance of Admit" to the end

In [73]:
# Pull the target out of the frame and re-attach it as the last column.
# The re-attached label has no trailing space, unlike the original header.
df1 = df.pop('Chance of Admit ').to_frame()
df['Chance of Admit'] = df1

In [74]:
# Discard the old row labels that reset_index stored in the 'index' column.
df.drop(columns=['index'], inplace=True)

Saving the new, cleaned data

In [75]:
# Write the cleaned frame to disk.
# NOTE(review): to_csv writes the row index by default, so the file gets an unnamed
# leading column; pass index=False if downstream readers do not expect it.
df.to_csv('CleanedData.csv')