In [55]:
import numpy as np, pandas as pd, tensorflow as tf
from sklearn import preprocessing
from sklearn.impute import KNNImputer
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler

In [56]:
date_column = ['INTAKE TERM CODE', 'ADMIT TERM CODE', 'EXPECTED GRAD TERM CODE']
drop_column = ['ID 2', 'RECORD COUNT', 'PROGRAM LONG NAME', 'STUDENT TYPE NAME',
               'STUDENT TYPE GROUP NAME', 'PREV EDU CRED LEVEL NAME',
               'HS AVERAGE GRADE', 'PROGRAM SEMESTERS', 'TOTAL PROGRAM SEMESTERS',
               'RESIDENCY STATUS NAME', 'CURRENT STAY STATUS', 'APPL FIRST LANGUAGE DESC',
               'MAILING COUNTRY NAME', 'MAILING PROVINCE NAME', 'MAILING CITY NAME', 'MAILING POSTAL CODE']
data = pd.read_excel('HYPE dataset.xlsx', header=0).drop(columns=drop_column)

In [57]:
data['APPL EDUC INST TYPE NAME'] = data['APPL EDUC INST TYPE NAME'].fillna(0).replace('High School', 1)  # high school indicator
data.rename(columns={'APPL EDUC INST TYPE NAME': 'high school indicator', 'SUCCESS LEVEL': 'failure', 'APPLICANT CATEGORY NAME': 'effective academic history'}, inplace=True)
# column 'effective academic history' indicates history within certain years, it's ternary: no, high school, and post secondary
data['effective academic history'].replace({'Mature: Domestic 19 or older No Academic History': 'no', 'High School, Domestic': 'high school', 'BScN, High School Domestic': 'high school'}, inplace=True)
data['effective academic history'].replace(['Mature: Domestic  With Post Secondary', 'International Student, with Post Secondary'], 'post secondary', inplace=True)

In [58]:
# these columns contain illegal values, fill them with nan
data['GENDER'].replace('N', np.nan, inplace=True)
data['ACADEMIC PERFORMANCE'].replace('ZZ - Unknown', np.nan, inplace=True)
data['APPLICANT TARGET SEGMENT NAME'].replace('Unknown', np.nan, inplace=True)

In [59]:
international_postal = ['390', '682', '400', '143', '010']  # overseas zip codes
data['MAILING POSTAL CODE GROUP 3'].replace(international_postal, 'overseas', inplace=True)
data['failure'].replace(['In Progress', 'Successful', 'Unsuccessful'], [0, 0, 1], inplace=True)  # take 'in progress' and 'successful' as not failed
data['HS AVERAGE MARKS'][data['high school indicator'] == 0] = 0  # no mark for those who didn't attend high school

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [60]:
# categorical to numerical
encoder = preprocessing.LabelEncoder()
for column in data.columns:
	data[column] = pd.Series(encoder.fit_transform(data[column][data[column].notna()]), index=data[column][data[column].notna()].index)

In [61]:
# impute missing values
data[['MAILING POSTAL CODE GROUP 3', 'FIRST GENERATION IND']] = KNNImputer().fit_transform(data[['MAILING POSTAL CODE GROUP 3', 'FIRST GENERATION IND']])
data[['APPLICANT TARGET SEGMENT NAME', 'MAILING POSTAL CODE GROUP 3', 'AGE GROUP LONG NAME']] = KNNImputer().fit_transform(data[['APPLICANT TARGET SEGMENT NAME', 'MAILING POSTAL CODE GROUP 3', 'AGE GROUP LONG NAME']])
data[['MAILING POSTAL CODE GROUP 3', 'ENGLISH TEST SCORE']] = KNNImputer().fit_transform(data[['MAILING POSTAL CODE GROUP 3', 'ENGLISH TEST SCORE']])
data[['high school indicator', 'HS AVERAGE MARKS']] = KNNImputer().fit_transform(data[['high school indicator', 'HS AVERAGE MARKS']])
data[['failure', 'ACADEMIC PERFORMANCE']] = KNNImputer().fit_transform(data[['failure', 'ACADEMIC PERFORMANCE']])
data[['PRIMARY PROGRAM CODE', 'GENDER']] = KNNImputer().fit_transform(data[['PRIMARY PROGRAM CODE', 'GENDER']])
data['GENDER'] = np.where(data['GENDER'] > .5, 1, 0)
data['FIRST GENERATION IND'] = np.where(data['FIRST GENERATION IND'] > .5, 1, 0)

In [62]:
reduced_feature = IncrementalPCA(n_components=1).fit_transform(data[['FUNDING SOURCE NAME', 'TIME STATUS NAME']])  # FUNDING SOURCE NAME is highly correlated with TIME STATUS NAME
data.insert(6, 'time and fund', reduced_feature)
data.drop(columns=['FUNDING SOURCE NAME', 'TIME STATUS NAME', 'high school indicator'], inplace=True)

In [63]:
label = data['failure']
data.drop(columns='failure', inplace=True)
data = StandardScaler().fit_transform(data)
dataset = tf.data.Dataset.from_tensor_slices((data, label))

In [64]:
for x, y in dataset:
	print(x, y)

tf.Tensor(
[-2.59119833 -2.56763857  1.08475758  1.42801231 -1.87821761  0.16159725
 -0.25948155  1.25681228 -0.67243878 -0.57965507 -1.27743473  0.69795953
 -2.48968351 -1.3267745   1.31491914 -0.01024246 -0.05945368  1.055426
 -1.19551619 -2.46835387], shape=(20,), dtype=float64) tf.Tensor(1.0, shape=(), dtype=float64)
tf.Tensor(
[-2.59119833 -2.56763857  1.08475758  1.69508225  0.12815131  0.16159725
 -0.25948155  0.84142517 -0.67243878 -0.57965507  0.19178213 -0.04288311
 -2.38277244  0.75370758  2.50723191  1.35815027 -0.85694487  1.055426
  1.33638912  0.43712698], shape=(20,), dtype=float64) tf.Tensor(0.0, shape=(), dtype=float64)
tf.Tensor(
[-2.48170877 -2.45484955 -0.24971399  0.09266257 -1.20942797  0.16159725
  2.04074744 -0.1673721   1.48712423 -0.57965507 -1.27743473  1.19185462
 -1.95512816 -1.3267745  -0.79872624 -0.01024246 -0.85694487 -0.94748471
  0.07043646  0.43712698], shape=(20,), dtype=float64) tf.Tensor(1.0, shape=(), dtype=float64)
tf.Tensor(
[-2.37221921 -2.34