In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import math

# Extract Features

In [4]:
# Load training data
train_filepath = "train.csv"
traindf = pd.read_csv(train_filepath, sep='\t')
print("Table:", np.shape(traindf))
print("Columns:", traindf.columns)

Table: (10313, 9)
Columns: Index(['pid', 'toefl', 'gre', 'undergra_major', 'undergra_grade', 'school',
       'degree', 'major', 'apply_result'],
      dtype='object')


In [11]:
def lower(row, col):
    return row[col].lower()
traindf["undergra_major"] = traindf.apply(lower, axis=1, args=("undergra_major",))
traindf["major"] = traindf.apply(lower, axis=1, args=("major",))

In [164]:
traindf.head(200)

Unnamed: 0,pid,toefl,gre,undergra_major,undergra_grade,school,degree,major,apply_result
0,2,103.0,324.0,spatial information and digital technology,3.930,University of Michigan Ann Arbor,ms,environmental informatic,ad
1,2,103.0,324.0,spatial information and digital technology,3.930,Boston University,ms,remote sensing,ad
2,2,103.0,324.0,spatial information and digital technology,3.930,SUNY University at Buffalo,mgis,geography,ad
3,2,103.0,324.0,spatial information and digital technology,3.930,University of Iowa,map,geography,ad
4,2,103.0,324.0,spatial information and digital technology,3.930,Clark University,mgis,geography,ad
5,3,100.0,319.0,philosophy,3.600,University of Chicago,ma,maph,ad
6,4,103.0,320.0,chemistry,3.784,University of Illinois at UrbanaChampaign,ba,chemistry,ad
7,5,100.0,325.0,metal materials science and engineering,3.300,University of Pennsylvania,ms,materials engineering,reject
8,6,107.0,324.0,english literature,3.600,University of Pennsylvania,ma,teaching english as a second language,ad
9,6,107.0,324.0,english literature,3.600,University of Washington,ma,public administration,ad


## One-hot Features

In [6]:
def one_dim_split(df, col, cata=None):
    dummies = pd.get_dummies(df[col], prefix=cata, sparse=True)
    print("Column %s has %d values, so we will add as many columns to this dataframe." % (col, dummies.shape[1]))
    return dummies

### One-dimentional Features

In [13]:
# School
school_features = one_dim_split(traindf, "school")
school_features.head()

Column school has 425 values, so we will add as many columns to this dataframe.


Unnamed: 0,Aarhus University,American University,Architectural Association AA School of Architecture,Arizona State University,Auburn University,Australian National University,Barcelona Graduate School of Economics,Baylor College of Medicine,Baylor University,Berlin Mathematical School,...,oklahoma state university,ucb,ucsd,uiuc,upenn,usc,utd,uva,vanderbilt,wustl
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Degree
degree_features = one_dim_split(traindf, "degree")
degree_features.head()

Column degree has 163 values, so we will add as many columns to this dataframe.


Unnamed: 0,16practicumoption,ba,bsc,cccp,crp,dual degree,econ 1 year,edm,education specialist,first year of fiveyear arbriba recognized course,...,pharmd,phd,professional master,professional master program,professional masters degree program,psyd,sais,sm2,smarchs,ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Tokenized Features

In [126]:
# Load stopwords
stopwords = {}
with open("stoplist.txt", "r") as f:
    stopwords = set([w.strip() for w in f])
print("%d stopwords in total." % len(stopwords))

420 stopwords in total.


In [127]:
def token(row, col):
    s = row[col]
    whitewords = '();,.-'
    for c in whitewords: s = s.replace(c, ' ')
    v0 = s.split()
    v = []
    for w in v0:
        if w not in stopwords:
            v.append(w)
    return v

def tokenized_split(df, col, cata=None):
    s = df.apply(token, axis=1, args=(col,))
    dummies = pd.get_dummies(s.apply(pd.Series), prefix=cata, sparse=True).sum(axis=1, level=0)
    print("Column %s has %d values, so we will add as many columns to this dataframe." % (col, dummies.shape[1]))
    return dummies

In [128]:
# Undergra Major
undergra_major_features = tokenized_split(traindf, "undergra_major", "um").astype(float)
undergra_major_features.head()

Column undergra_major has 443 values, so we will add as many columns to this dataframe.


Unnamed: 0,um_3/10,um_acca,um_accounting,um_ad,um_administration,um_advertising,um_ae,um_aerospace,um_agricultural,um_agronomy,...,um_screenwriter,um_studio,um_turn,um_actuarial,um_iowa,um_organized,um_fashion,um_state,um_retail,um_evaluation
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [130]:
# Major
major_features = tokenized_split(traindf, "major", "mj").astype(float)
major_features.head()

Column major has 1015 values, so we will add as many columns to this dataframe.


Unnamed: 0,mj_2year,mj_37,mj_a3sr,mj_aa,mj_accounting,mj_advanced,mj_advertising,mj_aem,mj_aeronautics,mj_aerospace,...,mj_mam,mj_option,mj_organisations,mj_pols,mj_reda,mj_rs,mj_states,mj_clasic,mj_gs,mj_certificate
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Hyper-dimentional Features

In [7]:
def join_columns(row, cols):
    return ",".join([str(row[col]) for col in cols])
def joint_name(cols):
    return ",".join(cols)

## Numerical Features

In [154]:
def norm_log(row, col):
    return math.log(row[col] + 1)
def norm_pro(row, col, max_val):
    return row[col] / max_val

In [155]:
# GPA
gpa_features = traindf.apply(norm_pro, axis=1, args=("undergra_grade", 4.0))

In [156]:
# TOEFL
tf_features = traindf.apply(norm_pro, axis=1, args=("toefl", 120))

In [157]:
# GRE
gre_features = traindf.apply(norm_pro, axis=1, args=("gre", 340))

In [158]:
numerical_features = pd.concat((gpa_features, tf_features, gre_features), axis=1)
numerical_features.head()

Unnamed: 0,0,1,2
0,0.9825,0.858333,0.952941
1,0.9825,0.858333,0.952941
2,0.9825,0.858333,0.952941
3,0.9825,0.858333,0.952941
4,0.9825,0.858333,0.952941


## Result Features

In [83]:
def result_split(row):
    r = row["apply_result"]
    if r == "reject": return 0
    elif r == "ad": return 1
    elif r == "offer": return 1
result_features = traindf.apply(result_split, axis=1)

## Aggregation and Split

In [159]:
from scipy import sparse
features = (school_features, degree_features
    , undergra_major_features, major_features
    , numerical_features
)
aggdf = sparse.hstack([f.to_sparse(fill_value=0).to_coo().tocsr() for f in features], format="csr", dtype=float)
print("Dataset scale: %d x %d" % (aggdf.shape[0], aggdf.shape[1]))

Dataset scale: 10313 x 2049


In [132]:
# Split into 4 parts for cross validation
K = 4
size = math.ceil(aggdf.shape[0] / 4)
Xs = []
Ys = []
for i in range(K):
    start = i * size
    end = (i + 1) * size
    if end > aggdf.shape[0]: end = aggdf.shape[0]
    Xs.append(aggdf[start : end])
    Ys.append(result_features[start : end])
    print("Partition %d size: %d" % (i, end - start))

Partition 0 size: 2579
Partition 1 size: 2579
Partition 2 size: 2579
Partition 3 size: 2576


# Machine Learning

In [133]:
# Root Mean Squared Error
def RMSE(P, Y):
    P = P[~np.isnan(Y)]
    Y = Y[~np.isnan(Y)]
    return np.sqrt(np.sum(np.square(P - Y)) / len(Y))

In [150]:
# Accuracy
def accuracy(P, Y):
    thresh = 0.5
    P = P[~np.isnan(Y)]
    P[np.where(P>=thresh)] = 1
    P[np.where(P<thresh)] = 0
    
    Y = Y[~np.isnan(Y)]
    
    return 1 - np.sum(np.abs(P - Y)) / len(Y)

In [151]:
def test_model(model):
    total_rmse = 0.0
    total_score = 0.0
    for i in range(K):
        X_ = Xs.pop(0); Xs.append(X_)
        Y_ = Ys.pop(0); Ys.append(Y_)
        
        X = sparse.vstack(Xs[0:3])
        Y = pd.concat(Ys[0:3], axis=0)
        
        model.fit(X, Y)
        P_ = model.predict_proba(X_)[:, 1]
        
        rmse = RMSE(P_, Y_)
        total_rmse += rmse
        
        score = accuracy(P_, Y_)
        total_score += score
        
        print("RMSE on partition %d: %f" % (i, rmse))
        print("Score on partition %d: %f" % (i, score))
        
    print("Average RMSE: %f" % (total_rmse / K))
    print("Average Score: %f" % (total_score / K))

## Logistic Classification

In [152]:
from sklearn import linear_model
lr_model = linear_model.LogisticRegression(solver="liblinear", n_jobs=-1, max_iter=1000) # Enable all CPUs
test_model(lr_model)

RMSE on partition 0: 0.406073
Score on partition 0: 0.770963
RMSE on partition 1: 0.396591
Score on partition 1: 0.777821
RMSE on partition 2: 0.403316
Score on partition 2: 0.772005
RMSE on partition 3: 0.411295
Score on partition 3: 0.764250
Average RMSE: 0.404319
Average Score: 0.771259


## Random Forest

In [153]:
from sklearn import ensemble
est_count = 100
rf_model = ensemble.RandomForestClassifier(n_estimators = est_count, criterion="entropy", max_depth=100, n_jobs=-1) # Enable all CPUs
test_model(rf_model)

RMSE on partition 0: 0.398626
Score on partition 0: 0.782997
RMSE on partition 1: 0.392456
Score on partition 1: 0.786351
RMSE on partition 2: 0.400775
Score on partition 2: 0.778209
RMSE on partition 3: 0.413810
Score on partition 3: 0.761535
Average RMSE: 0.401417
Average Score: 0.777273


## AdaBoost

In [124]:
from sklearn import ensemble
from sklearn import linear_model
ab_model = ensemble.AdaBoostClassifier(base_estimator=linear_model.LogisticRegression(solver="saga", n_jobs=-1, max_iter=1000))
test_model(ab_model)

RMSE on partition 0: 0.496459
RMSE on partition 1: 0.496674
RMSE on partition 2: 0.496529
RMSE on partition 3: 0.496448
Average RMSE: 0.496528
