In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

## Data processing

In [2]:
# Read the cleaned admissions table from the working directory.
data = pd.read_csv('clean_data.csv')

# Remove the columns that are not used as model inputs.
# NOTE(review): 'Unnamed: 0' is presumably a row index saved into the CSV - confirm.
features = data.drop(columns=['Unnamed: 0', 'univName', 'year'])

# Optional ablation (disabled): additionally drop 'researchExp', 'industryExp',
# 'internExp', 'journalPubs' and 'confPubs' from `features` at this point.

# Separate the target column from the model inputs.
labels = features['admit']
features = features.drop(columns=['admit'])
features.head()

Unnamed: 0,researchExp,industryExp,internExp,greV,greQ,journalPubs,confPubs,targetRank
0,0,18,5.0,160.0,167.0,0,0,91
1,0,66,0.0,146.0,157.0,0,0,91
2,0,0,0.0,148.0,161.0,0,0,91
3,0,0,0.0,150.0,161.0,0,0,91
4,0,0,0.0,147.0,156.0,0,0,91


In [3]:
# Hold out 20% of the rows for testing; the remaining 80% is used for
# training (and validation). random_state pins the split so reruns match.
split_parts = train_test_split(features, labels, test_size=0.2, random_state=1)
x_train, x_test, y_train, y_test = split_parts

# Rescale every feature to the [0, 1] range. The scaler learns its min/max
# from the training rows only and the same mapping is then applied to the
# test rows, so no test-set statistics leak into training.
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

## Models

In [4]:
# Fit every candidate model on the scaled training split. Each estimator's
# fit() returns the fitted estimator, so construction and training are chained.

# Linear regression (continuous output; thresholded at 0.5 when evaluated)
lin_reg = LinearRegression().fit(x_train, y_train)

# Logistic regression
log_reg = LogisticRegression().fit(x_train, y_train)

# Random forest
RF = RandomForestClassifier(
    n_estimators=5,
    max_features=8,
    max_depth=10,
    random_state=1,
).fit(x_train, y_train)

# Gradient boosted decision trees
GBDT = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.1,
    max_features=8,
    max_depth=10,
    random_state=1,
).fit(x_train, y_train)

# Support vector machine
SVM = svm.SVC().fit(x_train, y_train)

# XGBoost
XGB = XGBClassifier(use_label_encoder=False).fit(x_train, y_train)

# evaluate() asserts that its inputs are numpy arrays, so convert the test
# labels (returned by train_test_split) to an ndarray once here.
y_test = np.array(y_test)



## Predictions and evaluation

In [5]:
def evaluate(y_pred, y_test):
    '''
    Print the accuracy, precision, and recall of a binary classifier.

    The positive class is the label 1 (boolean True counts as 1).

    :y_pred: predicted labels (np.ndarray or array-like of 0/1 or bool)
    :y_test: test labels (np.ndarray or array-like of 0/1 or bool),
             same shape as y_pred
    :return: None. The three metrics are printed, one per line. A metric
             whose denominator is zero (empty input, no predicted positives,
             or no actual positives) is printed as nan.
    :raises ValueError: if y_pred and y_test have different shapes.
    '''
    y_pred = np.asarray(y_pred)
    y_test = np.asarray(y_test)
    if y_pred.shape != y_test.shape:
        raise ValueError(
            'y_pred and y_test must have the same shape, got {} and {}'.format(
                y_pred.shape, y_test.shape))

    predicted_positive = y_pred == 1
    actual_positive = y_test == 1
    # Use a logical AND instead of `y_pred + y_test == 2`: adding two boolean
    # numpy arrays is a logical OR and can never equal 2, which silently
    # reported zero precision and recall whenever both inputs were boolean.
    true_positives = np.count_nonzero(predicted_positive & actual_positive)

    def ratio(numerator, denominator):
        # Undefined metrics are reported as nan instead of raising
        # ZeroDivisionError or emitting a numpy RuntimeWarning.
        return numerator / denominator if denominator else float('nan')

    accuracy = ratio(np.count_nonzero(y_pred == y_test), y_test.size)
    precision = ratio(true_positives, np.count_nonzero(predicted_positive))
    recall = ratio(true_positives, np.count_nonzero(actual_positive))
    print('accuracy: {}'.format(accuracy))
    print('precision: {}'.format(precision))
    print('recall: {}'.format(recall))
    return

In [6]:
# One entry per fitted model: (banner to print, model, threshold at 0.5?).
# The two regression entries are cut at 0.5, which turns their predictions
# into a boolean array; the remaining models' predictions are used as-is.
# NOTE(review): LogisticRegression.predict already returns class labels, so
# its 0.5 cut presumably only casts 0/1 labels to booleans - confirm.
evaluation_plan = [
    ('======== Linear regression ========', lin_reg, True),
    ('======== Logistic regression ========', log_reg, True),
    ('======== Random forest ========', RF, False),
    ('======== Gradient boosted decision tree ========', GBDT, False),
    ('======== Support vector machine ========', SVM, False),
    ('======== XGBoost ========', XGB, False),
]

for banner, model, apply_threshold in evaluation_plan:
    print(banner)
    y_pred = model.predict(x_test)
    if apply_threshold:
        y_pred = y_pred > 0.5
    evaluate(y_pred, y_test)

accuracy: 0.603544061302682
precision: 0.6281211625051166
recall: 0.5692821368948247
accuracy: 0.6039272030651341
precision: 0.6277461350691619
recall: 0.5724355407160081
accuracy: 0.6781609195402298
precision: 0.7052759248029109
recall: 0.6471897607122983
accuracy: 0.6770114942528735
precision: 0.6939481268011527
recall: 0.6700055648302726
accuracy: 0.6085249042145594
precision: 0.6279937180997252
recall: 0.593396401409757
accuracy: 0.6818007662835249
precision: 0.697010093315559
recall: 0.6789092932665554
