# Model1 - Amissions: Accept or Reject?

## Based on findings from feature selections work files: 
 * successful-student-profile/feature-selection-1.ipynb 
 * successful-student-profile/feature-selection-2-with-clustering.ipynb
 * successful-student-profile/feature-selection-3-with-ND-separated.ipynb
 * successful-student-profile/apps-goal-text-analysis-NB.ipynb

## Let's build our first model!

Output: % of graduation of each students
 * assumptions: our data visibility stops after student submit the application
       means we know when the cohort is open, close, start; when student apply, submit; but we don't know when we are going to accept/reject and notify the student, when the student going to put down payment

Approach: ND separated; application type separated

Methods: Logistics Regression / Decision Tree

Features:
 * user_age : account created with Udacity
 * cohort_open_to_notify
 * cohort_open_to_close
 * apply_before_start : days student apply before cohort start
 * apply_to_submit : days from apply to submit application
 * num_course_enrolled
 
 * education
 * employment
 * professional experience
 * python, java, c++, porbability, statistics, linear algebra, computer science, machine learning
 
 * mentioned programming skills / technology in goal
 
Added:
 * ND enrolled previously than applications
 * suspended / paused: -2; cancelled / trial ended: -1; no touch: 0; enrolled: 1; graduated/term finished: 2

In [1]:
import psycopg2
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn import linear_model,tree,svm,ensemble
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
import random
import re
import matplotlib.pyplot as plt
%matplotlib inline

  from pandas.core import datetools


In [9]:
laurel = open('conn_laurel.txt', 'r')
hardy = open('conn_hardy.txt', 'r')
conn_laurel = psycopg2.connect(laurel.read())
conn_hardy = psycopg2.connect(hardy.read())

sql_apps = open('successful-student-profile-apps.sql', 'r')
sql_courses = open('successful-student-profile-courses.sql', 'r')
sql_nd_enrolls = open('successful-student-profile-nd-enrolls.sql','r')
sql_questions = open('successful-student-profile-questions.sql', 'r')

df_apps = pd.read_sql(sql_apps.read(),conn_laurel)
df_courses = pd.read_sql(sql_courses.read(),conn_hardy)
df_nd_enrolls = pd.read_sql(sql_nd_enrolls.read(),conn_laurel)
df_questions = pd.read_sql(sql_questions.read(),conn_laurel)

In [10]:
df_apps.groupby(['nd_key','application_status'])['application_id'].count()

nd_key              application_status
nd001-connect       paid                     1
nd002-connect       paid                    25
nd004-connect-reno  graduated                6
                    paid                    14
nd009-connect       paid                    36
nd013               graduated                8
                    paid                  2543
                    term completed         907
nd209               graduated                4
                    paid                  1181
                    term completed         211
nd889               paid                  1301
                    term completed          93
Name: application_id, dtype: int64

In [51]:
# geo / country -> dummies
country = pd.get_dummies(df_apps['applicant_country'])
df_app = pd.concat([df_apps,country],axis=1)

In [37]:
# ND previously than application
df_nd_enrolls = df_nd_enrolls.assign(status1 = 0)
df_nd_enrolls['status1'] = df_nd_enrolls['status'].map({'SUSPENDED':-2,'CANCELLED':-1,'GRADUATED':3,'PAUSED':-2,'ENROLLED':1,'TERM_COMPLETED':2})
df_nd_enroll = df_nd_enrolls.groupby(['application_id','nd_key'])['status1'].max().reset_index()
df_nd = df_nd_enroll[['application_id','nd_key','status1']].pivot(index='application_id', values='status1', columns='nd_key').fillna(0).reset_index()

In [52]:
# data
df = pd.merge(df_app,df_courses,on=['application_id','cohort_id','applicant_id','nd_key'],how='left')
df = pd.merge(df,df_nd,on=['application_id'],how='left').fillna(0)

In [58]:
df.head()

Unnamed: 0,application_id,cohort_id,applicant_id,nd_key,application_type,applicant_country,applicant_geo,application_status,user_age,cohort_open_to_notify,...,nd113,nd114,nd116,nd124,nd209,nd801,nd802,nd803,nd818,nd889
0,678d0f76-7952-42d2-ba94-bb1a10e63eec,47,9587280937,nd013,term,US,US,term completed,178,65,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,da89c0a7-d45e-45cc-8c23-5f626f397494,283,u22041598,nd889,term,US,US,paid,11,71,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1edefceb-c1c1-4ef5-a5e4-02d20e6262c6,18,905959510,nd013,term,US,US,term completed,226,88,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,b57ef22b-e4d4-4aea-a370-618ed6fb85b9,45,5771700393,nd209,term,US,US,term completed,213,64,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,c873c045-29d0-4c40-a37a-2ede9b601e0f,47,10710257345,nd013,term,US,US,paid,165,65,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
df.shape

(6330, 156)

In [75]:
df_apps.columns

Index(['application_id', 'cohort_id', 'applicant_id', 'nd_key',
       'application_type', 'applicant_country', 'applicant_geo',
       'application_status', 'user_age', 'cohort_open_to_notify',
       'cohort_open_to_close', 'cohort_open_month', 'application_month',
       'apply_before_start', 'apply_to_submit', 'submit_to_decision',
       'submit_to_notify', 'notify_to_pay', 'price'],
      dtype='object')

In [77]:
df_apps.shape

(6330, 19)

## Based on the pivot above, let's pick nd013 as pilot to model.

In [68]:
df_nd013 = df.query("nd_key == 'nd013'")
df_nd013 = df_nd013.assign(status = df_nd013['application_status'].map({'graduated':1,'term completed':1,'paid':0}))
print(df_nd013.shape)
df_nd013.columns

(3458, 157)


Index(['application_id', 'cohort_id', 'applicant_id', 'nd_key',
       'application_type', 'applicant_country', 'applicant_geo',
       'application_status', 'user_age', 'cohort_open_to_notify',
       ...
       'nd114', 'nd116', 'nd124', 'nd209', 'nd801', 'nd802', 'nd803', 'nd818',
       'nd889', 'status'],
      dtype='object', length=157)

In [69]:
df_nd013.groupby(['cohort_id','application_status'])['application_id'].count()

cohort_id  application_status
18         graduated               7
           paid                  609
           term completed        502
220        paid                  326
47         graduated               1
           paid                  465
           term completed        345
88         paid                  746
           term completed         55
89         paid                  397
           term completed          5
Name: application_id, dtype: int64

In [70]:
# among cohorts 18,47,88,89,220, cohort 18 and 47 are closed. cohort 88 is closing in Dec
df_nd013 = df_nd013.loc[(df_nd013['cohort_id']=='18') | (df_nd013['cohort_id']=='47')]
df_nd013.shape

(1929, 157)

In [101]:
# only columns we care about
#cols = ['status','application_id','user_age','cohort_open_to_notify','cohort_open_to_close'
#        ,'apply_before_start','apply_to_submit','num_course_enrolled']
cols = ['status','application_id','user_age', 'cohort_open_to_notify',
       'cohort_open_to_close', 'cohort_open_month', 'application_month',
       'apply_before_start', 'apply_to_submit', 'num_courses',
       'num_course_finished', 'num_course_enrolled',
       'user_study_age'] + list(df['applicant_country'].unique()) + list(df_nd_enrolls['nd_key'].unique())
df_nd013 = df_nd013[cols]

In [102]:
# from questions_responses
# categorical
df1 = df_nd013
categories = {'education':'48e7b492-62b4-4d99-b596-80d68f2966ae'
             ,'employment':'fba3666b-db04-46e9-8f3d-2a303f13e0a5'
             ,'professional_experience':'6967091c-09c6-4455-9f1e-d0de318bacc5'
             ,'goal':'779c3b6c-3648-423b-8d3f-8a4f36f23e2a'}
for i,qr_id in categories.items():
    d = df_questions.copy()
    d[i] = 0
    d[i] = np.where(d['question_id']==qr_id,d['response'],'0')
    o_i = d.groupby('application_id').agg({i:'max'}).reset_index()
    o_i = o_i[o_i.iloc[:,1] != '0']
    o_i = o_i[o_i.iloc[:,1] != 'Other']
    #df1 = pd.merge(df1,o_i,on=['application_id'],how='inner')
    d_i = pd.get_dummies(o_i[i])
    df_i = pd.concat([o_i['application_id'],d_i],axis=1)
    df1 = pd.merge(df1,df_i,on='application_id',how='left')

In [103]:
# from questions_responses
# programming languages
data = df1
for i in (['python','java','cplus','probability','statistics','linear algebra','computer science','machine learning']):
    if i == 'cplus':
        j = 'c\+\+' 
    else:
        j = i
    d = df_questions.copy()
    d[i] = 0
    d[i] = np.where(d['response'].str.lower().str.contains(j),1,d[i])
    d[i] = np.where(d['question_prompt'].str.lower().str.contains(j)&d['response'].str.match('.*[1-9].*'),1,d[i])
    o_i = d.groupby('application_id').agg({i:'max'}).reset_index()
    data = pd.merge(data,o_i,on=['application_id'],how='left')

In [104]:
print(data.columns)

Index(['status', 'application_id', 'user_age', 'cohort_open_to_notify',
       'cohort_open_to_close', 'cohort_open_month', 'application_month',
       'apply_before_start', 'apply_to_submit', 'num_courses',
       ...
       'Help prepare for an advanced degree',
       'Start a new career in this field', 'python', 'java', 'cplus',
       'probability', 'statistics', 'linear algebra', 'computer science',
       'machine learning'],
      dtype='object', length=173)


In [105]:
# from goals : text
#data
df_goal = df_questions.loc[df_questions['question_id'].isin(['2ad03aaa-1b35-4381-9c43-907b1b4eba67','6afe0061-746b-4bd7-807c-393fe5c7599d'])]

In [106]:
def text_process(x):
    import string
    import nltk
    from nltk.corpus import stopwords
    stopWords = set(stopwords.words('english'))   
    x = x.lower()
    x.translate(str.maketrans('', '', string.punctuation))
    return ' '.join([w for w in x.split() if w not in stopWords])

In [107]:
# mentioned ND
nd_words = ['self driving','ai','artificial intelligence','robotics','vr','ar','machine learning','deep learning']
# technology related?
technology_words = ['computer science','machine learning','deep learning']
# some other words?
other_words = ['knowledge','learn','program','skills','experience','engineering','industry','data','field'
              ,'technology','vision','autonomous','project','projects']

df_goal = df_goal.assign(goal = df_goal['response'].apply(lambda x: text_process(x)))

words = {'nd_words': ['self driving','ai','artificial intelligence','robotics','vr','ar','machine learning','deep learning']
        ,'technology_words': ['computer science','machine learning','deep learning']
        ,'other_words': ['knowledge','learn','program','skills','experience','engineering','industry','data','field'
              ,'technology','vision','autonomous','project','projects']}

In [108]:
for i,word_list in words.items():
    d = df_goal.copy()
    d[i] = 0
    d[i] = df_goal['response'].apply(lambda x: 1 if any(w in x for w in word_list) else 0)
    o = d.groupby('application_id').agg({i:'max'}).reset_index()
    data = pd.merge(data,o,on=['application_id'],how='left')

In [109]:
# split into training and test data set
data = data.fillna(0)
ratio = 0.1
N = data.shape[0]
index = random.sample(range(N),int(ratio*N))
TEST = data[data.index.isin(index)].reset_index()
TRAIN = data[~data.index.isin(index)].reset_index()
X_train = TRAIN.iloc[:,3:]
y_train = TRAIN.iloc[:,1]
X_test = TEST.iloc[:,3:]
y_test = TEST.iloc[:,1]

In [110]:
TRAIN.groupby(['status'])['user_age'].count()

status
0    972
1    765
Name: user_age, dtype: int64

In [111]:
765/(972+765)

0.44041450777202074

In [112]:
TEST.groupby(['status'])['user_age'].count()

status
0    102
1     90
Name: user_age, dtype: int64

In [113]:
X_train.head()

Unnamed: 0,user_age,cohort_open_to_notify,cohort_open_to_close,cohort_open_month,application_month,apply_before_start,apply_to_submit,num_courses,num_course_finished,num_course_enrolled,...,java,cplus,probability,statistics,linear algebra,computer science,machine learning,nd_words,technology_words,other_words
0,178,65,56,4.0,6.0,42,3.0,3.0,0.0,0.0,...,0,1,1,1,1,1,1,1.0,1.0,1.0
1,165,65,56,4.0,6.0,55,1.0,0.0,0.0,0.0,...,0,0,1,1,1,1,1,1.0,0.0,1.0
2,202,65,56,4.0,5.0,18,0.0,7.0,5.0,0.0,...,0,1,1,1,1,1,1,1.0,0.0,1.0
3,169,65,56,4.0,6.0,51,1.0,29.0,0.0,0.0,...,0,1,1,1,1,1,1,1.0,0.0,1.0
4,221,88,70,2.0,4.0,69,0.0,1.0,0.0,0.0,...,0,1,1,1,1,1,1,1.0,1.0,1.0


In [114]:
y_train.head()

0    1
1    0
2    0
3    1
4    0
Name: status, dtype: int64

In [115]:
# build 5 models:
clf1 = linear_model.LogisticRegression()
clf2 = tree.DecisionTreeClassifier()
clf3 = svm.SVC()
clf4 = ensemble.RandomForestClassifier()
clf5 = ensemble.GradientBoostingClassifier()

In [116]:
def train_predict(learner, X_train, y_train, X_test, y_test): 
    from sklearn.metrics import accuracy_score,confusion_matrix
    results = {}
    learner = learner.fit(X_train,y_train)
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train)
    results['acc_train'] = accuracy_score(y_train,predictions_train)
    results['acc_test'] = accuracy_score(y_test,predictions_test)
    results['confusion_matrix_test'] = confusion_matrix(y_test,predictions_test)
    print("{} trained on {} samples.".format(learner.__class__.__name__,X_train.shape[0]))
    return results

In [117]:
# 5-fold CV:
kf = KFold(n_splits = 5, shuffle = False, random_state = 42)
results = {}

for clf in [clf1,clf2,clf3,clf4,clf5]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i,index in zip(range(kf.get_n_splits()),kf.split(X_train)):
        x_training,x_validate = X_train.iloc[index[0],:],X_train.iloc[index[1],:]
        y_training,y_validate = y_train.iloc[index[0]],y_train.iloc[index[1]]
        results[clf_name][i] = train_predict(clf,x_training,y_training,x_validate,y_validate)

LogisticRegression trained on 1389 samples.
LogisticRegression trained on 1389 samples.
LogisticRegression trained on 1390 samples.
LogisticRegression trained on 1390 samples.
LogisticRegression trained on 1390 samples.
DecisionTreeClassifier trained on 1389 samples.
DecisionTreeClassifier trained on 1389 samples.
DecisionTreeClassifier trained on 1390 samples.
DecisionTreeClassifier trained on 1390 samples.
DecisionTreeClassifier trained on 1390 samples.
SVC trained on 1389 samples.
SVC trained on 1389 samples.
SVC trained on 1390 samples.
SVC trained on 1390 samples.
SVC trained on 1390 samples.
RandomForestClassifier trained on 1389 samples.
RandomForestClassifier trained on 1389 samples.
RandomForestClassifier trained on 1390 samples.
RandomForestClassifier trained on 1390 samples.
RandomForestClassifier trained on 1390 samples.
GradientBoostingClassifier trained on 1389 samples.
GradientBoostingClassifier trained on 1389 samples.
GradientBoostingClassifier trained on 1390 samples.

In [121]:
for k,learner in enumerate(results.keys()):
    for i in np.arange(5):
        print(k,learner,'acc_test',np.mean(results[learner][i]['acc_test']))

0 LogisticRegression acc_test 0.603448275862
0 LogisticRegression acc_test 0.543103448276
0 LogisticRegression acc_test 0.608069164265
0 LogisticRegression acc_test 0.610951008646
0 LogisticRegression acc_test 0.608069164265
1 DecisionTreeClassifier acc_test 0.505747126437
1 DecisionTreeClassifier acc_test 0.522988505747
1 DecisionTreeClassifier acc_test 0.524495677233
1 DecisionTreeClassifier acc_test 0.524495677233
1 DecisionTreeClassifier acc_test 0.489913544669
2 SVC acc_test 0.537356321839
2 SVC acc_test 0.51724137931
2 SVC acc_test 0.567723342939
2 SVC acc_test 0.527377521614
2 SVC acc_test 0.536023054755
3 RandomForestClassifier acc_test 0.57183908046
3 RandomForestClassifier acc_test 0.534482758621
3 RandomForestClassifier acc_test 0.553314121037
3 RandomForestClassifier acc_test 0.582132564841
3 RandomForestClassifier acc_test 0.57060518732
4 GradientBoostingClassifier acc_test 0.560344827586
4 GradientBoostingClassifier acc_test 0.537356321839
4 GradientBoostingClassifier acc

In [186]:
val = {}
for k, learner in enumerate(results.keys()):
    val[learner] = [results[learner][j]['acc_test'] for j in results[learner]]

for learner in val:
    print(learner,"mean acc: ",sum(val[learner])/len(val[learner]))

LogisticRegression mean acc:  0.594728212263
DecisionTreeClassifier mean acc:  0.513528106264
SVC mean acc:  0.537144324092
RandomForestClassifier mean acc:  0.562474742456
GradientBoostingClassifier mean acc:  0.578041670807


In [122]:
# Logistic Regression
clf = linear_model.LogisticRegression()
clf.fit(X_train,y_train)
predictions = clf.predict(X_test)
accuracy_score(y_test, predictions)

0.578125

In [123]:
features = X_train.describe().T
features

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_age,1737.0,224.446747,35.986135,164.0,193.0,225.0,252.0,288.0
cohort_open_to_notify,1737.0,78.347150,11.353953,65.0,65.0,88.0,88.0,88.0
cohort_open_to_close,1737.0,64.124352,6.911102,56.0,56.0,70.0,70.0,70.0
cohort_open_month,1737.0,2.839378,0.987300,2.0,2.0,2.0,4.0,4.0
application_month,1737.0,4.253886,1.233891,2.0,3.0,4.0,5.0,6.0
apply_before_start,1737.0,39.536557,20.141806,0.0,23.0,43.0,55.0,70.0
apply_to_submit,1737.0,4.974093,11.474020,0.0,0.0,0.0,2.0,67.0
num_courses,1737.0,5.654001,9.373308,0.0,0.0,2.0,7.0,108.0
num_course_finished,1737.0,0.696603,4.040628,0.0,0.0,0.0,0.0,101.0
num_course_enrolled,1737.0,0.263673,2.460663,0.0,0.0,0.0,0.0,92.0


In [125]:
logit = sm.Logit(y_train,X_train)
result = logit.fit()
print(result.summary())

         Current function value: inf
         Iterations: 35


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


LinAlgError: Singular matrix

In [187]:
# Neural Network - keras
import keras
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

Using TensorFlow backend.
  return f(*args, **kwds)


In [188]:
num_classes = 2
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

In [207]:
# Build the model architecture
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=X_train.shape[1]))
model.add(Dropout(.1))
model.add(Dense(64, activation='relu'))
model.add(Dropout(.1))
model.add(Dense(2, activation='softmax'))

# Compiling the model using a loss function and an optimizer.
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_14 (Dense)             (None, 128)               22400     
_________________________________________________________________
dropout_10 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 64)                8256      
_________________________________________________________________
dropout_11 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_16 (Dense)             (None, 2)                 130       
Total params: 30,786
Trainable params: 30,786
Non-trainable params: 0
_________________________________________________________________


In [208]:
# Training the model
model.fit(X_train.as_matrix(), y_train, epochs=300, batch_size=10, verbose=0)

<keras.callbacks.History at 0x120d8b198>

In [209]:
score = model.evaluate(X_test.as_matrix(), y_test, verbose=0)
print("Accuracy: ", score[1])

Accuracy:  0.53125
