# Model1 - Amissions: Accept or Reject?

## Based on findings from feature selections work files: 
 * successful-student-profile/feature-selection-1.ipynb 
 * successful-student-profile/feature-selection-2-with-clustering.ipynb
 * successful-student-profile/feature-selection-3-with-ND-separated.ipynb
 * successful-student-profile/apps-goal-text-analysis-NB.ipynb

## Let's build our first model!

Output: % of graduation of each students
 * assumptions: our data visibility stops after student submit the application
       means we know when the cohort is open, close, start; when student apply, submit; but we don't know when we are going to accept/reject and notify the student, when the student going to put down payment

Approach: ND separated; application type separated

Methods: Logistics Regression / Decision Tree

Features:
 * user_age : account created with Udacity
 * cohort_open_to_notify
 * cohort_open_to_close
 * apply_before_start : days student apply before cohort start
 * apply_to_submit : days from apply to submit application
 * num_course_enrolled
 
 * education
 * employment
 * professional experience
 * python, java, c++, porbability, statistics, linear algebra, computer science, machine learning
 
 * mentioned programming skills / technology in goal

In [25]:
import psycopg2
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn import linear_model,tree,svm,ensemble
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
import random
import re
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
laurel = open('conn_laurel.txt', 'r')
hardy = open('conn_hardy.txt', 'r')
conn_laurel = psycopg2.connect(laurel.read())
conn_hardy = psycopg2.connect(hardy.read())

sql_apps = open('successful-student-profile-apps.sql', 'r')
sql_courses = open('successful-student-profile-courses.sql', 'r')
sql_questions = open('successful-student-profile-questions.sql', 'r')

df_apps = pd.read_sql(sql_apps.read(),conn_laurel)
df_courses = pd.read_sql(sql_courses.read(),conn_hardy)
df_questions = pd.read_sql(sql_questions.read(),conn_laurel)

In [3]:
df_apps.groupby(['nd_key','application_status'])['application_id'].count()

nd_key              application_status
nd001-connect       paid                     1
nd002-connect       paid                    20
nd004-connect-reno  graduated                4
                    paid                    16
nd009-connect       paid                    27
nd013               graduated                4
                    paid                  2663
                    term completed         780
nd209               graduated                4
                    paid                  1160
                    term completed         117
nd889               paid                  1175
                    term completed          29
Name: application_id, dtype: int64

In [4]:
df = pd.merge(df_apps,df_courses,on=['application_id','cohort_id','applicant_id','nd_key'],how='left')

## Based on the pivot above, let's pick nd013 as pilot to model.

In [87]:
df_nd013 = df.query("nd_key == 'nd013'")
df_nd013 = df_nd013.assign(status = df_nd013['application_status'].map({'graduated':1,'term completed':1,'paid':0}))
print(df_nd013.shape)
df_nd013.columns

(3447, 24)


Index(['application_id', 'cohort_id', 'applicant_id', 'nd_key',
       'application_type', 'applicant_country', 'applicant_geo',
       'application_status', 'user_age', 'cohort_open_to_notify',
       'cohort_open_to_close', 'cohort_open_month', 'application_month',
       'apply_before_start', 'apply_to_submit', 'submit_to_decision',
       'submit_to_notify', 'notify_to_pay', 'price', 'num_courses',
       'num_course_finished', 'num_course_enrolled', 'user_study_age',
       'status'],
      dtype='object')

In [88]:
df_nd013.groupby(['cohort_id','application_status'])['application_id'].count()

cohort_id  application_status
18         graduated               4
           paid                  619
           term completed        492
220        paid                  322
47         paid                  544
           term completed        266
88         paid                  781
           term completed         19
89         paid                  397
           term completed          3
Name: application_id, dtype: int64

In [89]:
# among cohorts 18,47,88,89,220, cohort 18 and 47 are closed. cohort 88 is closing in Dec
df_nd013 = df_nd013.loc[(df_nd013['cohort_id']=='18') | (df_nd013['cohort_id']=='47')]
df_nd013.shape

(1925, 24)

In [90]:
# only columns we care about
cols = ['status','application_id','user_age','cohort_open_to_notify','cohort_open_to_close'
        ,'apply_before_start','apply_to_submit','num_course_enrolled']
df_nd013 = df_nd013[cols]

In [91]:
# from questions_responses
# categorical
df1 = df_nd013
categories = {'education':'48e7b492-62b4-4d99-b596-80d68f2966ae'
             ,'employment':'fba3666b-db04-46e9-8f3d-2a303f13e0a5'
             ,'professional_experience':'6967091c-09c6-4455-9f1e-d0de318bacc5'
             ,'goal':'779c3b6c-3648-423b-8d3f-8a4f36f23e2a'}
for i,qr_id in categories.items():
    d = df_questions.copy()
    d[i] = 0
    d[i] = np.where(d['question_id']==qr_id,d['response'],'0')
    o_i = d.groupby('application_id').agg({i:'max'}).reset_index()
    o_i = o_i[o_i.iloc[:,1] != '0']
    o_i = o_i[o_i.iloc[:,1] != 'Other']
    #df1 = pd.merge(df1,o_i,on=['application_id'],how='inner')
    d_i = pd.get_dummies(o_i[i])
    df_i = pd.concat([o_i['application_id'],d_i],axis=1)
    df1 = pd.merge(df1,df_i,on='application_id',how='left')

In [92]:
# from questions_responses
# programming languages
data = df1
for i in (['python','java','cplus','probability','statistics','linear algebra','computer science','machine learning']):
    if i == 'cplus':
        j = 'c\+\+' 
    else:
        j = i
    d = df_questions.copy()
    d[i] = 0
    d[i] = np.where(d['response'].str.lower().str.contains(j),1,d[i])
    d[i] = np.where(d['question_prompt'].str.lower().str.contains(j)&d['response'].str.match('.*[1-9].*'),1,d[i])
    o_i = d.groupby('application_id').agg({i:'max'}).reset_index()
    data = pd.merge(data,o_i,on=['application_id'],how='left')

In [93]:
print(data.columns)

Index(['status', 'application_id', 'user_age', 'cohort_open_to_notify',
       'cohort_open_to_close', 'apply_before_start', 'apply_to_submit',
       'num_course_enrolled', 'Bachelor's degree', 'High school degree',
       'Master's degree', 'PhD', 'Self employed', 'Student', 'Unemployed',
       'Yes, full time', 'Yes, part time or contract', '1-2+ years',
       '3-5 years', '5+ years', '< 1 year',
       'I have no professional experience',
       'General interest in the topic (personal growth and enrichment)',
       'Grow skills for my current role',
       'Help move from academia to industry',
       'Help prepare for an advanced degree',
       'Start a new career in this field', 'python', 'java', 'cplus',
       'probability', 'statistics', 'linear algebra', 'computer science',
       'machine learning'],
      dtype='object')


In [94]:
# from goals : text
#data
df_goal = df_questions.loc[df_questions['question_id'].isin(['2ad03aaa-1b35-4381-9c43-907b1b4eba67','6afe0061-746b-4bd7-807c-393fe5c7599d'])]

In [95]:
def text_process(x):
    import string
    import nltk
    from nltk.corpus import stopwords
    stopWords = set(stopwords.words('english'))   
    x = x.lower()
    x.translate(str.maketrans('', '', string.punctuation))
    return ' '.join([w for w in x.split() if w not in stopWords])

In [96]:
# mentioned ND
nd_words = ['self driving','ai','artificial intelligence','robotics','vr','ar','machine learning','deep learning']
# technology related?
technology_words = ['computer science','machine learning','deep learning']
# some other words?
other_words = ['knowledge','learn','program','skills','experience','engineering','industry','data','field'
              ,'technology','vision','autonomous','project','projects']

df_goal = df_goal.assign(goal = df_goal['response'].apply(lambda x: text_process(x)))

words = {'nd_words': ['self driving','ai','artificial intelligence','robotics','vr','ar','machine learning','deep learning']
        ,'technology_words': ['computer science','machine learning','deep learning']
        ,'other_words': ['knowledge','learn','program','skills','experience','engineering','industry','data','field'
              ,'technology','vision','autonomous','project','projects']}

In [97]:
for i,word_list in words.items():
    d = df_goal.copy()
    d[i] = 0
    d[i] = df_goal['response'].apply(lambda x: 1 if any(w in x for w in word_list) else 0)
    o = d.groupby('application_id').agg({i:'max'}).reset_index()
    data = pd.merge(data,o,on=['application_id'],how='left')

In [135]:
# split into training and test data set
data = data.fillna(0)
ratio = 0.1
N = data.shape[0]
index = random.sample(range(N),int(ratio*N))
TEST = data[data.index.isin(index)].reset_index()
TRAIN = data[~data.index.isin(index)].reset_index()
X_train = TRAIN.iloc[:,3:]
y_train = TRAIN.iloc[:,1]
X_test = TEST.iloc[:,3:]
y_test = TEST.iloc[:,1]

In [113]:
TRAIN.groupby(['status'])['user_age'].count()

status
0    1034
1     699
Name: user_age, dtype: int64

In [160]:
699/1034

0.6760154738878144

In [114]:
TEST.groupby(['status'])['user_age'].count()

status
0    129
1     63
Name: user_age, dtype: int64

In [137]:
X_train.head()

Unnamed: 0,user_age,cohort_open_to_notify,cohort_open_to_close,apply_before_start,apply_to_submit,num_course_enrolled,Bachelor's degree,High school degree,Master's degree,PhD,...,java,cplus,probability,statistics,linear algebra,computer science,machine learning,nd_words,technology_words,other_words
0,152,65,56,52,0.0,0.0,0.0,0.0,1.0,0.0,...,0,1,1,1,1,1,1,1.0,0.0,1.0
1,233,88,70,41,1.0,6.0,1.0,0.0,0.0,0.0,...,0,1,1,1,1,1,1,1.0,1.0,1.0
2,252,65,56,49,0.0,0.0,1.0,0.0,0.0,0.0,...,0,1,1,1,1,1,1,1.0,0.0,1.0
3,217,88,70,57,0.0,0.0,0.0,0.0,1.0,0.0,...,0,1,1,1,1,1,1,1.0,0.0,1.0
4,168,65,56,36,0.0,0.0,0.0,0.0,1.0,0.0,...,0,1,1,1,1,1,1,1.0,0.0,1.0


In [136]:
y_train.head()

0    1
1    1
2    0
3    1
4    0
Name: status, dtype: int64

In [116]:
# build 5 models:
clf1 = linear_model.LogisticRegression()
clf2 = tree.DecisionTreeClassifier()
clf3 = svm.SVC()
clf4 = ensemble.RandomForestClassifier()
clf5 = ensemble.GradientBoostingClassifier()

In [138]:
def train_predict(learner, X_train, y_train, X_test, y_test): 
    from sklearn.metrics import accuracy_score,confusion_matrix
    results = {}
    learner = learner.fit(X_train,y_train)
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train)
    results['acc_train'] = accuracy_score(y_train,predictions_train)
    results['acc_test'] = accuracy_score(y_test,predictions_test)
    results['confusion_matrix_test'] = confusion_matrix(y_test,predictions_test)
    print("{} trained on {} samples.".format(learner.__class__.__name__,X_train.shape[0]))
    return results

In [139]:
# 5-fold CV:
kf = KFold(n_splits = 5, shuffle = False, random_state = 42)
results = {}

for clf in [clf1,clf2,clf3,clf4,clf5]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i,index in zip(range(kf.get_n_splits()),kf.split(X_train)):
        x_training,x_validate = X_train.iloc[index[0],:],X_train.iloc[index[1],:]
        y_training,y_validate = y_train.iloc[index[0]],y_train.iloc[index[1]]
        results[clf_name][i] = train_predict(clf,x_training,y_training,x_validate,y_validate)

LogisticRegression trained on 1386 samples.
LogisticRegression trained on 1386 samples.
LogisticRegression trained on 1386 samples.
LogisticRegression trained on 1387 samples.
LogisticRegression trained on 1387 samples.
DecisionTreeClassifier trained on 1386 samples.
DecisionTreeClassifier trained on 1386 samples.
DecisionTreeClassifier trained on 1386 samples.
DecisionTreeClassifier trained on 1387 samples.
DecisionTreeClassifier trained on 1387 samples.
SVC trained on 1386 samples.
SVC trained on 1386 samples.
SVC trained on 1386 samples.
SVC trained on 1387 samples.
SVC trained on 1387 samples.
RandomForestClassifier trained on 1386 samples.
RandomForestClassifier trained on 1386 samples.
RandomForestClassifier trained on 1386 samples.
RandomForestClassifier trained on 1387 samples.
RandomForestClassifier trained on 1387 samples.
GradientBoostingClassifier trained on 1386 samples.
GradientBoostingClassifier trained on 1386 samples.
GradientBoostingClassifier trained on 1386 samples.

In [142]:
results

{'DecisionTreeClassifier': {0: {'acc_test': 0.5389048991354467,
   'acc_train': 0.99494949494949492,
   'confusion_matrix_test': array([[130,  82],
          [ 78,  57]])},
  1: {'acc_test': 0.51585014409221897,
   'acc_train': 0.99062049062049062,
   'confusion_matrix_test': array([[128,  78],
          [ 90,  51]])},
  2: {'acc_test': 0.51008645533141206,
   'acc_train': 0.99494949494949492,
   'confusion_matrix_test': array([[117, 105],
          [ 65,  60]])},
  3: {'acc_test': 0.53468208092485547,
   'acc_train': 0.99351117519826959,
   'confusion_matrix_test': array([[123,  89],
          [ 72,  62]])},
  4: {'acc_test': 0.49710982658959535,
   'acc_train': 0.99134823359769286,
   'confusion_matrix_test': array([[124,  84],
          [ 90,  48]])}},
 'GradientBoostingClassifier': {0: {'acc_test': 0.57636887608069165,
   'acc_train': 0.71356421356421351,
   'confusion_matrix_test': array([[175,  37],
          [110,  25]])},
  1: {'acc_test': 0.56195965417867433,
   'acc_train': 0

In [156]:
for k, learner in enumerate(results.keys()):
    for j, metric in enumerate(['acc_test']):
        print(k,learner,j,metric,np.mean(results[learner][j][metric]))

0 LogisticRegression 0 acc_test 0.576368876081
1 DecisionTreeClassifier 0 acc_test 0.538904899135
2 SVC 0 acc_test 0.593659942363
3 RandomForestClassifier 0 acc_test 0.590778097983
4 GradientBoostingClassifier 0 acc_test 0.576368876081


In [153]:
np.mean([0.576368876081,0.579250720461,0.593659942363,0.632947976879,0.592485549133])

0.5949426129833999

In [159]:
for k,learner in enumerate(results.keys()):
    for i in np.arange(5):
        print(k,learner,'acc_test',np.mean(results[learner][i]['acc_test']))

0 LogisticRegression acc_test 0.576368876081
0 LogisticRegression acc_test 0.579250720461
0 LogisticRegression acc_test 0.593659942363
0 LogisticRegression acc_test 0.632947976879
0 LogisticRegression acc_test 0.592485549133
1 DecisionTreeClassifier acc_test 0.538904899135
1 DecisionTreeClassifier acc_test 0.515850144092
1 DecisionTreeClassifier acc_test 0.510086455331
1 DecisionTreeClassifier acc_test 0.534682080925
1 DecisionTreeClassifier acc_test 0.49710982659
2 SVC acc_test 0.593659942363
2 SVC acc_test 0.567723342939
2 SVC acc_test 0.599423631124
2 SVC acc_test 0.589595375723
2 SVC acc_test 0.57225433526
3 RandomForestClassifier acc_test 0.590778097983
3 RandomForestClassifier acc_test 0.567723342939
3 RandomForestClassifier acc_test 0.579250720461
3 RandomForestClassifier acc_test 0.606936416185
3 RandomForestClassifier acc_test 0.606936416185
4 GradientBoostingClassifier acc_test 0.576368876081
4 GradientBoostingClassifier acc_test 0.561959654179
4 GradientBoostingClassifier ac

In [17]:
clf = linear_model.LogisticRegression()
clf.fit(X_train,y_train)
predictions = clf.predict(X_test)
accuracy_score(y_test, predictions)

0.578125

In [18]:
features = X_train.describe().T
features

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_age,1733.0,207.871898,36.328432,148.0,175.0,208.0,236.0,272.0
cohort_open_to_notify,1733.0,78.271783,11.365972,65.0,65.0,88.0,88.0,88.0
cohort_open_to_close,1733.0,64.078477,6.918418,56.0,56.0,70.0,70.0,70.0
apply_before_start,1733.0,39.718407,19.945199,0.0,23.0,43.0,55.0,70.0
apply_to_submit,1733.0,4.726486,11.053261,0.0,0.0,0.0,2.0,67.0
num_course_enrolled,1733.0,0.275822,2.479495,0.0,0.0,0.0,0.0,92.0
Bachelor's degree,1733.0,0.313329,0.463981,0.0,0.0,0.0,1.0,1.0
High school degree,1733.0,0.037507,0.190056,0.0,0.0,0.0,0.0,1.0
Master's degree,1733.0,0.491056,0.500064,0.0,0.0,0.0,1.0,1.0
PhD,1733.0,0.13618,0.343079,0.0,0.0,0.0,0.0,1.0


In [19]:
logit = sm.Logit(y_train,X_train)
result = logit.fit()
print(result.summary())

         Current function value: 0.641564
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                 status   No. Observations:                 1733
Model:                          Logit   Df Residuals:                     1698
Method:                           MLE   Df Model:                           34
Date:                Wed, 15 Nov 2017   Pseudo R-squ.:                 0.04532
Time:                        13:42:25   Log-Likelihood:                -1111.8
converged:                      False   LL-Null:                       -1164.6
                                        LLR p-value:                 2.954e-09
                                                                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------------------------------------
user_age                                

