# Model1 - Amissions: Accept or Reject?

## Based on findings from feature selections work files: 
 * successful-student-profile/feature-selection-1.ipynb 
 * successful-student-profile/feature-selection-2-with-clustering.ipynb
 * successful-student-profile/feature-selection-3-with-ND-separated.ipynb
 * successful-student-profile/apps-goal-text-analysis-NB.ipynb

## Let's build our first model!

Output: % of graduation of each students
 * assumptions: our data visibility stops after student submit the application
       means we know when the cohort is open, close, start; when student apply, submit; but we don't know when we are going to accept/reject and notify the student, when the student going to put down payment

Approach: ND separated; application type separated

Methods: Logistics Regression / Decision Tree

Features:
 * user_age : account created with Udacity
 * cohort_open_to_notify
 * cohort_open_to_close
 * apply_before_start : days student apply before cohort start
 * apply_to_submit : days from apply to submit application
 * num_course_enrolled
 
 * education
 * employment
 * professional experience
 * python, java, c++, porbability, statistics, linear algebra, computer science, machine learning
 
 * mentioned programming skills / technology in goal

In [1]:
import psycopg2
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import linear_model
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
import random
import re
import matplotlib.pyplot as plt
%matplotlib inline

  from pandas.core import datetools


In [2]:
laurel = open('conn_laurel.txt', 'r')
hardy = open('conn_hardy.txt', 'r')
conn_laurel = psycopg2.connect(laurel.read())
conn_hardy = psycopg2.connect(hardy.read())

sql_apps = open('successful-student-profile-apps.sql', 'r')
sql_courses = open('successful-student-profile-courses.sql', 'r')
sql_questions = open('successful-student-profile-questions.sql', 'r')

df_apps = pd.read_sql(sql_apps.read(),conn_laurel)
df_courses = pd.read_sql(sql_courses.read(),conn_hardy)
df_questions = pd.read_sql(sql_questions.read(),conn_laurel)

In [3]:
df_apps.groupby(['nd_key','application_status'])['application_id'].count()

nd_key              application_status
nd001-connect       paid                     1
nd002-connect       paid                    20
nd004-connect-reno  graduated                4
                    paid                    16
nd009-connect       paid                    27
nd013               graduated                4
                    paid                  2663
                    term completed         780
nd209               graduated                4
                    paid                  1160
                    term completed         117
nd889               paid                  1175
                    term completed          29
Name: application_id, dtype: int64

In [4]:
df = pd.merge(df_apps,df_courses,on=['application_id','cohort_id','applicant_id','nd_key'],how='left')

## Based on the pivot above, let's pick nd013 as pilot to model.

In [5]:
df_nd013 = df.query("nd_key == 'nd013'")
df_nd013 = df_nd013.assign(status = df_nd013['application_status'].map({'graduated':1,'term completed':1,'paid':0}))
print(df_nd013.shape)
df_nd013.columns

(3447, 24)


Index(['application_id', 'cohort_id', 'applicant_id', 'nd_key',
       'application_type', 'applicant_country', 'applicant_geo',
       'application_status', 'user_age', 'cohort_open_to_notify',
       'cohort_open_to_close', 'cohort_open_month', 'application_month',
       'apply_before_start', 'apply_to_submit', 'submit_to_decision',
       'submit_to_notify', 'notify_to_pay', 'price', 'num_courses',
       'num_course_finished', 'num_course_enrolled', 'user_study_age',
       'status'],
      dtype='object')

In [6]:
df_nd013.groupby(['cohort_id','application_status'])['application_id'].count()

cohort_id  application_status
18         graduated               4
           paid                  619
           term completed        492
220        paid                  322
47         paid                  544
           term completed        266
88         paid                  781
           term completed         19
89         paid                  397
           term completed          3
Name: application_id, dtype: int64

In [7]:
# among cohorts 18,47,88,89,220, cohort 18 and 47 are closed. cohort 88 is closing in Dec
df_nd013 = df_nd013.loc[(df_nd013['cohort_id']=='18') | (df_nd013['cohort_id']=='47')]
df_nd013.shape

(1925, 24)

In [8]:
# only columns we care about
cols = ['status','application_id','user_age','cohort_open_to_notify','cohort_open_to_close'
        ,'apply_before_start','apply_to_submit','num_course_enrolled']
df_nd013 = df_nd013[cols]

In [9]:
# from questions_responses
# categorical
df1 = df_nd013
categories = {'education':'48e7b492-62b4-4d99-b596-80d68f2966ae'
             ,'employment':'fba3666b-db04-46e9-8f3d-2a303f13e0a5'
             ,'professional_experience':'6967091c-09c6-4455-9f1e-d0de318bacc5'
             ,'goal':'779c3b6c-3648-423b-8d3f-8a4f36f23e2a'}
for i,qr_id in categories.items():
    d = df_questions.copy()
    d[i] = 0
    d[i] = np.where(d['question_id']==qr_id,d['response'],'0')
    o_i = d.groupby('application_id').agg({i:'max'}).reset_index()
    o_i = o_i[o_i.iloc[:,1] != '0']
    o_i = o_i[o_i.iloc[:,1] != 'Other']
    #df1 = pd.merge(df1,o_i,on=['application_id'],how='inner')
    d_i = pd.get_dummies(o_i[i])
    df_i = pd.concat([o_i['application_id'],d_i],axis=1)
    df1 = pd.merge(df1,df_i,on='application_id',how='left')

In [10]:
# from questions_responses
# programming languages
data = df1
for i in (['python','java','cplus','probability','statistics','linear algebra','computer science','machine learning']):
    if i == 'cplus':
        j = 'c\+\+' 
    else:
        j = i
    d = df_questions.copy()
    d[i] = 0
    d[i] = np.where(d['response'].str.lower().str.contains(j),1,d[i])
    d[i] = np.where(d['question_prompt'].str.lower().str.contains(j)&d['response'].str.match('.*[1-9].*'),1,d[i])
    o_i = d.groupby('application_id').agg({i:'max'}).reset_index()
    data = pd.merge(data,o_i,on=['application_id'],how='left')

In [11]:
print(data.columns)

Index(['status', 'application_id', 'user_age', 'cohort_open_to_notify',
       'cohort_open_to_close', 'apply_before_start', 'apply_to_submit',
       'num_course_enrolled', 'Bachelor's degree', 'High school degree',
       'Master's degree', 'PhD', 'Self employed', 'Student', 'Unemployed',
       'Yes, full time', 'Yes, part time or contract', '1-2+ years',
       '3-5 years', '5+ years', '< 1 year',
       'I have no professional experience',
       'General interest in the topic (personal growth and enrichment)',
       'Grow skills for my current role',
       'Help move from academia to industry',
       'Help prepare for an advanced degree',
       'Start a new career in this field', 'python', 'java', 'cplus',
       'probability', 'statistics', 'linear algebra', 'computer science',
       'machine learning'],
      dtype='object')


In [14]:
# from goals : text
#data
df_goal = df_questions.loc[df_questions['question_id'].isin(['2ad03aaa-1b35-4381-9c43-907b1b4eba67','6afe0061-746b-4bd7-807c-393fe5c7599d'])]

In [15]:
d = df_goal.copy()
d['goal'] = 0
# mentioned ND
education_words = ['self','driving','car','cars'] #['ai','artificial','intelligence']
# technology related?
technology_words = ['computer','science','machine','learning','deep','learning','learn']
d['goal'] = np.where(d['response'].str.lower().str.contains(j),1,d[i])
d['goal'] = np.where(d['question_prompt'].str.lower().str.contains(j)&d['response'].str.match('.*[1-9].*'),1,d[i])
o = d.groupby('application_id').agg({i:'max'}).reset_index()
data = pd.merge(data,o,on=['application_id'],how='left')

Unnamed: 0,application_id,cohort_id,applicant_id,nd_key,question_id,question_prompt,response,q_r
0,0001883b-f88d-4662-afea-717fbba580a0,129,4746120864,nd889,2ad03aaa-1b35-4381-9c43-907b1b4eba67,What do you hope to accomplish through this pr...,I am currently writing Machine Learning Code a...,What do you hope to accomplish through this pr...
10,0002c98e-89a9-4d54-a48d-ce3586bc10b2,18,10469833750,nd013,2ad03aaa-1b35-4381-9c43-907b1b4eba67,What do you hope to accomplish through this pr...,I'm currently studying master of mechanical/au...,What do you hope to accomplish through this pr...
19,00036e7d-377a-4a85-90cb-a51d870b167c,ntc-2,10567564644,nd001sa,2ad03aaa-1b35-4381-9c43-907b1b4eba67,What do you hope to accomplish through this pr...,The technical field is the most important area...,What do you hope to accomplish through this pr...
33,0006fc3d-bd9f-465a-8600-7a87a659c5ee,283,11052205644,nd889,2ad03aaa-1b35-4381-9c43-907b1b4eba67,What do you hope to accomplish through this pr...,I completed a biomedical engineer bachelors de...,What do you hope to accomplish through this pr...
64,000c3078-246a-4175-93da-8d51db496263,88,10411698310,nd013,2ad03aaa-1b35-4381-9c43-907b1b4eba67,What do you hope to accomplish through this pr...,"I hope to get engaged in such rich field, as I...",What do you hope to accomplish through this pr...


In [100]:
# split into training and test data set
#data = data.fillna(0)
ratio = 0.1
N = data.shape[0]
index = random.sample(range(N),int(ratio*N))
TEST = data[data.index.isin(index)]
TRAIN = data[~data.index.isin(index)]
X_train = TRAIN.iloc[:,2:]
y_train = TRAIN.iloc[:,0]
X_test = TEST.iloc[:,2:]
y_test = TEST.iloc[:,0]

In [101]:
clf = linear_model.LogisticRegression()
clf.fit(X_train,y_train)
predictions = clf.predict(X_test)
accuracy_score(y_test, predictions)

0.63376623376623376

In [102]:
features = X_train.describe().T
features

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_age,1540.0,208.45974,35.923955,147.0,178.0,209.0,237.0,271.0
cohort_open_to_notify,1540.0,78.575974,11.314745,65.0,65.0,88.0,88.0,88.0
cohort_open_to_close,1540.0,64.263636,6.887236,56.0,56.0,70.0,70.0,70.0
apply_before_start,1540.0,39.233117,20.229054,-1.0,22.0,42.0,55.0,70.0
apply_to_submit,1540.0,5.09026,11.56164,0.0,0.0,0.0,2.0,67.0
num_course_enrolled,1540.0,0.269481,2.581381,0.0,0.0,0.0,0.0,92.0
Bachelor's degree,1540.0,0.317532,0.465668,0.0,0.0,0.0,1.0,1.0
High school degree,1540.0,0.034416,0.182353,0.0,0.0,0.0,0.0,1.0
Master's degree,1540.0,0.49026,0.500068,0.0,0.0,0.0,1.0,1.0
PhD,1540.0,0.136364,0.343286,0.0,0.0,0.0,0.0,1.0


In [103]:
logit = sm.Logit(y_train,X_train)
result = logit.fit()
print(result.summary())

         Current function value: 0.639623
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                 status   No. Observations:                 1540
Model:                          Logit   Df Residuals:                     1508
Method:                           MLE   Df Model:                           31
Date:                Tue, 14 Nov 2017   Pseudo R-squ.:                 0.04413
Time:                        15:37:26   Log-Likelihood:                -985.02
converged:                      False   LL-Null:                       -1030.5
                                        LLR p-value:                 8.389e-08
                                                                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------------------------------------
user_age                                

  return np.sqrt(np.diag(self.cov_params()))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [104]:
from keras.models import Sequential
from keras.layers import Conv2D

model = Sequential()
model.add(Conv2D(filters=32, kernel_size=3, strides=2, padding='same', 
    activation='relu', input_shape=(128, 128, 3)))
model.summary()

Using TensorFlow backend.
  return f(*args, **kwds)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 64, 64, 32)        896       
Total params: 896
Trainable params: 896
Non-trainable params: 0
_________________________________________________________________
