# DS-SF-27 | Unit Project 3: Basic Machine Learning Modeling

In this project, you will perform a logistic regression on the admissions data we've been working with in Unit Projects 1 and 2.

In [382]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)
pd.set_option('display.notebook_repr_html', True)
import pylab as pl

import statsmodels.formula.api as smf
from math import pi
from sklearn import linear_model

In [383]:
df = pd.read_csv(os.path.join('..', '..', 'dataset', 'ucla-admissions.csv'))
df.dropna(inplace = True)

df

Unnamed: 0,admit,gre,gpa,prestige
0,0,380.0,3.61,3.0
1,1,660.0,3.67,3.0
2,1,800.0,4.00,1.0
3,1,640.0,3.19,4.0
4,0,520.0,2.93,4.0
...,...,...,...,...
395,0,620.0,4.00,2.0
396,0,560.0,3.04,3.0
397,0,460.0,2.63,2.0
398,0,700.0,3.65,2.0


In [384]:
df.describe()

Unnamed: 0,admit,gre,gpa,prestige
count,397.0,397.0,397.0,397.0
mean,0.31738,587.858942,3.392242,2.488665
std,0.466044,115.717787,0.380208,0.947083
min,0.0,220.0,2.26,1.0
25%,0.0,520.0,3.13,2.0
50%,0.0,580.0,3.4,2.0
75%,1.0,660.0,3.67,3.0
max,1.0,800.0,4.0,4.0


In [385]:
df.std()

admit         0.466044
gre         115.717787
gpa           0.380208
prestige      0.947083
dtype: float64

## Part A.  Frequency Table

> ### Question 1.  Create a frequency table for `prestige` and whether or not an applicant was admitted.

In [386]:
# TODO
pd.crosstab(df.admit,df.prestige,dropna = False)

prestige,1.0,2.0,3.0,4.0
admit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,28,95,93,55
1,33,53,28,12


## Part B.  Variable Transformations

> ### Question 2.  Create a one-hot encoding for `prestige`.

In [387]:
# TODO
oh_df = pd.get_dummies(df.prestige, prefix = "Prestige")
oh_df

Unnamed: 0,Prestige_1.0,Prestige_2.0,Prestige_3.0,Prestige_4.0
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0
...,...,...,...,...
395,0.0,1.0,0.0,0.0
396,0.0,0.0,1.0,0.0
397,0.0,1.0,0.0,0.0
398,0.0,1.0,0.0,0.0


In [388]:
oh_df.rename(columns = {'Prestige_1.0': 'Prestige_1',
                           'Prestige_2.0': 'Prestige_2',
                           'Prestige_3.0': 'Prestige_3',
                           'Prestige_4.0': 'Prestige_4'}, inplace = True)
oh_df

Unnamed: 0,Prestige_1,Prestige_2,Prestige_3,Prestige_4
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0
...,...,...,...,...
395,0.0,1.0,0.0,0.0
396,0.0,0.0,1.0,0.0
397,0.0,1.0,0.0,0.0
398,0.0,1.0,0.0,0.0


> ### Question 3.  How many of these binary variables do we need for modeling?

Answer: We would need all of these binary variables for modeling because we do not know if they are statistically significant or not... Upon further testing we can determine which variables provide the best model... 

In [389]:
df.corr()


Unnamed: 0,admit,gre,gpa,prestige
admit,1.0,0.181202,0.174116,-0.243563
gre,0.181202,1.0,0.382408,-0.124533
gpa,0.174116,0.382408,1.0,-0.060976
prestige,-0.243563,-0.124533,-0.060976,1.0


> ### Question 4.  Why are we doing this?

Answer: We are doing this so we can convert the categorical variables in Prestige into a binary variables to create a better model ... by converting the prestige values into binary variables we are able to obtain releveant coefficients when one of the values is 1, whereas they were arbritrally assigned values before.. The prestige column gave us no statistical significance before this one hot encoding.. 

> ### Question 5.  Add all these binary variables in the dataset and remove the now redundant `prestige` feature.

In [390]:
# TODO
df = df.join([oh_df])

In [391]:
df

Unnamed: 0,admit,gre,gpa,prestige,Prestige_1,Prestige_2,Prestige_3,Prestige_4
0,0,380.0,3.61,3.0,0.0,0.0,1.0,0.0
1,1,660.0,3.67,3.0,0.0,0.0,1.0,0.0
2,1,800.0,4.00,1.0,1.0,0.0,0.0,0.0
3,1,640.0,3.19,4.0,0.0,0.0,0.0,1.0
4,0,520.0,2.93,4.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
395,0,620.0,4.00,2.0,0.0,1.0,0.0,0.0
396,0,560.0,3.04,3.0,0.0,0.0,1.0,0.0
397,0,460.0,2.63,2.0,0.0,1.0,0.0,0.0
398,0,700.0,3.65,2.0,0.0,1.0,0.0,0.0


In [392]:
df = df.drop('prestige',axis = 1)

In [393]:
df

Unnamed: 0,admit,gre,gpa,Prestige_1,Prestige_2,Prestige_3,Prestige_4
0,0,380.0,3.61,0.0,0.0,1.0,0.0
1,1,660.0,3.67,0.0,0.0,1.0,0.0
2,1,800.0,4.00,1.0,0.0,0.0,0.0
3,1,640.0,3.19,0.0,0.0,0.0,1.0
4,0,520.0,2.93,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
395,0,620.0,4.00,0.0,1.0,0.0,0.0
396,0,560.0,3.04,0.0,0.0,1.0,0.0
397,0,460.0,2.63,0.0,1.0,0.0,0.0
398,0,700.0,3.65,0.0,1.0,0.0,0.0


## Part C.  Hand calculating odds ratios

Let's develop our intuition about expected outcomes by hand calculating odds ratios.

> ### Question 6.  Create a frequency table for `prestige = 1` and whether or not an applicant was admitted.

In [394]:
# TODO
pd.crosstab(df.Prestige_1, df.admit)


admit,0,1
Prestige_1,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,243,93
1.0,28,33


In [395]:
pd.crosstab(df.Prestige_2, df.admit)

admit,0,1
Prestige_2,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,176,73
1.0,95,53


> ### Question 7.  Use the frequency table above to calculate the odds of being admitted to graduate school for applicants that attended the most prestigious undergraduate schools.

In [398]:
#TODO
odds_prestige_1 = ((df.Prestige_1 == 1) & df.admit == 1).sum() 
odds_prestige_1

33

In [399]:
(df.Prestige_1 == 1).sum()

61

In [400]:
odds_p1 = odds_prestige_1 / 61.0
format(odds_p1,'2f')

'0.540984'

> ### Question 8.  Now calculate the odds of admission for undergraduates who did not attend a #1 ranked college.

In [401]:
# TODO
odds_not_prestige_1 = ((df.Prestige_1 != 1) & df.admit == 1).sum()
odds_not_prestige_1

93

In [402]:
(df.Prestige_1 != 1).sum()

336

In [403]:
odds_not_p1 = odds_not_prestige_1 / 336.0
format(odds_not_p1, '2f')

'0.276786'

> ### Question 9.  Finally, what's the odds ratio?

In [404]:
# TODO
odds_p1 / odds_not_p1 

1.9545214172395557

> ### Question 10.  Write this finding in a sentenance.

Answer: The odds of being admitted into UCLA Grad school are 1.95 times greater if you attend the most Prestigious schools as per ranking criteria.. 

> ### Question 11.  Use the frequency table above to calculate the odds of being admitted to graduate school for applicants that attended the least prestigious undergraduate schools.  Then calculate their odds ratio of being admitted to UCLA.  Finally, write this finding in a sentenance.

In [405]:
# TODO
pd.crosstab(df.Prestige_4, df.admit)

admit,0,1
Prestige_4,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,216,114
1.0,55,12


In [406]:
odds_prestige_4 = ((df.Prestige_4 == 1) & df.admit == 1).sum() 
odds_prestige_4


12

In [407]:
(df.Prestige_4 == 1).sum()

67

In [408]:
odds_p4 = odds_prestige_4 / 67.0
format(odds_p4,'2f')

'0.179104'

In [409]:
odds_not_prestige_4 = ((df.Prestige_4 != 1) & df.admit == 1).sum()
odds_not_prestige_4

114

In [410]:
(df.Prestige_4 != 1).sum()

330

In [411]:
#Prob Not P4 admission status 
odds_not_p4 = odds_not_prestige_4 / 333.0
format(odds_not_p4,'2f')

'0.342342'

In [412]:
#Odds Ratio
odds_p4 / odds_not_p4

0.52317360565593085

Answer: You are .52 less likely to be admitted in UCLA grad school when you attend the lowest ranking prestige school... 

## Part C. Analysis using `statsmodels`

> ### Question 12.  Fit a logistic regression model prediting admission into UCLA using `gre`, `gpa`, and the prestige of the undergraduate schools.  Use the highest prestige undergraduate schools as your reference point.

In [427]:
# TODO
data = df.drop('Prestige_1',axis = 1)

In [428]:
train_cols = data.columns[1:]

In [429]:
logit = smf.Logit(data['admit'],data[train_cols])
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.589121
         Iterations 5


> ### Question 13.  Print the model's summary results.

In [431]:
# TODO
result.summary()

0,1,2,3
Dep. Variable:,admit,No. Observations:,397.0
Model:,Logit,Df Residuals:,392.0
Method:,MLE,Df Model:,4.0
Date:,"Thu, 20 Oct 2016",Pseudo R-squ.:,0.05722
Time:,16:08:40,Log-Likelihood:,-233.88
converged:,True,LL-Null:,-248.08
,,LLR p-value:,1.039e-05

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
gre,0.0014,0.001,1.308,0.191,-0.001 0.003
gpa,-0.1323,0.195,-0.680,0.497,-0.514 0.249
Prestige_2,-0.9562,0.302,-3.171,0.002,-1.547 -0.365
Prestige_3,-1.5375,0.332,-4.627,0.000,-2.189 -0.886
Prestige_4,-1.8699,0.401,-4.658,0.000,-2.657 -1.083


> ### Question 14.  What are the odds ratios of the different features and their 95% confidence intervals?

In [432]:
# TODO
result.conf_int()

Unnamed: 0,0,1
gre,-0.00068,0.003414
gpa,-0.513657,0.249045
Prestige_2,-1.547279,-0.365166
Prestige_3,-2.188769,-0.88623
Prestige_4,-2.656743,-1.083112


In [435]:
np.exp(result.params)

gre           1.001368
gpa           0.876073
Prestige_2    0.384342
Prestige_3    0.214918
Prestige_4    0.154135
dtype: float64

> ### Question 15.  Interpret the odds ratio for `prestige = 2`.

In [434]:
np.exp(result.params)

gre           1.001368
gpa           0.876073
Prestige_2    0.384342
Prestige_3    0.214918
Prestige_4    0.154135
dtype: float64

Answer: This indicates how a 1 unit increase or decrease in a variable affects the odds of being admitted..  Your odds decrease by 38% if you were to attend a prestige 2 level school from a prestige 1 level school 

> ### Question 16.  Interpret the odds ratio of `gpa`.

Answer: The odds ratio of gpa indicates that for every 1 unit increase in GPA your admission rate is either going to go up by 87% or down by 87%.. which makes sense implying that a student with a 3.0 gpa is much less likely to get accepted than a student 4.0 gpa... 

> ### Question 17.  Assuming a student with a GRE of 800 and a GPA of 4.  What is his/her probability of admission  if he/she come from a tier-1, tier-2, tier-3, or tier-4 undergraduate school?

In [440]:
# TODO
pd.crosstab(df.gre,df.gpa)
#Only 5 Students Accepted with 4.0 GPA and 800 GRE... need to locate their presitge rankings now 

gpa,2.26,2.42,2.48,2.52,2.55,...,3.95,3.97,3.98,3.99,4.0
gre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
220.0,0,0,0,0,0,...,0,0,0,0,0
300.0,0,0,0,0,0,...,0,0,0,0,0
340.0,0,0,0,0,0,...,0,0,0,0,0
360.0,0,0,0,0,0,...,0,0,0,0,0
380.0,0,0,0,0,0,...,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
720.0,0,0,0,0,0,...,0,0,0,0,1
740.0,0,0,0,0,0,...,0,0,0,0,2
760.0,0,0,0,0,0,...,0,0,0,0,1
780.0,0,0,0,0,0,...,0,0,0,0,1


Unnamed: 0,admit,gre,gpa,Prestige_1,Prestige_2,Prestige_3,Prestige_4
0,0,380.0,3.61,0.0,0.0,1.0,0.0
1,1,660.0,3.67,0.0,0.0,1.0,0.0
2,1,800.0,4.00,1.0,0.0,0.0,0.0
3,1,640.0,3.19,0.0,0.0,0.0,1.0
4,0,520.0,2.93,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
395,0,620.0,4.00,0.0,1.0,0.0,0.0
396,0,560.0,3.04,0.0,0.0,1.0,0.0
397,0,460.0,2.63,0.0,1.0,0.0,0.0
398,0,700.0,3.65,0.0,1.0,0.0,0.0


Answer:

## Part D. Moving the model from `statsmodels` to `sklearn`

> ### Question 18.  Let's assume we are satisfied with our model.  Remodel it (same features) using `sklearn`.  When creating the logistic regression model with `LogisticRegression(C = 10 ** 2)`.

In [None]:
# TODO

> ### Question 19.  What are the odds ratios for the different variables and how do they compare with the odds ratios calculated with `statsmodels`?

In [None]:
# TODO

Answer:

> ### Question 20.  Again assuming a student with a GRE of 800 and a GPA of 4.  What is his/her probability of admission  if he/she come from a tier-1, tier-2, tier-3, or tier-4 undergraduate school?

In [None]:
# TODO

Answer: