# Logistic Regression

HW2 reference

In [2]:
#Import some example data

import pandas as pd
# target = InMichelin, whether or not a restaurant is in the Michelin guide
# do not forget the encoding 
data = pd.read_csv("http://gattonweb.uky.edu/sheather/book/docs/datasets/MichelinNY.csv" , encoding="latin_1")
data.head()

#update data to set up for train test split
data = data.loc[:, data.columns != 'Restaurant Name']
y = data['InMichelin']
X = data.loc[:, data.columns != 'InMichelin']

In [3]:
#Set up training and test data
from sklearn.model_selection import train_test_split

# random_state like set.seed in R
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42) 

#Note: random_state ensures same data will be generated for example each time
from sklearn.linear_model import LogisticRegression

#Note: logistic regression in sklearn is preset to be a regularization model with C=100).
#If you make C really high the model effectively becomes a logistic regression model...

# C is the tuning parameter like alpha in Reidge and Lasso regression
# Reguilization

# set C high to get normal logistic regression
logreg = LogisticRegression(C=1e90).fit(X_train, y_train)

print("logreg .coef_: {}".format(logreg .coef_))


print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))


predicted_vals = logreg.predict(X_test) # y_pred includes your predictions
print("logreg.predict: {}".format(predicted_vals))



logreg .coef_: [[ 0.3810703   0.07415961 -0.1569253   0.0819146 ]]
Training set score: 0.797
Test set score: 0.780
logreg.predict: [0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 1 1]


  from collections import Sequence


In [4]:
logreg

#Use ?LogisticRegression() for more information

LogisticRegression(C=1e+90, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## Logistic Regression in statsmodels package

In [5]:
# the same using different package like lm() in R
import statsmodels.api as sm

# add constant to matrix
X_train_new = sm.add_constant(X_train)

# Generized linear model using binomial model,
model = sm.GLM(y_train, X_train_new, family=sm.families.Binomial()).fit()

model.summary()


0,1,2,3
Dep. Variable:,InMichelin,No. Observations:,123
Model:,GLM,Df Residuals:,118
Model Family:,Binomial,Df Model:,4
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-57.266
Date:,"Thu, 04 Oct 2018",Deviance:,114.53
Time:,17:00:58,Pearson chi2:,254.
No. Iterations:,6,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-10.6490,2.588,-4.115,0.000,-15.722,-5.576
Food,0.3818,0.148,2.572,0.010,0.091,0.673
Decor,0.0743,0.103,0.720,0.471,-0.128,0.277
Service,-0.1569,0.147,-1.070,0.285,-0.444,0.131
Price,0.0819,0.036,2.269,0.023,0.011,0.153


## Logistic Regression with constraints on size of coefficients

In [6]:
# Smaller C will constrain Betas more.  It's a tuning parameter we can find using gridsearch.
# Smaller C = Stronger constraints

#C=100, compare coefs to regular model above.
logreg = LogisticRegression(C=100).fit(X_train, y_train)

print("logreg .coef_: {}".format(logreg .coef_))


print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))


predicted_vals = logreg.predict(X_test) # y_pred includes your predictions
print("logreg.predict: {}".format(predicted_vals))

logreg .coef_: [[ 0.35927528  0.06667776 -0.16192467  0.08317683]]
Training set score: 0.797
Test set score: 0.780
logreg.predict: [0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 1 1]


In [7]:

#C=1, compare coefs to above models.
logreg = LogisticRegression(C=1).fit(X_train, y_train)

print("logreg .coef_: {}".format(logreg .coef_))


print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))


predicted_vals = logreg.predict(X_test) # y_pred includes your predictions
print("logreg.predict: {}".format(predicted_vals))

logreg .coef_: [[ 0.08104081 -0.04500912 -0.25388726  0.11700635]]
Training set score: 0.756
Test set score: 0.780
logreg.predict: [0 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 1 0]


In [8]:

#C=.01, compare coefs to above models.

#Does the model's prediction power get better or worse??

logreg = LogisticRegression(C=1).fit(X_train, y_train)

print("logreg .coef_: {}".format(logreg .coef_))


print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))


predicted_vals = logreg.predict(X_test) # y_pred includes your predictions
print("logreg.predict: {}".format(predicted_vals))

logreg .coef_: [[ 0.08104081 -0.04500912 -0.25388726  0.11700635]]
Training set score: 0.756
Test set score: 0.780
logreg.predict: [0 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 1 0]


## Multiclass models (Multinomial model)

In [9]:
from sklearn.datasets import load_iris
import numpy as np

iris = load_iris()
iris
X, y = iris.data, iris.target

print(iris.feature_names )# X variable names
print(X[0:5]) # first five rows of data

print(iris.target_names) #target categories
print(np.unique(y)) #target values



['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
['setosa' 'versicolor' 'virginica']
[0 1 2]


In [10]:
#Note the argument changes to LogisticRegression()
# change from binomial to multinomial 
logreg = LogisticRegression(multi_class="multinomial",solver="lbfgs").fit(X,y) 

In [11]:
print(logreg.predict(X)) #uses softmax function to predict new X data, but I am being lazy and using X data here.

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
