# Logistic Regression

In [1]:
# Load the example data set.

import pandas as pd

# The target column is InMichelin: whether or not a restaurant is in the
# Michelin guide.
data = pd.read_csv(
    "http://gattonweb.uky.edu/sheather/book/docs/datasets/MichelinNY.csv",
    encoding="latin_1",
)
print(data.head())

# Prepare for the train/test split: discard the restaurant name, then separate
# the target (y) from the remaining feature columns (X).
data = data.drop(columns=['Restaurant Name'])
y = data['InMichelin']
X = data.drop(columns=['InMichelin'])

   InMichelin Restaurant Name  Food  Decor  Service  Price
0           0  14 Wall Street    19     20       19     50
1           0             212    17     17       16     43
2           0        26 Seats    23     17       21     35
3           1              44    19     23       16     52
4           0               A    23     12       19     24


In [3]:
# Set up training and test data, then fit an unpenalized logistic regression.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Note: random_state ensures the same split is generated each time this
# example is run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42)

# penalty=None requests a fit without regularization. The older string
# spelling 'none' is rejected by newer scikit-learn releases.
logreg = LogisticRegression(penalty=None).fit(X_train, y_train)

print("logreg .coef_: {}".format(logreg.coef_))


print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))


predicted_vals = logreg.predict(X_test)  # predicted_vals holds the predictions for X_test
print("logreg.predict: {}".format(predicted_vals))

logreg .coef_: [[ 0.38181614  0.07433425 -0.15691054  0.08189853]]
Training set score: 0.797
Test set score: 0.780
logreg.predict: [0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 1 1]


In [6]:
# Show the estimator currently bound to `logreg` as the cell output.
logreg

# For more information, open the class docstring with `LogisticRegression?`.

LogisticRegression(penalty='none')

## Logistic Regression in the statsmodels package

In [5]:
# FutureWarning messages are silenced before statsmodels is imported.
import warnings
warnings.simplefilter('ignore', FutureWarning)
import statsmodels.api as sm

# Add an explicit constant column to the training features; it shows up as
# the `const` row of the summary table.
X_train_new = sm.add_constant(X_train)

# A GLM with a Binomial family, fitted on the same training split.
binomial_glm = sm.GLM(y_train, X_train_new, family=sm.families.Binomial())
model = binomial_glm.fit()

model.summary()

0,1,2,3
Dep. Variable:,InMichelin,No. Observations:,123.0
Model:,GLM,Df Residuals:,118.0
Model Family:,Binomial,Df Model:,4.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-57.266
Date:,"Sun, 19 Jun 2022",Deviance:,114.53
Time:,10:24:50,Pearson chi2:,254.0
No. Iterations:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-10.6490,2.588,-4.115,0.000,-15.722,-5.576
Food,0.3818,0.148,2.572,0.010,0.091,0.673
Decor,0.0743,0.103,0.720,0.471,-0.128,0.277
Service,-0.1569,0.147,-1.070,0.285,-0.444,0.131
Price,0.0819,0.036,2.269,0.023,0.011,0.153


## Logistic Regression with constraints on size of coefficients

In [13]:
# C is a tuning parameter that can be found with a grid search: the smaller
# C is, the more the betas are constrained.

# C=100: compare the coefficients with the regular model above.
logreg = LogisticRegression(C=100, penalty='l2')
logreg.fit(X_train, y_train)

print("logreg .coef_: {}".format(logreg.coef_))

train_score = logreg.score(X_train, y_train)
test_score = logreg.score(X_test, y_test)
print("Training set score: {:.3f}".format(train_score))
print("Test set score: {:.3f}".format(test_score))

# Predictions for the test set, followed by the predicted probabilities of
# its first ten rows.
predicted_vals = logreg.predict(X_test)
print("logreg.predict: {}".format(predicted_vals))
first_ten_proba = logreg.predict_proba(X_test.iloc[0:10])
print(f"logreg.predict prob {first_ten_proba}")

logreg .coef_: [[ 0.38171368  0.07433904 -0.15682846  0.08189077]]
Training set score: 0.797
Test set score: 0.780
logreg.predict: [0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 1 1]
logreg.predict prob [[0.85254277 0.14745723]
 [0.84461604 0.15538396]
 [0.67682771 0.32317229]
 [0.13745703 0.86254297]
 [0.86696123 0.13303877]
 [0.86173258 0.13826742]
 [0.83064887 0.16935113]
 [0.89750002 0.10249998]
 [0.53908214 0.46091786]
 [0.79532536 0.20467464]]


In [None]:

# C=1: compare the coefficients with the models above.
logreg = LogisticRegression(C=1, penalty='l2')
logreg.fit(X_train, y_train)

print("logreg .coef_: {}".format(logreg.coef_))

train_score = logreg.score(X_train, y_train)
test_score = logreg.score(X_test, y_test)
print("Training set score: {:.3f}".format(train_score))
print("Test set score: {:.3f}".format(test_score))

# Predictions for the test set.
predicted_vals = logreg.predict(X_test)
print("logreg.predict: {}".format(predicted_vals))

logreg .coef_: [[ 0.37187726  0.07490079 -0.14897911  0.08113593]]
Training set score: 0.797
Test set score: 0.780
logreg.predict: [0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 1 1]


In [None]:

# C=.0001: compare the coefficients with the models above.
#
# Question: does the model's predictive power get better or worse?
logreg = LogisticRegression(C=.0001, penalty='l2')
logreg.fit(X_train, y_train)

print("logreg .coef_: {}".format(logreg.coef_))

train_score = logreg.score(X_train, y_train)
test_score = logreg.score(X_test, y_test)
print("Training set score: {:.3f}".format(train_score))
print("Test set score: {:.3f}".format(test_score))

# Predictions for the test set.
predicted_vals = logreg.predict(X_test)
print("logreg.predict: {}".format(predicted_vals))

logreg .coef_: [[0.00549429 0.00672568 0.00502413 0.02866617]]
Training set score: 0.699
Test set score: 0.732
logreg.predict: [0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 0 0]


In [16]:
# To use an l1 penalty instead, set penalty to 'l1' and solver to 'liblinear'.
#
# Question: does the model's predictive power get better or worse?
logreg = LogisticRegression(C=.01, penalty='l1', solver='liblinear')
logreg.fit(X_train, y_train)

print("logreg .coef_: {}".format(logreg.coef_))

train_score = logreg.score(X_train, y_train)
test_score = logreg.score(X_test, y_test)
print("Training set score: {:.3f}".format(train_score))
print("Test set score: {:.3f}".format(test_score))

# Predictions for the test set.
predicted_vals = logreg.predict(X_test)
print("logreg.predict: {}".format(predicted_vals))

logreg .coef_: [[-0.02290056  0.          0.          0.00967782]]
Training set score: 0.699
Test set score: 0.732
logreg.predict: [0 0 0 1 1 0 0 0 1 0 0 0 0 1 1 0 0 0 1 1 0 0 1 0 0 1 1 0 0 1 0 1 1 0 0 1 0
 1 1 1 1]


## Multiclass models (Multinomial model)

In [18]:
from sklearn.datasets import load_iris
import numpy as np

# Load the iris data set for the multiclass example.
iris = load_iris()

# NOTE: X and y are rebound to the iris features and target here, replacing
# the variables of the same name defined earlier in the notebook.
X, y = iris.data, iris.target

print(iris.feature_names)  # X variable names
print(X[0:5])  # first five rows of data

print(iris.target_names)  # target categories
print(np.unique(y))  # target values



['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
['setosa' 'versicolor' 'virginica']
[0 1 2]


In [19]:
# Multiclass fit on the iris data. Compared with the binary models, note the
# changed arguments to LogisticRegression(): the lbfgs solver and a larger
# max_iter. multi_class="multinomial" is not passed because recent
# scikit-learn releases deprecate that argument, and lbfgs already fits the
# multinomial model when the target has three or more classes.
logreg = LogisticRegression(solver="lbfgs", max_iter=10000).fit(X, y)

In [20]:
# The model uses the softmax function to predict classes for new X data; for
# brevity the data it was fitted on (X) is reused here.
iris_predictions = logreg.predict(X)
print(iris_predictions)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
