### Multinomial Logistic Regression Model

In [4]:
# Dependencies
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
import statsmodels.api as sm

In [5]:
# Load the cleaned survey data, discard the index column that was written
# into the CSV, and give every row a 1-based id.
csvpath = os.path.join("data", "clean_data.csv")
data_df = pd.read_csv(csvpath).drop("Unnamed: 0", axis=1)
data_df["id"] = data_df.index + 1
data_df.head()

Unnamed: 0,General Health,(%) of Population Under FPL,Eaten Fruits or Veggies Yesterday,Age Group,Race,Education,Insured,Exercise,Sex,Drinker,Smoker,id
0,4.0,2.0,2.0,4.0,2,3.0,1.0,1.0,0,0.0,0.0,1
1,4.0,3.0,2.0,2.0,4,2.0,1.0,1.0,0,0.0,0.0,2
2,4.0,2.0,1.0,5.0,1,4.0,1.0,1.0,1,1.0,1.0,3
3,2.0,1.0,2.0,5.0,1,2.0,1.0,1.0,0,0.0,0.0,4
4,3.0,1.0,2.0,4.0,1,4.0,1.0,1.0,0,1.0,1.0,5


In [6]:
# Modify DV for MNLM (need to change the base outcome to the mid value, "good")
# The ordinal relationship of the values in this variable is not relevant in MNLM
#
# The original 1-5 coding is kept in "Unaltered Health". The backup is created
# only once and the recode always starts from it, so re-running this cell
# cannot recode values that were already recoded.
if "Unaltered Health" not in data_df.columns:
    data_df["Unaltered Health"] = data_df["General Health"]
name_health = {"General Health": {5: "excellent", 4: "very good", 3: "good", 2: "fair", 1: "poor"}}
reposition_health = {"General Health": {"excellent" : 5, "very good" : 4, "fair" : 3, "poor" : 2, "good" : 1}}
# Go through text labels first so the numeric swaps (3 -> 1, 2 -> 3, 1 -> 2)
# cannot chain into each other.
data_df["General Health"] = (
    data_df["Unaltered Health"]
    .replace(name_health["General Health"])
    .replace(reposition_health["General Health"])
)
data_df.head()

Unnamed: 0,General Health,(%) of Population Under FPL,Eaten Fruits or Veggies Yesterday,Age Group,Race,Education,Insured,Exercise,Sex,Drinker,Smoker,id,Unaltered Health
0,4,2.0,2.0,4.0,2,3.0,1.0,1.0,0,0.0,0.0,1,4.0
1,4,3.0,2.0,2.0,4,2.0,1.0,1.0,0,0.0,0.0,2,4.0
2,4,2.0,1.0,5.0,1,4.0,1.0,1.0,1,1.0,1.0,3,4.0
3,3,1.0,2.0,5.0,1,2.0,1.0,1.0,0,0.0,0.0,4,2.0
4,1,1.0,2.0,4.0,1,4.0,1.0,1.0,0,1.0,1.0,5,3.0


In [60]:
# Variable groups
# y is the recoded outcome. Each predictor frame is data_df minus every column
# that does not belong to that group; 'Sex' is excluded from all three groups.
y = data_df['General Health']
not_predictors = ['id', 'Unaltered Health', 'General Health', 'Sex']
lifestyle_cols = ['Eaten Fruits or Veggies Yesterday', 'Exercise', 'Drinker', 'Smoker']
insurance_cols = ['Insured']
demo_cols = ['(%) of Population Under FPL', 'Education', 'Race', 'Age Group']
lifestyle = data_df.drop(not_predictors + insurance_cols + demo_cols, axis=1)
insurance = data_df.drop(not_predictors + lifestyle_cols + demo_cols, axis=1)
demo = data_df.drop(not_predictors + lifestyle_cols + insurance_cols, axis=1)

In [61]:
# Mlogit 1: multinomial logit of General Health on the lifestyle predictors.
# "Good" health was recoded to 1 in the DV cell above and serves as the base
# outcome; the summary below lists equations for outcomes 2-5 only.
# To change the predictors, edit the drop lists in the "Variable groups" cell:
# a column enters a group when it is removed from that group's drop list.
Xc = sm.add_constant(lifestyle)  # adds the intercept column (shown as "const" in the results)
mlogit = sm.MNLogit(y, Xc)
fmlogit = mlogit.fit()


Optimization terminated successfully.
         Current function value: 1.459542
         Iterations 7


In [62]:
# Full regression table: one block of coefficients per non-base outcome
print(fmlogit.summary())
# Exponentiated coefficients, i.e. the odds of each outcome relative to the base outcome
print(np.exp(fmlogit.params))



                          MNLogit Regression Results                          
Dep. Variable:         General Health   No. Observations:                 7913
Model:                        MNLogit   Df Residuals:                     7897
Method:                           MLE   Df Model:                           12
Date:                Sun, 10 Jun 2018   Pseudo R-squ.:                 0.03381
Time:                        18:03:52   Log-Likelihood:                -11549.
converged:                       True   LL-Null:                       -11954.
                                        LLR p-value:                2.696e-165
                 General Health=2       coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const                                -0.1552      0.191     -0.812      0.417      -0.530       0.220
Eaten Fruits or Veggies Yesterday    -0.1895      0.098     -1

  bse = np.sqrt(np.diag(self.cov_params()))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [63]:
# Exponentiate the fitted coefficients to get odds relative to the base outcome
rel_odds = fmlogit.params.apply(np.exp)

In [64]:
# Sanity check: confirm which container type holds the exponentiated coefficients
print(type(rel_odds))

<class 'pandas.core.frame.DataFrame'>


In [65]:
# Name the equation columns after the outcome each one describes. The labels
# are positional and follow the recoded outcome order
# (2 = poor, 3 = fair, 4 = very good, 5 = excellent).
outcome_labels = ["Poor Health", "Fair Health", "Very Good Health", "Excellent Health"]
rel_odds.columns = outcome_labels
rel_odds

Unnamed: 0,Poor Health,Fair Health,Very Good Health,Excellent Health
const,0.856256,0.797533,0.189542,0.103695
Eaten Fruits or Veggies Yesterday,0.827411,0.980071,1.451692,1.675784
Exercise,0.411967,0.71457,1.584836,1.654824
Drinker,0.579252,0.822654,1.287399,1.224413
Smoker,0.579252,0.822654,1.287399,1.224413


In [57]:
# Element-wise helper: turns a relative-odds value into a signed change around 1.
# It is meant to be applied to every cell of rel_odds.
def rel_prob(x):
    """Return the signed relative change of a relative-odds value.

    Values below 1 become negative (0.8 -> -0.2) and values above 1 become
    positive (1.5 -> 0.5). Both cases are the same arithmetic, ``x - 1``,
    because ``-1 * (1 - x) == x - 1``.

    Args:
        x: A numeric relative-odds value, or a string label.

    Returns:
        ``x - 1`` for numeric input; a string is reported and returned unchanged.
    """
    if isinstance(x, str):
        print("skipping string")
        return x
    return x - 1
    

In [None]:
# Option 1 for applying changes
# Run rel_prob on every cell, column by column. The result is only displayed;
# rel_odds itself is not modified.
rel_odds.apply(lambda column: column.map(rel_prob))

In [None]:
# Change values to percents
# rel_odds holds odds relative to the base outcome, so (value - 1) * 100 is the
# percent difference from even odds (0.8 -> -20, 1.5 -> +50).
# DataFrame.pct_change() is not used here: it compares each row with the row
# above it instead of rescaling the values.
(rel_odds - 1) * 100

In [None]:
# Option 2 for applying changes
# Explicit positional loop over every cell. The work is done on a copy, so
# rel_odds keeps the raw relative odds and this cell can be re-run safely.
rel_change = rel_odds.copy()
n_rows, n_cols = rel_change.shape
for i in range(n_rows):
    for j in range(n_cols):
        # Signed change around 1: below 1 -> negative, above 1 -> positive
        rel_change.iloc[i, j] = rel_change.iloc[i, j] - 1
# Express the relative changes as percents
rel_change * 100

In [69]:
# Plot relative percentages with y range from -1 to 1, possibly -2 to 2 depending on data
# Perhaps bar chart with positive bars green and negative bars red
# One plot for each category