# Imports and Data Read In

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

from sklearn.linear_model import LogisticRegression, LinearRegression, LassoCV, RidgeCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score


%matplotlib inline
pd.options.display.max_columns = 1_000
pd.options.display.max_rows = 1_000

In [2]:
df = pd.read_csv('./data/data.csv')

# Cleaning

In [3]:
df.drop(columns = ['Unnamed: 0', 'index'], inplace=True)

In [4]:
df.drop(columns = ['metformin', 'repaglinide', 'nateglinide',
       'glimepiride', 'glipizide',
       'glyburide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'insulin', 'glyburide-metformin'], inplace=True)

In [5]:
df['readmitted'].value_counts(normalize=True)

0    0.533711
1    0.466289
Name: readmitted, dtype: float64

# Variable Setup

In [6]:
# Defining my features as all my numeric columns
features = [col for col in df._get_numeric_data().columns if col != 'readmitted']

In [7]:
# Set up X and y variables
X = df[features]
y = df['readmitted']

# Model Prep - setting up my train/testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.33,
                                                    random_state=42)

# Logistic Regression

> This is always a great place to start before building out other models. We use Logisitic Regression because although we want a binary output (0 or 1), Logistic Regression will bend the curve of our line of best fit so that values above .5 will be classified as 1 and values below will be classified as 0.

In [8]:
# Instantiate model
logreg = LogisticRegression('l1', C=5)
lasso = LassoCV(n_alphas=200)
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))

In [9]:
# Fit my model
logreg.fit(X, y)



LogisticRegression(C=5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [10]:
# Obtain predictions
preds = logreg.predict(X)
preds_prob = logreg.predict_proba(X)

In [11]:
# Taking a look at my predictions
preds

array([0, 0, 0, ..., 0, 1, 0])

In [12]:
preds_prob

array([[0.51433124, 0.48566876],
       [0.64371731, 0.35628269],
       [0.64445236, 0.35554764],
       ...,
       [0.51453003, 0.48546997],
       [0.4810786 , 0.5189214 ],
       [0.66265653, 0.33734347]])

In [13]:
logreg.intercept_

array([-1.11073103])

In [14]:
logreg.coef_

array([[-7.05818818e-02,  3.84013557e-02, -1.12824195e-02,
         3.21863676e-03,  2.00872961e-02,  1.40036567e-03,
        -5.18350122e-02, -3.27696567e-04,  8.23921663e-02,
         2.09152399e-01,  3.62175704e-01,  8.08314318e-02,
        -2.19243582e-02, -2.15003161e-02,  2.47601343e-02,
         2.31736514e-01, -2.42448478e-01, -4.81855724e-01,
         8.61286189e-03,  5.50881860e-02, -1.04520867e-01,
        -2.34381813e-01,  1.10204600e-02, -3.61399283e-02,
        -7.33053810e-02, -1.89604403e-01, -4.74951660e-01,
        -4.52848056e-02, -1.67132935e-01, -6.46838598e-01,
        -1.10398464e-01, -1.27283877e-01, -1.72929384e-01,
        -7.32214041e-02, -6.20976963e-02, -5.65283175e-02,
        -5.63433895e-02, -2.11377302e-01, -1.96009812e-01,
        -1.61860734e-02, -8.87606907e-02, -1.13833195e-01,
        -8.85992233e-02, -1.06374984e+00,  1.15116855e-01,
        -9.80953971e-02, -5.54022316e-01, -1.94526454e-01,
        -2.54840486e-01, -1.21476108e-01, -1.39496199e-0

In [15]:
logreg.score(X_train, y_train)

0.6214584442247738

In [16]:
logreg.score(X_test, y_test)

0.6159688773619859

In [17]:
r2_score(y, preds)

-0.5283601447359434

In [18]:
y.value_counts(normalize=True)

0    0.533711
1    0.466289
Name: readmitted, dtype: float64

In [19]:
y_train.value_counts(normalize=True)

0    0.534849
1    0.465151
Name: readmitted, dtype: float64

In [20]:
y_test.value_counts(normalize=True)

0    0.531401
1    0.468599
Name: readmitted, dtype: float64

# Cross Val Scores

In [21]:
logreg_cval = cross_val_score(logreg, X_train, y_train)
logreg_cval.mean()



0.6194509838179129