# Imports and Data Read In

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

from sklearn.linear_model import LogisticRegression, LinearRegression, LassoCV, RidgeCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score


%matplotlib inline
pd.options.display.max_columns = 1_000
pd.options.display.max_rows = 1_000

In [2]:
df = pd.read_csv('./data/data.csv')

# Cleaning

In [3]:
df.drop(columns = ['Unnamed: 0', 'index'], inplace=True)

In [4]:
df['readmitted'].value_counts(normalize=True)

0    0.533711
1    0.466289
Name: readmitted, dtype: float64

# Variable Setup

In [5]:
# Defining my features as all my numeric columns
features = [col for col in df._get_numeric_data().columns if col != 'readmitted']

In [6]:
# Set up X and y variables
X = df[features]
y = df['readmitted']

# Model Prep - setting up my train/testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.33,
                                                    random_state=42)

In [21]:
# Baseline accuracy for Classification ONLY
max(y.mean(), 1 - y.mean())

0.5337110135210865

# Logistic Regression

> This is always a great place to start before building out other models. We use Logisitic Regression because although we want a binary output (0 or 1), Logistic Regression will bend the curve of our line of best fit so that values above .5 will be classified as 1 and values below will be classified as 0.

In [7]:
# Instantiate model
logreg = LogisticRegression('l1', C=5)
lasso = LassoCV(n_alphas=200)
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))

In [8]:
# Fit my model
logreg.fit(X, y)



LogisticRegression(C=5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [9]:
# Obtain predictions
preds = logreg.predict(X)
preds_prob = logreg.predict_proba(X)

In [10]:
# Taking a loot at my predictions
preds

array([0, 0, 0, ..., 0, 1, 0])

In [11]:
preds_prob

array([[0.52159844, 0.47840156],
       [0.61837924, 0.38162076],
       [0.65247254, 0.34752746],
       ...,
       [0.53753748, 0.46246252],
       [0.47115105, 0.52884895],
       [0.66175146, 0.33824854]])

In [12]:
logreg.intercept_

array([-0.92624255])

In [13]:
logreg.coef_

array([[-7.25089914e-02,  3.76782214e-02, -1.15307545e-02,
         2.89693473e-03,  1.99079434e-02,  1.29395774e-03,
        -5.32584173e-02,  3.54026093e-04,  8.13388531e-02,
         2.08819937e-01,  3.60634770e-01,  7.99704434e-02,
        -2.36387761e-02, -1.77121625e-02, -7.79620291e-02,
         5.71476914e-02,  1.85431297e-03, -1.73371827e-02,
         2.06404218e-02, -1.35049840e-02,  1.94471714e-02,
         3.10167736e-02,  1.87913115e-01, -5.35840640e-02,
         3.30270614e-02,  7.27056143e-02,  2.99271425e-01,
        -2.68525159e-01, -5.10002169e-01, -1.76619716e-02,
         1.61067068e-02, -1.33970760e-01, -2.61109132e-01,
        -2.45991698e-02, -6.46053747e-02, -1.00710244e-01,
        -2.23649333e-01, -5.09037944e-01, -7.21741531e-02,
        -2.00456053e-01, -6.59606286e-01, -1.44213469e-01,
        -1.59064023e-01, -2.05401075e-01, -1.36274393e-01,
        -1.25659636e-01, -1.16704409e-01, -1.20860008e-01,
        -2.68532143e-01, -2.52868789e-01, -8.42103179e-0

In [14]:
logreg.score(X_train, y_train)

0.622097178921755

In [15]:
logreg.score(X_test, y_test)

0.6154131159688774

In [16]:
r2_score(y, preds)

-0.5273775134446239

In [17]:
y.value_counts(normalize=True)

0    0.533711
1    0.466289
Name: readmitted, dtype: float64

In [18]:
y_train.value_counts(normalize=True)

0    0.534849
1    0.465151
Name: readmitted, dtype: float64

In [19]:
y_test.value_counts(normalize=True)

0    0.531401
1    0.468599
Name: readmitted, dtype: float64

# Cross Val Scores

In [20]:
logreg_cval = cross_val_score(logreg, X_train, y_train)
logreg_cval.mean()



0.6196334627936324