# Imports and Data Read In

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

from sklearn.linear_model import LogisticRegression, LinearRegression, LassoCV, RidgeCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn import svm

%matplotlib inline
pd.options.display.max_columns = 1_000
pd.options.display.max_rows = 1_000

In [2]:
df = pd.read_csv('./data/data.csv')

# Cleaning

In [3]:
df.drop(columns = ['Unnamed: 0', 'index'], inplace=True)

In [4]:
df['readmitted'].value_counts(normalize=True)

0    0.533711
1    0.466289
Name: readmitted, dtype: float64

# Variable Setup

In [5]:
# Defining my features as all my numeric columns
features = ['number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']

In [6]:
# Set up X and y variables
X = df[features]
y = df['readmitted']

# Model Prep - setting up my train/testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    random_state=42,
                                                    stratify=y)

In [7]:
y.value_counts(normalize=True)

0    0.533711
1    0.466289
Name: readmitted, dtype: float64

In [8]:
# Baseline accuracy for Classification ONLY
max(y.mean(), 1 - y.mean())

0.5337110135210865

In [9]:
rf = RandomForestClassifier(n_estimators=10)
et = ExtraTreesClassifier(n_estimators=10)

In [10]:
cross_val_score(rf, X_train, y_train, cv=5).mean()

0.6127270228560697

In [11]:
cross_val_score(et, X_train, y_train, cv=5).mean()

0.6124552800367873

In [12]:
rf = RandomForestClassifier()
rf_params = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 3, 4, 5],
}
gs = GridSearchCV(rf, param_grid=rf_params, cv=5)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

0.6162049805045716


{'max_depth': 5, 'n_estimators': 10}

In [13]:
gs.score(X_train, y_train)

0.6171152200198351

In [14]:
gs.score(X_test, y_test)

0.6185197261167265

# Trying other stuff

In [15]:
bc = BaggingClassifier()
bc.fit(X_train, y_train)

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=None, oob_score=False, random_state=None,
         verbose=0, warm_start=False)

In [16]:
cross_val_score(bc, X_train, y_train, cv=5).mean()

0.6132568504796468

In [17]:
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [18]:
cross_val_score(ada, X_train, y_train, cv=5).mean()

0.6149550732978138