# Imports and Data Read In

In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

from sklearn.linear_model import LogisticRegression, LinearRegression, LassoCV, RidgeCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn import svm

%matplotlib inline
pd.options.display.max_columns = 1_000
pd.options.display.max_rows = 1_000

In [5]:
df = pd.read_csv('./data/data.csv')

# Cleaning

In [6]:
df.drop(columns = ['Unnamed: 0', 'index'], inplace=True)

In [7]:
df['readmitted'].value_counts(normalize=True)

0    0.533711
1    0.466289
Name: readmitted, dtype: float64

# Variable Setup

In [8]:
# Defining my features as all my numeric columns
features = [col for col in df._get_numeric_data().columns if col != 'readmitted']

In [9]:
# Set up X and y variables
X = df[features]
y = df['readmitted']

# Model Prep - setting up my train/testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    random_state=42,
                                                    stratify=y)

In [10]:
y.value_counts(normalize=True)

0    0.533711
1    0.466289
Name: readmitted, dtype: float64

In [11]:
# Baseline accuracy for Classification ONLY
max(y.mean(), 1 - y.mean())

0.5337110135210865

In [12]:
rf = RandomForestClassifier(n_estimators=10)
et = ExtraTreesClassifier(n_estimators=10)

In [13]:
cross_val_score(rf, X_train, y_train, cv=5).mean()

0.5968181605383569

In [14]:
cross_val_score(et, X_train, y_train, cv=5).mean()

0.5843601895034815

In [32]:
rf = RandomForestClassifier()
rf_params = {
    'n_estimators': [100],
    'max_depth': [None]
}
gs = GridSearchCV(rf, param_grid=rf_params, cv=5)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

0.62802450853859


{'max_depth': None, 'n_estimators': 100}

In [33]:
gs.score(X_train, y_train)

1.0

In [34]:
gs.score(X_test, y_test)

0.6313580045647212

In [36]:
rf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [37]:
rf.score(X_train, y_train)

0.9836021030608502

In [38]:
rf.score(X_test, y_test)

0.6051108575154874

# Trying other stuff

In [22]:
bc = BaggingClassifier()
bc.fit(X_train, y_train)

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=None, oob_score=False, random_state=None,
         verbose=0, warm_start=False)

In [23]:
cross_val_score(bc, X_train, y_train, cv=5).mean()

0.5976062187747669

In [24]:
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [25]:
cross_val_score(ada, X_train, y_train, cv=5).mean()

0.6302932788326856