In [92]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

# machine learning
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from matplotlib import pyplot

from pprint import pprint


In [87]:
import plotly.graph_objects as go
import plotly.express as px

import chart_studio as cs
import chart_studio.plotly as py
import chart_studio.tools as tls

In [124]:
username = "vivekmehendiratta"
api_key = "nQrS2n3v7shKKPTKws4O"

tls.set_credentials_file(username=username, api_key=api_key)

In [2]:
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')
dictionary = pd.read_csv('train_data_dictionary.csv')
sample = pd.read_csv('sample_sub.csv')

train["Stay"] = train["Stay"].replace("More than 100 Days", "Above 100")

train.drop(["case_id", "patientid", "City_Code_Patient"], axis = 1, inplace = True)
test.drop(["case_id", "patientid", "City_Code_Patient"], axis = 1, inplace = True)

train.shape, test.shape

((318438, 15), (137057, 14))

In [3]:
combined = train.append(test)

mode = combined["Bed Grade"].value_counts().idxmax()
combined["Bed Grade"] = combined["Bed Grade"].fillna(mode)
train = combined[combined["Stay"].notnull()]
test = combined[combined["Stay"].isnull()].drop("Stay", axis = 1)

In [4]:
test.isnull().sum()

Hospital_code                        0
Hospital_type_code                   0
City_Code_Hospital                   0
Hospital_region_code                 0
Available Extra Rooms in Hospital    0
Department                           0
Ward_Type                            0
Ward_Facility_Code                   0
Bed Grade                            0
Type of Admission                    0
Severity of Illness                  0
Visitors with Patient                0
Age                                  0
Admission_Deposit                    0
dtype: int64

In [9]:
len(set([1,2,3,4]).intersection(set([1,3,4,5])))

3

In [13]:
# Check for unique values in every column
for variable in train.columns:
    print(f"Unique Values for {variable}")
    print(train[variable].unique())
    print('--------------------------------------')
    print()

Unique Values for Hospital_code
[ 8  2 10 26 23 32  1 22 16  9  6 29 12  3 21 28 27 19  5 14 13 31 24 17
 25 15 11 30 18  4  7 20]
--------------------------------------

Unique Values for Hospital_type_code
['c' 'e' 'b' 'a' 'f' 'd' 'g']
--------------------------------------

Unique Values for City_Code_Hospital
[ 3  5  1  2  6  9 10  4 11  7 13]
--------------------------------------

Unique Values for Hospital_region_code
['Z' 'X' 'Y']
--------------------------------------

Unique Values for Available Extra Rooms in Hospital
[ 3  2  1  4  6  5  7  8  9 10 12  0 11 20 14 21 13 24]
--------------------------------------

Unique Values for Department
['radiotherapy' 'anesthesia' 'gynecology' 'TB & Chest disease' 'surgery']
--------------------------------------

Unique Values for Ward_Type
['R' 'S' 'Q' 'P' 'T' 'U']
--------------------------------------

Unique Values for Ward_Facility_Code
['F' 'E' 'D' 'B' 'A' 'C']
--------------------------------------

Unique Values for Bed Grade
[

In [53]:
X_train = train.drop(['Stay'], axis=1)
y_train = train["Stay"]
X_test  = test.copy()
X_train.shape, y_train.shape, X_test.shape

((318438, 14), (318438,), (137057, 14))

In [54]:
cat_columns = [
    'Hospital_code',
'Hospital_type_code',
'City_Code_Hospital',
'Hospital_region_code',
'Department',
'Ward_Type',
'Ward_Facility_Code',
'Bed Grade',
'Type of Admission',
'Severity of Illness',
'Age'
]

num_columns = [ 'Admission_Deposit', 'Available Extra Rooms in Hospital', 'Visitors with Patient']

In [55]:
# One-hot encoding using pandas get_dummies function
X_train_cat = X_train[cat_columns]
X_train_cat = pd.get_dummies(X_train_cat.astype(str), drop_first=True)

X_train_cat_nb = X_train[cat_columns]
X_train_cat_nb = pd.get_dummies(X_train_cat_nb.astype(str), drop_first=False)

X_test_cat = X_test[cat_columns]
X_test_cat = pd.get_dummies(X_test_cat.astype(str), drop_first=True)

X_test_cat_nb = X_test[cat_columns]
X_test_cat_nb = pd.get_dummies(X_test_cat_nb.astype(str), drop_first=False)

X_train = pd.concat([X_train_cat, X_train[num_columns]], axis = 1)
X_test = pd.concat([X_test_cat, X_test[num_columns]], axis = 1)

X_train_nb = pd.concat([X_train_cat_nb, train[num_columns]], axis = 1)
X_test_nb = pd.concat([X_test_cat_nb, test[num_columns]], axis = 1)

In [65]:
X_train_nb = pd.concat([X_train_cat_nb, train[num_columns]], axis = 1)
X_test_nb = pd.concat([X_test_cat_nb, test[num_columns]], axis = 1)

In [56]:
X_train.shape, X_test.shape, X_train_nb.shape, X_test_nb.shape

((318438, 82), (137057, 82), (318438, 93), (137057, 93))

In [57]:
# Scaling numeric columns
from sklearn.preprocessing import StandardScaler

ss= StandardScaler()

X_train[num_columns]= ss.fit_transform(X_train[num_columns].values)
X_test[num_columns]= ss.fit_transform(X_test[num_columns].values)

In [58]:
X_train.shape, X_test.shape, X_train_nb.shape, X_test_nb.shape

((318438, 82), (137057, 82), (318438, 93), (137057, 93))

In [34]:
# Logistic Regression Classifier
log_reg = LogisticRegression()

log_reg = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)

scores = cross_val_score(log_reg, X_train, y_train, cv=5)

In [35]:
log_reg_cv_score = scores.mean()
log_reg_cv_score

0.39803348294925767

## Exhibit 9

In [None]:
# Exhibit 9
# get importance
feature_importance = log_reg.coef_[0]

#summarize important features
print("Top 5 Important Features for Logistic Regression\n")
print("Attribute".center(35),"Importance".center(35), sep = "\t")
print("--------------------".center(35),"--------------------".center(35), sep = "\t")
for importance, name in sorted(zip(abs(log_reg.coef_[0]), X_train.columns),reverse=True)[:5]:
    print (name.center(35), format(importance,'.2f').center(35), sep = "\t")
# plot feature importance
importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': log_reg.coef_[0]
})
importances = importances.sort_values(by='Importance', ascending=False)

fig9 = px.bar(importances, x='Attribute', y='Importance', color='#087E8B', labels={
    'Attribute':'Coefficients',
    'Importance':'Importance Score'
})

fig9.update_layout(title="Logistic Regression Feature importances (coefficients)",title_x=0.5)
fig9.show()

py.plot(fig9, filename = "Logistic Regression Feature importances (coefficients)",  auto_open = False)

In [36]:
# KNN
knn = KNeighborsClassifier(n_neighbors = 3)

scores = cross_val_score(knn, X_train, y_train, cv=5)

knn_cv_score = scores.mean()
knn_cv_score

0.30098477604307466

In [37]:
# Decision Tree

#define model
decision_tree = DecisionTreeClassifier()

scores = cross_val_score(decision_tree, X_train, y_train, cv = 5)

train_dt_cv_acc = scores.mean()
train_dt_cv_acc

0.2873369130735742

## Exhibit 10

In [None]:
# get importance
feature_importance = decision_tree.feature_importances_

# summarize feature importance
print("Top 5 Important Features for Decision Tree\n")
print("Attribute".center(35),"Importance".center(35), sep = "\t")
print("--------------------".center(35),"--------------------".center(35), sep = "\t")
for importance, name in sorted(zip(decision_tree.feature_importances_, X_train.columns),reverse=True)[:5]:
    print (name.center(35), format(importance,'.2f').center(35), sep = "\t")
    
# plot feature importance
importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': decision_tree.feature_importances_
})
importances = importances.sort_values(by='Importance', ascending=False) 

fig10 = px.bar(importances, x='Attribute', y='Importance', color='#087E8B', labels={
    'Attribute':'Coefficients',
    'Importance':'Importance Score'
})

fig10.update_layout(title="Decision Tree Feature importances (coefficients)",title_x=0.5)
fig10.show()

py.plot(fig10, filename = "Decision Tree Feature importances (coefficients)",  auto_open = False)

In [38]:
# Random Forest

#define model
random_forest = RandomForestClassifier()

scores = cross_val_score(random_forest, X_train, y_train, cv = 5)

rf_cv_acc = scores.mean()
rf_cv_acc

0.33487522214627863

## Exhibit 11

In [None]:
# get importance
feature_importance = random_forest.feature_importances_

# summarize feature importance
print("Top 5 Important Features for Random Forest\n")
print("Attribute".center(35),"Importance".center(35), sep = "\t")
print("--------------------".center(35),"--------------------".center(35), sep = "\t")
for importance, name in sorted(zip(random_forest.feature_importances_, X_train.columns),reverse=True)[:5]:
    print (name.center(35), format(importance,'.2f').center(35), sep = "\t")
    
# plot feature importance
importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': random_forest.feature_importances_
})
importances = importances.sort_values(by='Importance', ascending=False) 
fig11 = px.bar(importances, x='Attribute', y='Importance', color='#087E8B', labels={
    'Attribute':'Coefficients',
    'Importance':'Importance Score'
})

fig11.update_layout(title="Random Forest Feature importances (coefficients)",title_x=0.5)
fig11.show()

py.plot(fig11, filename = "Random Forest Feature importances (coefficients)",  auto_open = False)

In [66]:
# Naive Bayes
nb = MultinomialNB()

scores = cross_val_score(nb, X_train_nb, y_train, cv = 5)

nb_cv_acc = scores.mean()
nb_cv_acc

0.3254228142775269

In [69]:
baseline_acc = train["Stay"].value_counts(normalize = True).max()*100
baseline_acc

27.475050088243236

In [71]:
classifier_scores = {
    "Baseline Accuracy" : baseline_acc,
    "Logistic Regression" : log_reg_cv_score,
    "Decision Tree" : train_dt_cv_acc,
    "Random Forest Classifier" : rf_cv_acc,
    "Naive Bayes" : nb_cv_acc
}
classifier_scores

{'Baseline Accuracy': 27.475050088243236,
 'Logistic Regression': 0.39803348294925767,
 'Decision Tree': 0.2873369130735742,
 'Random Forest Classifier': 0.33487522214627863,
 'Naive Bayes': 0.3254228142775269}

In [81]:
# Hyper parameter Tuning Random Forest

# Number of trees in random forest
n_estimators = [200, 500, 1000]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [2,5,10,20]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True],
 'max_depth': [2, 5, 10, 20, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 500, 1000]}


In [77]:
# Use the random grid to search for best hyperparameters

# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, cv = 3, verbose=3, random_state=42)

# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3] END bootstrap=True, max_depth=2, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.290 total time=  11.5s
[CV 2/3] END bootstrap=True, max_depth=2, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.293 total time=  11.5s
[CV 3/3] END bootstrap=True, max_depth=2, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.290 total time=  11.6s
[CV 1/3] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=1000;, score=0.398 total time= 3.7min
[CV 2/3] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=1000;, score=0.399 total time= 3.8min
[CV 3/3] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=1000;, score=0.386 total time= 3.9min
[CV 1/3] END boots

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(),
                   param_distributions={'bootstrap': [True],
                                        'max_depth': [2, 5, 10, 20, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 500, 1000]},
                   random_state=42, verbose=3)

In [78]:
rf_random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 20,
 'bootstrap': True}

In [80]:
# Random Forest

#define model
random_forest = RandomForestClassifier(
    n_estimators=1000, 
    min_samples_split=2, 
    min_samples_leaf=4, 
    max_features='sqrt',
    max_depth=20, 
    bootstrap=True)

scores = cross_val_score(random_forest, X_train, y_train, cv = 3)

rf_best_cv_acc = scores.mean()
rf_best_cv_acc

0.4087577487611403

## Exhibit 11

In [126]:
# random_forest.fit(X_train, y_train)
# get importance
feature_importance = random_forest.feature_importances_

# summarize feature importance
print("Top 10 Important Features for Random Forest\n")
print("Attribute".center(35),"Importance".center(35), sep = "\t")
print("--------------------".center(35),"--------------------".center(35), sep = "\t")
for importance, name in sorted(zip(random_forest.feature_importances_, X_train.columns),reverse=True)[:10]:
    print (name.center(35), format(importance,'.2f').center(35), sep = "\t")
    
# plot feature importance
importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': random_forest.feature_importances_
})
importances = importances.sort_values(by='Importance', ascending=False) 
fig11 = px.bar(importances, x='Attribute', y='Importance', labels={
    'Attribute':'Coefficients',
    'Importance':'Importance Score'
})

fig11.update_layout(title="Random Forest Feature importances (coefficients)",title_x=0.5)
fig11.show()

py.plot(fig11, filename = "Random Forest Feature importances (coefficients)",  auto_open = False)

Top 10 Important Features for Random Forest

             Attribute             	             Importance            
        --------------------       	        --------------------       
       Visitors with Patient       	                0.31               
         Admission_Deposit         	                0.15               
 Available Extra Rooms in Hospital 	                0.06               
            Ward_Type_S            	                0.03               
            Ward_Type_Q            	                0.03               
      Type of Admission_Trauma     	                0.03               
           Bed Grade_2.0           	                0.03               
     Severity of Illness_Minor     	                0.02               
            Ward_Type_R            	                0.02               
           Bed Grade_3.0           	                0.02               


'https://plotly.com/~vivekmehendiratta/30/'

In [97]:
# Hyper parameter Tuning Logistic

penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
max_iter = [100, 200, 500, 1000]

# Create the random grid
grid = {'penalty': penalty,
               'C': c_values,
       'max_iter':max_iter}

pprint(grid)

{'C': [100, 10, 1.0, 0.1, 0.01],
 'max_iter': [100, 200, 500, 1000],
 'penalty': ['l2']}


In [98]:
# Use the random grid to search for best hyperparameters

# First create the base model to tune
lr = LogisticRegression()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
log_reg_grid = GridSearchCV(estimator = lr, param_grid = grid, cv = 3, verbose=10, scoring='accuracy', error_score=0)

# Fit the random search model
log_reg_grid.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV 1/3; 1/20] START C=100, max_iter=100, penalty=l2............................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 1/3; 1/20] END C=100, max_iter=100, penalty=l2;, score=0.400 total time=  19.5s
[CV 2/3; 1/20] START C=100, max_iter=100, penalty=l2............................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 2/3; 1/20] END C=100, max_iter=100, penalty=l2;, score=0.404 total time=  20.1s
[CV 3/3; 1/20] START C=100, max_iter=100, penalty=l2............................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 3/3; 1/20] END C=100, max_iter=100, penalty=l2;, score=0.388 total time=  21.6s
[CV 1/3; 2/20] START C=100, max_iter=200, penalty=l2............................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 1/3; 2/20] END C=100, max_iter=200, penalty=l2;, score=0.400 total time=  40.0s
[CV 2/3; 2/20] START C=100, max_iter=200, penalty=l2............................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 2/3; 2/20] END C=100, max_iter=200, penalty=l2;, score=0.403 total time=  39.4s
[CV 3/3; 2/20] START C=100, max_iter=200, penalty=l2............................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 3/3; 2/20] END C=100, max_iter=200, penalty=l2;, score=0.387 total time=  38.7s
[CV 1/3; 3/20] START C=100, max_iter=500, penalty=l2............................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 1/3; 3/20] END C=100, max_iter=500, penalty=l2;, score=0.400 total time= 1.6min
[CV 2/3; 3/20] START C=100, max_iter=500, penalty=l2............................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 2/3; 3/20] END C=100, max_iter=500, penalty=l2;, score=0.404 total time= 1.6min
[CV 3/3; 3/20] START C=100, max_iter=500, penalty=l2............................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 3/3; 3/20] END C=100, max_iter=500, penalty=l2;, score=0.387 total time= 1.6min
[CV 1/3; 4/20] START C=100, max_iter=1000, penalty=l2...........................
[CV 1/3; 4/20] END C=100, max_iter=1000, penalty=l2;, score=0.400 total time= 1.7min
[CV 2/3; 4/20] START C=100, max_iter=1000, penalty=l2...........................
[CV 2/3; 4/20] END C=100, max_iter=1000, penalty=l2;, score=0.404 total time= 1.9min
[CV 3/3; 4/20] START C=100, max_iter=1000, penalty=l2...........................
[CV 3/3; 4/20] END C=100, max_iter=1000, penalty=l2;, score=0.387 total time= 2.0min
[CV 1/3; 5/20] START C=10, max_iter=100, penalty=l2.............................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 1/3; 5/20] END C=10, max_iter=100, penalty=l2;, score=0.400 total time=  18.2s
[CV 2/3; 5/20] START C=10, max_iter=100, penalty=l2.............................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 2/3; 5/20] END C=10, max_iter=100, penalty=l2;, score=0.404 total time=  19.2s
[CV 3/3; 5/20] START C=10, max_iter=100, penalty=l2.............................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 3/3; 5/20] END C=10, max_iter=100, penalty=l2;, score=0.388 total time=  18.7s
[CV 1/3; 6/20] START C=10, max_iter=200, penalty=l2.............................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 1/3; 6/20] END C=10, max_iter=200, penalty=l2;, score=0.400 total time=  35.9s
[CV 2/3; 6/20] START C=10, max_iter=200, penalty=l2.............................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 2/3; 6/20] END C=10, max_iter=200, penalty=l2;, score=0.403 total time=  36.3s
[CV 3/3; 6/20] START C=10, max_iter=200, penalty=l2.............................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 3/3; 6/20] END C=10, max_iter=200, penalty=l2;, score=0.387 total time=  36.1s
[CV 1/3; 7/20] START C=10, max_iter=500, penalty=l2.............................
[CV 1/3; 7/20] END C=10, max_iter=500, penalty=l2;, score=0.400 total time= 1.5min
[CV 2/3; 7/20] START C=10, max_iter=500, penalty=l2.............................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 2/3; 7/20] END C=10, max_iter=500, penalty=l2;, score=0.404 total time= 1.5min
[CV 3/3; 7/20] START C=10, max_iter=500, penalty=l2.............................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 3/3; 7/20] END C=10, max_iter=500, penalty=l2;, score=0.387 total time= 1.5min
[CV 1/3; 8/20] START C=10, max_iter=1000, penalty=l2............................
[CV 1/3; 8/20] END C=10, max_iter=1000, penalty=l2;, score=0.400 total time= 1.5min
[CV 2/3; 8/20] START C=10, max_iter=1000, penalty=l2............................
[CV 2/3; 8/20] END C=10, max_iter=1000, penalty=l2;, score=0.403 total time= 2.1min
[CV 3/3; 8/20] START C=10, max_iter=1000, penalty=l2............................
[CV 3/3; 8/20] END C=10, max_iter=1000, penalty=l2;, score=0.387 total time= 2.0min
[CV 1/3; 9/20] START C=1.0, max_iter=100, penalty=l2............................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 1/3; 9/20] END C=1.0, max_iter=100, penalty=l2;, score=0.400 total time=  18.8s
[CV 2/3; 9/20] START C=1.0, max_iter=100, penalty=l2............................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 2/3; 9/20] END C=1.0, max_iter=100, penalty=l2;, score=0.403 total time=  19.7s
[CV 3/3; 9/20] START C=1.0, max_iter=100, penalty=l2............................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 3/3; 9/20] END C=1.0, max_iter=100, penalty=l2;, score=0.387 total time=  18.9s
[CV 1/3; 10/20] START C=1.0, max_iter=200, penalty=l2...........................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 1/3; 10/20] END C=1.0, max_iter=200, penalty=l2;, score=0.400 total time=  37.8s
[CV 2/3; 10/20] START C=1.0, max_iter=200, penalty=l2...........................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 2/3; 10/20] END C=1.0, max_iter=200, penalty=l2;, score=0.403 total time=  37.6s
[CV 3/3; 10/20] START C=1.0, max_iter=200, penalty=l2...........................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 3/3; 10/20] END C=1.0, max_iter=200, penalty=l2;, score=0.387 total time=  37.5s
[CV 1/3; 11/20] START C=1.0, max_iter=500, penalty=l2...........................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 1/3; 11/20] END C=1.0, max_iter=500, penalty=l2;, score=0.400 total time= 1.6min
[CV 2/3; 11/20] START C=1.0, max_iter=500, penalty=l2...........................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 2/3; 11/20] END C=1.0, max_iter=500, penalty=l2;, score=0.404 total time= 1.6min
[CV 3/3; 11/20] START C=1.0, max_iter=500, penalty=l2...........................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 3/3; 11/20] END C=1.0, max_iter=500, penalty=l2;, score=0.387 total time= 1.5min
[CV 1/3; 12/20] START C=1.0, max_iter=1000, penalty=l2..........................
[CV 1/3; 12/20] END C=1.0, max_iter=1000, penalty=l2;, score=0.400 total time= 2.2min
[CV 2/3; 12/20] START C=1.0, max_iter=1000, penalty=l2..........................
[CV 2/3; 12/20] END C=1.0, max_iter=1000, penalty=l2;, score=0.403 total time= 2.3min
[CV 3/3; 12/20] START C=1.0, max_iter=1000, penalty=l2..........................
[CV 3/3; 12/20] END C=1.0, max_iter=1000, penalty=l2;, score=0.387 total time= 2.4min
[CV 1/3; 13/20] START C=0.1, max_iter=100, penalty=l2...........................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 1/3; 13/20] END C=0.1, max_iter=100, penalty=l2;, score=0.400 total time=  18.3s
[CV 2/3; 13/20] START C=0.1, max_iter=100, penalty=l2...........................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 2/3; 13/20] END C=0.1, max_iter=100, penalty=l2;, score=0.403 total time=  19.7s
[CV 3/3; 13/20] START C=0.1, max_iter=100, penalty=l2...........................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 3/3; 13/20] END C=0.1, max_iter=100, penalty=l2;, score=0.387 total time=  18.8s
[CV 1/3; 14/20] START C=0.1, max_iter=200, penalty=l2...........................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 1/3; 14/20] END C=0.1, max_iter=200, penalty=l2;, score=0.400 total time=  37.1s
[CV 2/3; 14/20] START C=0.1, max_iter=200, penalty=l2...........................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 2/3; 14/20] END C=0.1, max_iter=200, penalty=l2;, score=0.403 total time=  39.4s
[CV 3/3; 14/20] START C=0.1, max_iter=200, penalty=l2...........................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 3/3; 14/20] END C=0.1, max_iter=200, penalty=l2;, score=0.387 total time=  37.4s
[CV 1/3; 15/20] START C=0.1, max_iter=500, penalty=l2...........................
[CV 1/3; 15/20] END C=0.1, max_iter=500, penalty=l2;, score=0.400 total time= 1.2min
[CV 2/3; 15/20] START C=0.1, max_iter=500, penalty=l2...........................
[CV 2/3; 15/20] END C=0.1, max_iter=500, penalty=l2;, score=0.403 total time= 1.2min
[CV 3/3; 15/20] START C=0.1, max_iter=500, penalty=l2...........................
[CV 3/3; 15/20] END C=0.1, max_iter=500, penalty=l2;, score=0.387 total time= 1.2min
[CV 1/3; 16/20] START C=0.1, max_iter=1000, penalty=l2..........................
[CV 1/3; 16/20] END C=0.1, max_iter=1000, penalty=l2;, score=0.400 total time= 1.2min
[CV 2/3; 16/20] START C=0.1, max_iter=1000, penalty=l2..........................
[CV 2/3; 16/20] END C=0.1, max_iter=1000, penalty=l2;, score=0.403 total time= 1.2min
[CV 3/3; 16/20] START C=0.1, max_iter=1000, penalty=l2..........................
[C


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 1/3; 17/20] END C=0.01, max_iter=100, penalty=l2;, score=0.398 total time=  18.7s
[CV 2/3; 17/20] START C=0.01, max_iter=100, penalty=l2..........................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 2/3; 17/20] END C=0.01, max_iter=100, penalty=l2;, score=0.402 total time=  18.9s
[CV 3/3; 17/20] START C=0.01, max_iter=100, penalty=l2..........................



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[CV 3/3; 17/20] END C=0.01, max_iter=100, penalty=l2;, score=0.386 total time=  19.1s
[CV 1/3; 18/20] START C=0.01, max_iter=200, penalty=l2..........................
[CV 1/3; 18/20] END C=0.01, max_iter=200, penalty=l2;, score=0.398 total time=  31.4s
[CV 2/3; 18/20] START C=0.01, max_iter=200, penalty=l2..........................
[CV 2/3; 18/20] END C=0.01, max_iter=200, penalty=l2;, score=0.402 total time=  32.8s
[CV 3/3; 18/20] START C=0.01, max_iter=200, penalty=l2..........................
[CV 3/3; 18/20] END C=0.01, max_iter=200, penalty=l2;, score=0.386 total time=  34.5s
[CV 1/3; 19/20] START C=0.01, max_iter=500, penalty=l2..........................
[CV 1/3; 19/20] END C=0.01, max_iter=500, penalty=l2;, score=0.398 total time=  31.0s
[CV 2/3; 19/20] START C=0.01, max_iter=500, penalty=l2..........................
[CV 2/3; 19/20] END C=0.01, max_iter=500, penalty=l2;, score=0.402 total time=  32.8s
[CV 3/3; 19/20] START C=0.01, max_iter=500, penalty=l2.........................


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



GridSearchCV(cv=3, error_score=0, estimator=LogisticRegression(),
             param_grid={'C': [100, 10, 1.0, 0.1, 0.01],
                         'max_iter': [100, 200, 500, 1000], 'penalty': ['l2']},
             scoring='accuracy', verbose=10)

In [99]:
log_reg_grid.best_params_

{'C': 10, 'max_iter': 100, 'penalty': 'l2'}

In [101]:
# Logistic Regression

#define model
log_reg = LogisticRegression(
    penalty='l2', 
    C=10, 
    max_iter=100)

scores = cross_val_score(log_reg , X_train, y_train, cv = 3)

log_reg_best_cv_acc = scores.mean()
log_reg_best_cv_acc


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to th

0.39715737443395577

In [110]:
log_reg = LogisticRegression(
    penalty='l2', 
    C=10, 
    max_iter=100)
log_reg.fit(X_train, y_train)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



LogisticRegression(C=10)

In [125]:
#define model

# get importance
feature_importance = log_reg.coef_[0]

# summarize feature importance
print("Top 10 Important Features for Logistic Regression\n")
print("Attribute".center(35),"Absolute Importance".center(35), sep = "\t")
print("--------------------".center(35),"--------------------".center(35), sep = "\t")
for importance, name in sorted(zip(abs(feature_importance), X_train.columns),reverse=True)[:10]:
    print (name.center(35), format(importance,'.2f').center(35), sep = "\t")
    
# plot feature importance
importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': feature_importance
})
importances = importances.sort_values(by='Importance', ascending=False) 
fig9 = px.bar(importances, x='Attribute', y='Importance', labels={
    'Attribute':'Coefficients',
    'Importance':'Importance Score'
})

fig9.update_layout(title="Logistic Regression Feature importances (coefficients)",title_x=0.5)
fig9.show()

py.plot(fig9, filename = "Logistic Regression Feature importances (coefficients)",  auto_open = False)

Top 10 Important Features for Logistic Regression

             Attribute             	        Absolute Importance        
        --------------------       	        --------------------       
       Visitors with Patient       	                2.02               
            Ward_Type_S            	                1.77               
      Type of Admission_Trauma     	                1.48               
            Ward_Type_R            	                1.33               
          Hospital_code_29         	                0.89               
     Severity of Illness_Minor     	                0.89               
             Age_81-90             	                0.84               
          Hospital_code_4          	                0.83               
          Hospital_code_7          	                0.77               
          Hospital_code_24         	                0.77               


'https://plotly.com/~vivekmehendiratta/32/'

In [116]:
abs(feature_importance)

array([9.20696613e-02, 1.28427023e-02, 1.70817497e-01, 6.78596839e-01,
       5.22470981e-01, 6.31939228e-02, 3.19639722e-01, 7.27491394e-01,
       2.26578671e-01, 5.19318435e-02, 5.07340664e-01, 3.18672365e-01,
       5.75791756e-02, 2.51603434e-01, 2.03781942e-01, 7.66029658e-01,
       4.93814167e-01, 3.71128096e-01, 4.00632784e-01, 1.11387066e-01,
       8.88695660e-01, 6.35026654e-01, 2.28964449e-01, 6.68088831e-01,
       1.86060410e-01, 8.34416928e-01, 8.43733104e-02, 6.37515962e-02,
       7.70002308e-01, 4.30890657e-01, 1.39666923e-01, 7.17740377e-02,
       1.78105259e-01, 1.53882830e-01, 1.41607566e-01, 1.86060410e-01,
       2.51603434e-01, 2.12362764e-01, 1.11387066e-01, 2.26578671e-01,
       3.96130285e-02, 2.66041481e-01, 7.15723576e-01, 9.47831747e-02,
       1.40030345e-01, 3.48700941e-01, 1.05274473e-01, 4.00146478e-01,
       3.60824656e-01, 1.40533893e-01, 4.85659021e-01, 2.66375908e-01,
       7.39227870e-01, 3.75185053e-01, 1.33463928e+00, 1.77042453e+00,
      

In [103]:
X_test.shape

(137057, 82)