# Models

In [2]:
#importing libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
from sklearn.metrics import roc_auc_score

In [3]:
# Load the pre-split training and development sets.
# Forward slashes are used for the relative paths: in a normal (non-raw) string literal the
# sequences "\.", "\P" and "\H" are invalid escapes (flagged by newer Python versions), and
# backslash separators only resolve on Windows. "/" works on Windows, Linux and macOS alike.
df_train = pd.read_csv("../../Project_DATA/HealthCareCat_Train.csv")
df_dev = pd.read_csv("../../Project_DATA/HealthCareCat_Dev.csv")

#### Drop unnecessary columns

In [4]:

# Read the "Feature" column of both feature-selection result files into plain Python lists.
# Paths use forward slashes: the previous backslash form relied on invalid escape sequences
# ("\.", "\P", "\F", "\S") in a normal string literal and only resolved on Windows.
allFeatures = pd.read_csv("../../Project_DATA/FeatureSelection_results.csv")["Feature"].tolist()
selectedFeatures = pd.read_csv("../../Project_DATA/SelectedFeatures.csv")["Feature"].tolist()

In [5]:
# Names listed in allFeatures that are absent from selectedFeatures.
# np.setdiff1d returns them as a sorted array of unique values.
featuresToDrop = np.setdiff1d(ar1=allFeatures, ar2=selectedFeatures)

In [6]:
# Remove the non-selected feature columns from the training frame.
# Note: the frame is rebound to the same name, so re-running this cell after the columns
# are gone raises KeyError.
df_train = df_train.drop(columns=featuresToDrop)

In [7]:
# Preview the first five rows of the reduced training frame
df_train.head(n=5)

Unnamed: 0,State_County_CD,AGE,SEX,CCS101,CCS104,CCS106,CCS108,CCS257,CCS49,CCS50,...,CCS206,CCS54,CCS164,CCS229,CCS231,CCS45,CCS39,HCC10,HCC19,Spending_Cat
0,22131,66,1,5,2,4,5,35,51,146,...,0,0,0,0,0,0,0,0,0,3
1,22130,87,1,39,4,55,54,9,7,1,...,0,1,1,0,0,0,0,0,0,2
2,22040,68,1,0,0,0,0,2,0,0,...,2,0,0,14,0,19,0,0,0,3
3,22152,75,1,6,0,0,0,0,12,26,...,0,0,0,0,0,0,0,0,0,3
4,22151,75,1,28,0,7,31,7,14,4,...,0,16,5,0,6,0,0,0,0,4


In [8]:
# Remove the same non-selected feature columns from the development frame.
# Note: the frame is rebound to the same name, so re-running this cell after the columns
# are gone raises KeyError.
df_dev = df_dev.drop(columns=featuresToDrop)

In [9]:
# Preview the first five rows of the reduced development frame
df_dev.head(n=5)

Unnamed: 0,State_County_CD,AGE,SEX,CCS101,CCS104,CCS106,CCS108,CCS257,CCS49,CCS50,...,CCS206,CCS54,CCS164,CCS229,CCS231,CCS45,CCS39,HCC10,HCC19,Spending_Cat
0,22170,64,1,0,0,0,5,0,2,4,...,0,0,0,0,0,0,0,0,0,4
1,22020,69,1,0,0,0,0,0,0,0,...,0,0,0,0,0,12,0,0,0,3
2,22170,60,1,0,0,0,1,0,0,0,...,0,0,0,0,0,14,0,0,0,4
3,22067,66,1,5,0,0,0,22,2,0,...,0,0,12,0,0,0,0,0,0,2
4,22053,66,1,3,0,0,12,1,11,4,...,0,0,0,0,0,0,0,0,0,3


#### Partition of the data

In [10]:
# Split each frame into a feature matrix (X) and the target vector (y).
# `columns=` replaces the positional axis argument of `drop("Spending_Cat", 1)`: passing
# `axis` positionally was deprecated in pandas 1.3 and removed in pandas 2.0.
# NOTE(review): the head() previews show an "Unnamed: 0" column still present in both
# frames; it looks like a row index written by an earlier to_csv call. It is currently
# fed to every model as a feature - confirm whether it should be dropped here too.
target_col = "Spending_Cat"

X_train = df_train.drop(columns=target_col)   # Feature matrix
y_train = df_train[target_col]                # Target variable

X_dev = df_dev.drop(columns=target_col)       # Feature matrix
y_dev = df_dev[target_col]                    # Target variable

#### Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier fitted on the training split. max_iter is set to 10000,
# presumably to give the solver enough iterations to converge on these features.
# fit() returns the estimator itself, so construction and fitting are chained.
logreg = LogisticRegression(max_iter=10000).fit(X_train, y_train)
logreg



LogisticRegression(max_iter=10000)

In [23]:
# Confusion matrix of the logistic model on the training split
# (rows: actual class, columns: predicted class)
yhat_train = logreg.predict(X_train)
pd.crosstab(
    index=y_train,
    columns=yhat_train,
    rownames=['Actual'],
    colnames=['Predicted'],
)


Predicted,1,2,3,4
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,64674,9285,3033,3293
2,25185,31779,14793,8227
3,18812,19855,28846,12703
4,11840,8677,5220,54526


In [24]:
# Mean accuracy of the logistic model on the training split
logreg.score(X=X_train, y=y_train)

0.5606426228690436

In [25]:
# Confusion matrix of the logistic model on the development split
# (rows: actual class, columns: predicted class)
yhat_dev = logreg.predict(X_dev)
pd.crosstab(
    index=y_dev,
    columns=yhat_dev,
    rownames=['Actual'],
    colnames=['Predicted'],
)

Predicted,1,2,3,4
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,16058,2273,734,884
2,6438,7950,3737,2125
3,4846,4926,7128,3118
4,2899,2200,1312,13560


In [42]:
# Accumulator for the per-model metric dictionaries appended by the cells below.
# Re-running this cell resets the collected results.
res = list()

In [43]:
# Record accuracy and one-vs-rest ROC AUC of the logistic model on both splits.
# Each run of this cell appends one more entry to `res`.
logreg_metrics = {
    'model': 'Logistic Regression',
    'accuracy-Train': logreg.score(X_train, y_train),
    'accuracy-Dev': logreg.score(X_dev, y_dev),
    'auc-Train': roc_auc_score(y_train, logreg.predict_proba(X_train), multi_class='ovr'),
    'auc-Dev': roc_auc_score(y_dev, logreg.predict_proba(X_dev), multi_class='ovr'),
}
res.append(logreg_metrics)
res

[{'model': 'Logistic Regression',
  'accuracy-Train': 0.5606426228690436,
  'accuracy-Dev': 0.5573901331870105,
  'auc-Train': 0.8104364594867461,
  'auc-Dev': 0.8067707275943214}]

#### Random Forest

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification  # not used in this cell; kept in case other cells rely on it

# Random forest with trees capped at depth 100 and a fixed seed for repeatable fits.
# fit() returns the estimator itself, so construction and fitting are chained.
rf = RandomForestClassifier(random_state=0, max_depth=100).fit(X_train, y_train)
rf

RandomForestClassifier(max_depth=100, random_state=0)

In [30]:
# Confusion matrix of the random forest on the training split
# (rows: actual class, columns: predicted class)
yhat_train = rf.predict(X_train)
pd.crosstab(
    index=y_train,
    columns=yhat_train,
    rownames=['Actual'],
    colnames=['Predicted'],
)

Predicted,1,2,3,4
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,80272,13,0,0
2,4,79980,0,0
3,0,0,80216,0
4,0,0,0,80263


In [31]:
# Confusion matrix of the random forest on the development split
# (rows: actual class, columns: predicted class)
yhat_dev = rf.predict(X_dev)
pd.crosstab(
    index=y_dev,
    columns=yhat_dev,
    rownames=['Actual'],
    colnames=['Predicted'],
)

Predicted,1,2,3,4
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,18923,1022,4,0
2,335,19915,0,0
3,7,5,20006,0
4,0,0,0,19971


In [44]:
# Record accuracy and one-vs-rest ROC AUC of the random forest on both splits.
# Each run of this cell appends one more entry to `res`.
rf_metrics = {
    'model': 'Random Forest',
    'accuracy-Train': rf.score(X_train, y_train),
    'accuracy-Dev': rf.score(X_dev, y_dev),
    'auc-Train': roc_auc_score(y_train, rf.predict_proba(X_train), multi_class='ovr'),
    'auc-Dev': roc_auc_score(y_dev, rf.predict_proba(X_dev), multi_class='ovr'),
}
res.append(rf_metrics)
res

[{'model': 'Logistic Regression',
  'accuracy-Train': 0.5606426228690436,
  'accuracy-Dev': 0.5573901331870105,
  'auc-Train': 0.8104364594867461,
  'auc-Dev': 0.8067707275943214},
 {'model': 'Random Forest',
  'accuracy-Train': 0.9999469988900944,
  'accuracy-Dev': 0.9828777373173043,
  'auc-Train': 0.9999997008346434,
  'auc-Dev': 0.9993054075806724}]

#### Gradient Boost

In [45]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees: 100 boosting stages, learning rate 0.3, fixed seed.
# NOTE(review): max_depth=100 is very deep for boosted trees - confirm it is intentional.
# fit() returns the estimator itself, so construction and fitting are chained.
gbc = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.3,
    max_depth=100,
    random_state=0,
).fit(X_train, y_train)
gbc


GradientBoostingClassifier(learning_rate=0.3, max_depth=100, random_state=0)

In [46]:
# Confusion matrix of the gradient-boosting model on the training split
# (rows: actual class, columns: predicted class)
yhat_train = gbc.predict(X_train)
pd.crosstab(
    index=y_train,
    columns=yhat_train,
    rownames=['Actual'],
    colnames=['Predicted'],
)

Predicted,1,2,3,4
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,80273,12,0,0
2,5,79979,0,0
3,0,0,80216,0
4,0,0,0,80263


In [47]:
# Confusion matrix of the gradient-boosting model on the development split
# (rows: actual class, columns: predicted class)
yhat_dev = gbc.predict(X_dev)
pd.crosstab(
    index=y_dev,
    columns=yhat_dev,
    rownames=['Actual'],
    colnames=['Predicted'],
)

Predicted,1,2,3,4
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,18717,1157,64,11
2,445,19792,10,3
3,26,12,19979,1
4,3,0,0,19968


In [48]:
# Record accuracy and one-vs-rest ROC AUC of the gradient-boosting model on both splits.
# Each run of this cell appends one more entry to `res`.
gbc_metrics = {
    'model': 'Gradient Boost',
    'accuracy-Train': gbc.score(X_train, y_train),
    'accuracy-Dev': gbc.score(X_dev, y_dev),
    'auc-Train': roc_auc_score(y_train, gbc.predict_proba(X_train), multi_class='ovr'),
    'auc-Dev': roc_auc_score(y_dev, gbc.predict_proba(X_dev), multi_class='ovr'),
}
res.append(gbc_metrics)
res

[{'model': 'Logistic Regression',
  'accuracy-Train': 0.5606426228690436,
  'accuracy-Dev': 0.5573901331870105,
  'auc-Train': 0.8104364594867461,
  'auc-Dev': 0.8067707275943214},
 {'model': 'Random Forest',
  'accuracy-Train': 0.9999469988900944,
  'accuracy-Dev': 0.9828777373173043,
  'auc-Train': 0.9999997008346434,
  'auc-Dev': 0.9993054075806724},
 {'model': 'Gradient Boost',
  'accuracy-Train': 0.9999469988900944,
  'accuracy-Dev': 0.9784007582181873,
  'auc-Train': 0.9999999932836791,
  'auc-Dev': 0.9989837433047306}]

#### SVM

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

# Linear-kernel SVM on features min-max scaled to [-1, 1].
#
# The scaler and the classifier are combined into a single Pipeline so that every later
# svm.predict(...) / svm.score(...) call rescales its input with the ranges learned from
# X_train. The previous version transformed only X_train (overwriting it with a scaled
# array), so X_dev reached the model unscaled and the dev-set results were not comparable.
# It also rebound the name `svm` from the imported sklearn module to the estimator.
#
# `scaling` is kept as a name for backward compatibility; it is fitted as the first
# pipeline step when svm.fit(...) runs. X_train is left untouched.
#
# NOTE(review): SVC training time grows at least quadratically with the number of rows;
# on a training set of this size it may be impractical - consider LinearSVC if so.
scaling = MinMaxScaler(feature_range=(-1, 1))
svm = make_pipeline(scaling, SVC(kernel='linear'))
svm.fit(X_train, y_train)

In [None]:
# Confusion matrix of the SVM on the training split.
# Axis labels are set explicitly so the table reads the same as the other models'
# confusion matrices (rows: actual class, columns: predicted class).
yhat_train = svm.predict(X_train)
pd.crosstab(y_train, yhat_train, rownames=['Actual'], colnames=['Predicted'])

In [None]:
# Mean accuracy of the SVM on the training split
svm.score(X=X_train, y=y_train)

In [None]:
# Confusion matrix of the SVM on the development split.
# Axis labels are set explicitly so the table reads the same as the other models'
# confusion matrices (rows: actual class, columns: predicted class).
# NOTE(review): X_dev must be on the same feature scale the model was trained on.
yhat_dev = svm.predict(X_dev)
pd.crosstab(y_dev, yhat_dev, rownames=['Actual'], colnames=['Predicted'])

In [None]:
# Mean accuracy of the SVM on the development split
svm.score(X=X_dev, y=y_dev)