# Models

In [4]:
#importing libraries
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso

In [5]:
# Load the flattened health-care dataset.
# Forward slashes are used instead of backslashes: in a normal (non-raw)
# string "\." / "\P" / "\H" are invalid escape sequences (a SyntaxWarning on
# recent Python versions), and a backslash-separated path only resolves on
# Windows. Forward slashes work on Windows, Linux and macOS alike.
df = pd.read_csv("../../Project_DATA/HealthCareCat_flat_1.csv")

In [6]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,State_County_CD,AGE,SEX,race,CCS101,CCS104,CCS105,CCS106,CCS108,CCS257,...,HCC154,HCC155,HCC157,HCC158,HCC161,HCC164,HCC174,HCC176,HCC177,Spending_Cat
0,22070,67,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,22150,67,1,1,4,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,22160,53,1,1,14,0,16,2,0,1,...,0,0,0,0,0,0,0,0,0,1
3,22040,67,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,22090,67,1,1,0,0,0,36,3,1,...,0,0,0,0,0,0,0,0,0,1


#### Drop unnecessary columns

In [8]:

# Read the two feature lists (column "Feature" of each CSV) as plain Python lists.
# Forward slashes replace the original backslashes: "\.", "\P", "\F" and "\S"
# are invalid escape sequences in a non-raw string, and the backslash form
# only resolves on Windows.
allFeatures = pd.read_csv("../../Project_DATA/FeatureSelection_results.csv").Feature.tolist()
selectedFeatures = pd.read_csv("../../Project_DATA/SelectedFeatures.csv").Feature.tolist()

In [9]:
# Features that appear in the full list but were not selected.
# np.setdiff1d returns the sorted, de-duplicated difference as an array.
featuresToDrop = np.setdiff1d(ar1=allFeatures, ar2=selectedFeatures)

In [10]:
# Remove the unselected feature columns.
# NOTE: this rebinds `df`, so running the cell a second time raises KeyError
# (the columns are already gone) unless the load cell is re-run first.
df = df.drop(columns=featuresToDrop)

In [11]:
# Preview the first five rows after the unselected columns were removed.
df.head(n=5)

Unnamed: 0,SEX,CCS104,CCS10,CCS663,CCS126,CCS200,CCS42,CCS237,CCS2603,CCS47,...,CCS258,CCS86,CCS230,CCS245,CCS226,CCS254,HCC80,HCC130,HCC131,Spending_Cat
0,1,0,3,0,0,5,0,0,0,1,...,3,0,0,0,0,0,0,0,0,1
1,1,4,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,1,0,2,4,1,0,0,0,0,0,...,0,0,0,6,0,0,0,0,0,1
3,2,0,0,0,3,6,0,0,0,4,...,5,2,0,0,0,0,0,0,0,1
4,1,0,0,0,0,6,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


#### Partition of the data

In [12]:
# Target variable: the spending category label.
y = df["Spending_Cat"]          #Target Variable

# Feature matrix: every column except the target.
# `columns=` replaces the original positional axis argument
# (`df.drop("Spending_Cat", 1)`), which pandas 2.0 and later reject.
# "Unnamed: 0" is still present in `df` at this point (see the preview above)
# and holds 0, 1, 2, ... - it looks like a row counter saved with the CSV
# (TODO confirm), so it is excluded here instead of being fed to the models.
# errors="ignore" keeps the cell working if that column is absent.
X = (
    df.drop(columns="Spending_Cat")
      .drop(columns="Unnamed: 0", errors="ignore")
)   #Feature Matrix

In [13]:
### Split the data into three partitions: train, dev and test
### train - 60%, dev - 20%, test - 20%

from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of all rows as the test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=4
)

# Step 2: take 25% of the remaining 80% as the dev partition
# (0.25 * 0.80 = 0.20 of the full data); the rest stays as train.
X_train, X_dev, y_train, y_dev = train_test_split(
    X_train, y_train, test_size=0.25, random_state=4
)


In [None]:
# Print each partition's label followed by its (rows, columns) shape,
# one item per line.
print(
    "Train: ", X_train.shape,
    "Dev: ", X_dev.shape,
    "Test: ", X_test.shape,
    sep="\n",
)

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings,
# fitted on the training partition.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)



In [None]:
# Confusion matrix of the logistic regression on the training partition:
# actual labels as rows, predicted labels as columns.
yhat_train = logreg.predict(X=X_train)
pd.crosstab(
    index=y_train,
    columns=yhat_train,
    rownames=['Actual'],
    colnames=['Predicted'],
)


In [None]:
### Accuracy of the logistic regression on the training partition:
logreg.score(X=X_train, y=y_train)

In [None]:
# Confusion matrix of the logistic regression on the dev partition:
# actual labels as rows, predicted labels as columns.
yhat_dev = logreg.predict(X=X_dev)
pd.crosstab(
    index=y_dev,
    columns=yhat_dev,
    rownames=['Actual'],
    colnames=['Predicted'],
)

In [None]:
# Accuracy of the logistic regression on the dev partition.
logreg.score(X=X_dev, y=y_dev)

In [None]:
# Start the model-comparison list with the logistic-regression scores.
# Each entry records the model name plus its train and dev accuracy.
res = [
    {
        'model': 'Logistic Regression',
        'accuracy-Train': logreg.score(X_train, y_train),
        'accuracy-Dev': logreg.score(X_dev, y_dev),
    },
]

# Display the comparison list so far.
res

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Random forest with tree depth capped at 100 and a fixed seed,
# fitted on the training partition.
rf = RandomForestClassifier(
    max_depth=100,
    random_state=0,
)
rf.fit(X=X_train, y=y_train)

In [None]:
# Confusion matrix of the random forest on the training partition:
# actual labels as rows, predicted labels as columns.
yhat = rf.predict(X=X_train)
pd.crosstab(
    index=y_train,
    columns=yhat,
    rownames=['Actual'],
    colnames=['Predicted'],
)

In [None]:
# Confusion matrix of the random forest on the dev partition:
# actual labels as rows, predicted labels as columns.
yhat_dev = rf.predict(X=X_dev)
pd.crosstab(
    index=y_dev,
    columns=yhat_dev,
    rownames=['Actual'],
    colnames=['Predicted'],
)

In [None]:
# Record the random-forest train/dev accuracy in the comparison list.
# Any existing 'Random Forest' entry is removed first, so re-running this
# cell replaces the row instead of appending a duplicate.
res = [entry for entry in res if entry['model'] != 'Random Forest']
res.append({'model':'Random Forest',
           'accuracy-Train':rf.score(X_train,y_train),
           'accuracy-Dev':rf.score(X_dev,y_dev)})

# Display the comparison list so far.
res

#### Gradient Boost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees: 100 boosting stages, learning rate 0.3,
# tree depth up to 10, fixed seed; fitted on the training partition.
gbc = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.3,
    max_depth=10,
    random_state=0,
)
gbc.fit(X=X_train, y=y_train)


In [None]:
# Confusion matrix of the gradient-boosting model on the training partition:
# actual labels as rows, predicted labels as columns.
yhat = gbc.predict(X=X_train)
pd.crosstab(
    index=y_train,
    columns=yhat,
    rownames=['Actual'],
    colnames=['Predicted'],
)

In [None]:
# Confusion matrix of the gradient-boosting model on the dev partition:
# actual labels as rows, predicted labels as columns.
yhat_dev = gbc.predict(X=X_dev)
pd.crosstab(
    index=y_dev,
    columns=yhat_dev,
    rownames=['Actual'],
    colnames=['Predicted'],
)

In [None]:
# Record the gradient-boosting train/dev accuracy in the comparison list.
# Any existing 'Gradient Boost' entry is removed first, so re-running this
# cell replaces the row instead of appending a duplicate.
res = [entry for entry in res if entry['model'] != 'Gradient Boost']
res.append({'model':'Gradient Boost',
           'accuracy-Train':gbc.score(X_train,y_train),
           'accuracy-Dev':gbc.score(X_dev,y_dev)})

# Display the comparison list so far.
res

#### SVM

In [None]:
# Import the classifier class directly. The original bound the sklearn.svm
# module to `svm` and immediately overwrote it with the fitted estimator,
# leaving one name that meant two different things within the same cell.
from sklearn.svm import SVC

# Linear-kernel support vector classifier fitted on the training partition.
# The variable keeps the name `svm` because the following cells call
# svm.predict / svm.score on it.
svm = SVC(kernel='linear')
svm.fit(X_train,y_train)

In [None]:
# Confusion matrix of the SVM on the training partition.
# rownames/colnames are set so the table is labelled 'Actual' / 'Predicted'
# like the confusion matrices of the other models in this notebook.
yhat = svm.predict(X_train)
pd.crosstab(y_train,yhat, rownames=['Actual'], colnames=['Predicted'])

In [None]:
# Accuracy of the SVM on the training partition.
svm.score(X=X_train, y=y_train)