# Notebook Showing Linear Regression

In [14]:
import os

import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn.metrics import r2_score

import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split

In [15]:
pd.set_option('max_columns', 200)
pd.set_option('max_rows', 200)
pd.set_option('display.float_format', '{:.2f}'.format)

In [16]:
df = pd.read_pickle('../Chapter 7 - Data Preparation and Visualization/claims_df')

In [18]:
disease = ['SP_ALZHDMTA',
       'SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD', 'SP_DEPRESSN',
       'SP_DIABETES', 'SP_ISCHMCHT', 'SP_OSTEOPRS', 'SP_RA_OA', 'SP_STRKETIA']

In [20]:
disease = ['SP_ALZHDMTA','SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD', 'SP_DEPRESSN','SP_DIABETES', 'SP_ISCHMCHT', 'SP_OSTEOPRS', 'SP_RA_OA', 'SP_STRKETIA']
gender = ['gender_2']
ESRD = ['ESRD_Y']

In [27]:
X = df[disease+gender+ESRD]
y = df.TOTAL_LOG_PAID

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=314)

In [55]:
X.columns

Index(['SP_ALZHDMTA', 'SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD',
       'SP_DEPRESSN', 'SP_DIABETES', 'SP_ISCHMCHT', 'SP_OSTEOPRS', 'SP_RA_OA',
       'SP_STRKETIA', 'gender_2', 'ESRD_Y'],
      dtype='object')

In [67]:
lm = linear_model.LinearRegression()

lm.fit(X_train, y_train)

y_pred_train = lm.predict(X_train)

y_pred_test = lm.predict(X_test)

print('Training R^2: {:,.3}%'.format(r2_score(y_train, y_pred_train)*100))

print('Test R^2: {:,.3}%'.format(r2_score(y_test, y_pred_test)*100))

print('Intercept: {:,.2}'.format(lm.intercept_))

print('Coefficients: ' + ', '.join(['{}: {:,.2}'.format(col, coef_val) for col, coef_val in zip(X.columns, lm.coef_)]))

Training R^2: 50.1%
Test R^2: 49.9%
Intercept: 6.1
Coefficients: SP_ALZHDMTA: 0.35, SP_CHF: 0.42, SP_CHRNKIDN: 0.67, SP_CNCR: 0.54, SP_COPD: 0.42, SP_DEPRESSN: 0.34, SP_DIABETES: 0.49, SP_ISCHMCHT: 0.59, SP_OSTEOPRS: 0.27, SP_RA_OA: 0.39, SP_STRKETIA: 0.41, gender_2: 0.0086, ESRD_Y: 0.23


In [40]:
X_train_df = pd.DataFrame(X_train, columns=X.columns)

In [41]:
X_train.loc[:, 'TOTAL_LOG_PAID'] = y_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [42]:
mod = smf.ols(formula='TOTAL_LOG_PAID ~ gender_2 + ESRD_Y + SP_ALZHDMTA + SP_CHF + SP_CHRNKIDN + SP_CNCR + SP_COPD + SP_DEPRESSN + SP_DIABETES + SP_ISCHMCHT + SP_OSTEOPRS + SP_RA_OA + SP_STRKETIA', data=X_train_df)

res = mod.fit()

print(res.summary())

                            OLS Regression Results                            
Dep. Variable:         TOTAL_LOG_PAID   R-squared:                       0.501
Model:                            OLS   Adj. R-squared:                  0.499
Method:                 Least Squares   F-statistic:                     216.8
Date:                Fri, 15 Jun 2018   Prob (F-statistic):               0.00
Time:                        11:45:05   Log-Likelihood:                -4302.0
No. Observations:                2820   AIC:                             8632.
Df Residuals:                    2806   BIC:                             8715.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       6.0646      0.042    143.445      