In [6]:
import pandas as pd
import numpy as np

import requests
import pickle
import random
import time
from collections import defaultdict

import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm
import statsmodels.formula.api as smf

import patsy
import scipy.stats as stats

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score

%matplotlib inline


In [7]:
with open('suicide1.pickle', 'rb') as read_file:
    df = pickle.load(read_file)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 527 entries, 1 to 3129
Data columns (total 30 columns):
County_x                              527 non-null object
County_code                           527 non-null float64
rate                                  527 non-null float64
State_x                               527 non-null object
% Fair/Poor Health                    527 non-null int64
% LBW                                 527 non-null float64
% Smokers                             527 non-null int64
% Obese                               527 non-null int64
Food Environment Index                527 non-null float64
% Physically Inactive                 527 non-null int64
% Excessive Drinking                  527 non-null int64
% driving deaths Alcohol-Impaired     527 non-null float64
Teen Birth Rate                       527 non-null float64
% Uninsured                           527 non-null float64
PCP Ratio                             527 non-null float64
Dentist Ratio        

In [8]:
# OLS without patsy

objectdroplist = ['State_x', 'County_code', 'County_x']

X = df.drop(columns=["rate"] + objectdroplist).astype(float)
y = df.loc[:,"rate"].astype(float)

model = sm.OLS(y, sm.add_constant(X), data = df)
results = model.fit() 
results.summary()

0,1,2,3
Dep. Variable:,rate,R-squared:,0.626
Model:,OLS,Adj. R-squared:,0.606
Method:,Least Squares,F-statistic:,32.13
Date:,"Mon, 07 Oct 2019",Prob (F-statistic):,1.3699999999999998e-89
Time:,18:39:21,Log-Likelihood:,-1480.4
No. Observations:,527,AIC:,3015.0
Df Residuals:,500,BIC:,3130.0
Df Model:,26,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,84.5416,7.721,10.950,0.000,69.372,99.711
% Fair/Poor Health,-0.1711,0.130,-1.319,0.188,-0.426,0.084
% LBW,-0.4291,0.215,-1.995,0.047,-0.852,-0.006
% Smokers,0.5637,0.111,5.065,0.000,0.345,0.782
% Obese,-0.2205,0.072,-3.073,0.002,-0.361,-0.080
Food Environment Index,-2.3634,0.394,-6.002,0.000,-3.137,-1.590
% Physically Inactive,-0.0695,0.077,-0.899,0.369,-0.221,0.082
% Excessive Drinking,-0.2084,0.086,-2.432,0.015,-0.377,-0.040
% driving deaths Alcohol-Impaired,0.0299,0.028,1.067,0.286,-0.025,0.085

0,1,2,3
Omnibus:,70.381,Durbin-Watson:,1.731
Prob(Omnibus):,0.0,Jarque-Bera (JB):,191.1
Skew:,0.66,Prob(JB):,3.1899999999999994e-42
Kurtosis:,5.638,Cond. No.,2570000000.0


In [9]:
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=.2, random_state=10)

X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=.25, random_state=3)


In [22]:
lm = LinearRegression()
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train.values)
X_val_scaled = scaler.fit_transform(X_val.values)
X_test_scaled = scaler.fit_transform(X_test.values)
X_scaled = scaler.fit_transform(X.values)


lm_reg = Ridge(alpha=0.01)


lm.fit(X, y)
print(f'Linear Regression all R^2: {lm.score(X, y):.3f}')
lm_reg.fit(X_scaled, y)
print(f'Ridge Regression all R^2: {lm_reg.score(X_scaled, y):.3f} \n')


lm.fit(X_train, y_train)
print(f'Linear Regression train R^2: {lm.score(X_train, y_train):.3f}')
lm_reg.fit(X_train_scaled, y_train)
print(f'Ridge Regression train R^2: {lm_reg.score(X_train_scaled, y_train):.3f}\n')



lm.fit(X_val, y_val)
print(f'Linear Regression val R^2: {lm.score(X_val, y_val):.3f}')
lm_reg.fit(X_val_scaled, y_val)
print(f'Ridge Regression val R^2: {lm_reg.score(X_val_scaled, y_val):.3f}\n')



lm.fit(X_test, y_test)
print(f'Linear Regression test R^2: {lm.score(X_test, y_test):.3f}')
lm_reg.fit(X_test_scaled, y_test)
print(f'Ridge Regression test R^2: {lm_reg.score(X_test_scaled, y_test):.3f}')




Linear Regression all R^2: 0.626
Ridge Regression all R^2: 0.626 

Linear Regression train R^2: 0.663
Ridge Regression train R^2: 0.663

Linear Regression val R^2: 0.597
Ridge Regression val R^2: 0.597

Linear Regression test R^2: 0.742
Ridge Regression test R^2: 0.742


In [None]:

param_grid = {'alpha': np.linspace(0.0, 1.0, 100)} #

my_model = Ridge()
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
my_grid_search_ridge = GridSearchCV(my_model, param_grid, cv = 5, n_jobs = 1)
my_grid_search_ridge.fit(X_train_scaled, y_train)
my_grid_search_ridge.best_estimator_