In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Logistic Regression
## DI3_dg

In [2]:
my_model = ['year','sex','HE_sbp','DI1_pt','HE_HLdr','BP1','HE_ht_wc','DI2_dg', 'DI3_dg']

In [3]:
df = pd.read_csv('data_model3_balancing.csv', usecols=my_model)

In [4]:
df.head()

Unnamed: 0,year,sex,HE_sbp,DI1_pt,HE_HLdr,BP1,HE_ht_wc,DI3_dg,DI2_dg
0,2013,1,122,0,0,3,0.840146,0,0
1,2013,2,117,0,0,3,0.737889,0,0
2,2013,1,118,0,0,3,1.039291,0,0
3,2013,2,119,0,0,3,0.940208,0,0
4,2013,2,117,0,0,3,0.897032,0,0


In [5]:
df.describe()

Unnamed: 0,year,sex,HE_sbp,DI1_pt,HE_HLdr,BP1,HE_ht_wc,DI3_dg,DI2_dg
count,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0,8766.0
mean,2012.4376,1.510951,122.62058,0.39619,0.001483,3.247547,1.00199,1.902008,0.259411
std,0.837196,0.499909,18.186566,0.489133,0.038483,0.431612,0.289217,3.183687,0.438337
min,2010.0,1.0,75.0,0.0,0.0,3.0,0.001228,0.0,0.0
25%,2012.0,1.0,109.0,0.0,0.0,3.0,0.868548,0.0,0.0
50%,2013.0,2.0,120.0,0.0,0.0,3.0,1.002343,0.0,0.0
75%,2013.0,2.0,134.0,1.0,0.0,3.0,1.168678,1.0,1.0
max,2013.0,2.0,221.0,1.0,1.0,4.0,2.035994,8.0,1.0


In [6]:
y = df.DI3_dg
X = df.drop(['DI3_dg'], axis=1)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=621)

In [8]:
X_train_scaled = StandardScaler().fit_transform(X_train)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [9]:
X_train_scaled.shape

(7012, 8)

In [10]:
log_reg = LogisticRegression(random_state=621)
log_reg.fit(X_train_scaled, y_train)
log_reg.score(X_train_scaled, y_train)



0.7728180262407301

In [11]:
X_test_scaled = StandardScaler().fit_transform(X_test) 
y_pred = log_reg.predict(X_test_scaled)
accuracy_score(y_test, y_pred)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


0.7765108323831242

## Grid Search

In [19]:
param_grid = [{
    'penalty': ['l1','l2'], 
    'C': [0.001,0.01,0.1,1,10,100,1000]
}]
log_reg2 = LogisticRegression(random_state=621)

In [20]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(log_reg2, param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)





GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=621, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [21]:
grid_search.best_params_

{'C': 0.001, 'penalty': 'l2'}

In [22]:
grid_search.best_score_

0.7760981175128352

In [23]:
grid_search.best_estimator_.fit(X_train_scaled, y_train)
y_pred = grid_search.best_estimator_.predict(X_train_scaled)
accuracy_score(y_train, y_pred)



0.7741015402167712

In [24]:
y_pred = grid_search.best_estimator_.predict(X_test_scaled)
accuracy_score(y_test, y_pred)

0.7839224629418472

# Support Vector Regression (SVR) 
## DI3_dg

In [12]:
from sklearn.svm import SVR
svm_rbf_reg = SVR(kernel="rbf", C=120)
svm_rbf_reg.fit(X_train_scaled, y_train)
svm_rbf_reg.score(X_train_scaled, y_train)

0.7200871604778862

In [13]:
from sklearn.model_selection import cross_val_score
y_val_pred = cross_val_score(svm_rbf_reg, X_train, y_train, cv=3)
y_val_pred



array([0.76557766, 0.78455227, 0.79735625])

In [14]:
X_test_scaled = StandardScaler().fit_transform(X_test) 
y_pred = svm_rbf_reg.predict(X_test_scaled)
accuracy_score(y_test, y_pred.round())
#accuracy_score(y_test, (y_pred>0.8).astype(int))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


0.7542759407069556

## RandomizedSearchCV

In [30]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

param_distributions = {"C": uniform(1, 1000), }
svm_rbf_reg2 = SVR(kernel="rbf")
rnd_search_cv = RandomizedSearchCV(svm_rbf_reg2, param_distributions, cv=3, n_iter=10, verbose=2, n_jobs=-1)
rnd_search_cv.fit(X_train_scaled, y_train)
rnd_search_cv.best_score_

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  30 | elapsed:  3.9min remaining:  3.9min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  4.5min finished


0.45730366703942393

In [31]:
rnd_search_cv.best_estimator_

SVR(C=37.77872414801164, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [32]:
X_test_scaled = StandardScaler().fit_transform(X_test) 
y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)
accuracy_score(y_test, y_pred.round())
#accuracy_score(y_test, (y_pred>0.8).astype(int))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


0.878563283922463

# Logistic Regreesion
## DI4_dg

In [15]:
my_model2 = ['year','sex','HE_sbp','DI1_pt','HE_HLdr','BP1','HE_ht_wc','DI2_dg','DI4_dg']

In [16]:
df2 = pd.read_csv('data_model3_balancing.csv', usecols=my_model2)

In [17]:
df2.head()

Unnamed: 0,year,sex,HE_sbp,DI1_pt,HE_HLdr,BP1,HE_ht_wc,DI4_dg,DI2_dg
0,2013,1,122,0,0,3,0.840146,0,0
1,2013,2,117,0,0,3,0.737889,0,0
2,2013,1,118,0,0,3,1.039291,0,0
3,2013,2,119,0,0,3,0.940208,0,0
4,2013,2,117,0,0,3,0.897032,0,0


In [18]:
y = df2.DI4_dg
X = df2.drop(['DI4_dg'], axis=1)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=621)

In [20]:
X_train_scaled = StandardScaler().fit_transform(X_train)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [21]:
log_reg = LogisticRegression(random_state=621)
log_reg.fit(X_train_scaled, y_train)
log_reg.score(X_train_scaled, y_train)



0.7940673131774102

In [22]:
X_test_scaled = StandardScaler().fit_transform(X_test) 
y_pred = log_reg.predict(X_test_scaled)
accuracy_score(y_test, y_pred)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


0.7964652223489168

# Support Vector Regression (SVR) 
## DI4_dg

In [23]:
svm_rbf_reg = SVR(kernel="rbf", C=115)
svm_rbf_reg.fit(X_train_scaled, y_train)
svm_rbf_reg.score(X_train_scaled, y_train)

0.580769770523563

In [24]:
from sklearn.model_selection import cross_val_score
y_val_pred = cross_val_score(svm_rbf_reg, X_train, y_train, cv=3)
y_val_pred



array([0.69250423, 0.670021  , 0.74145986])

In [25]:
X_test_scaled = StandardScaler().fit_transform(X_test) 
y_pred = svm_rbf_reg.predict(X_test_scaled)
accuracy_score(y_test, y_pred.round())
#accuracy_score(y_test, (y_pred>0.8).astype(int))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


0.7633979475484607

### Random Search

In [52]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

param_distributions = {"C": uniform(100, 500), }
svm_rbf_reg2 = SVR(kernel="rbf")
rnd_search_cv = RandomizedSearchCV(svm_rbf_reg2, param_distributions, cv=3, n_iter=10, verbose=2, n_jobs=-1)
rnd_search_cv.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  30 | elapsed:  1.2min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.8min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001F04DB177F0>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [53]:
rnd_search_cv.best_params_

{'C': 151.05303291359408}

In [54]:
y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)
accuracy_score(y_test, y_pred.round())

0.8574686431014823

In [55]:
rnd_search_cv.best_score_

0.32562686807483116