In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# SVM

In [89]:
my_model = ['ID','year','sex','HE_sbp', 'BS3_1', 'BP1',
            'HE_Upro','HE_crea','BE3_21','HE_glu','DI2_dg',
           'DI1_2','DI1_pt','DE1_32','HE_fh','HE_STRfh1','HE_HPfh1',
           'HE_HLfh2','DF2_dg','HE_ht_wc','DI3_dg']

In [90]:
df = pd.read_csv('data_model4_balancing.csv', usecols=my_model)

In [91]:
df.head()

Unnamed: 0,ID,year,sex,BP1,HE_sbp,DI1_2,DI1_pt,HE_ht_wc,DI2_dg,HE_glu,...,HE_crea,BS3_1,BE3_21,DE1_32,DF2_dg,HE_fh,HE_STRfh1,HE_HLfh2,HE_HPfh1,DI3_dg
0,A901963701,2010,1,3,138,0,0,0.499704,0,102,...,1.03,0,4,0,0,1,0,0,0,0
1,A901963702,2010,2,3,120,0,0,0.415679,0,85,...,0.69,0,1,0,0,1,0,0,1,0
2,A901963703,2010,2,3,108,0,0,0.431078,0,92,...,0.7,0,4,0,0,1,0,0,1,0
3,A901963802,2010,2,3,115,0,0,0.583607,0,94,...,0.75,0,1,0,0,0,0,0,0,0
4,A901964102,2010,2,3,110,0,0,0.472947,0,91,...,0.7,0,1,0,0,0,0,0,0,0


In [92]:
y = df.DI3_dg
X = df.drop(['DI3_dg','ID','year'], axis=1)

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=621)

In [94]:
X_train_scaled = StandardScaler().fit_transform(X_train)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [95]:
svm_rbf_reg = SVR(kernel="rbf", C=120)
svm_rbf_reg.fit(X_train_scaled, y_train)
svm_rbf_reg.score(X_train_scaled, y_train)


0.9023081820896769

In [96]:
from sklearn.model_selection import cross_val_score
y_val_pred = cross_val_score(svm_rbf_reg, X_train, y_train, cv=3)
y_val_pred



array([0.87447496, 0.88456079, 0.86443967])

In [97]:
X_test_scaled = StandardScaler().fit_transform(X_test) 
y_pred = svm_rbf_reg.predict(X_test_scaled)
#accuracy_score(y_test, y_pred.round())
accuracy_score(y_test, (y_pred>0.8).astype(int))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


0.9761075161772026

# RandomizedSearchCV

In [66]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

param_distributions = {"C": uniform(100, 500), }
svm_rbf_reg2 = SVR(kernel="rbf")
rnd_search_cv = RandomizedSearchCV(svm_rbf_reg2, param_distributions, cv=3, n_iter=10, verbose=2, n_jobs=-1)
rnd_search_cv.fit(X_train_scaled, y_train)
rnd_search_cv.best_score_

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  30 | elapsed:  1.2min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.6min finished


0.7240856660638412

In [67]:
rnd_search_cv.best_estimator_

SVR(C=120.93361759915553, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

### DI4_dg

In [98]:
my_model2 = ['ID','year','sex','HE_sbp', 'BS3_1', 'BP1',
            'HE_Upro','HE_crea','BE3_21','HE_glu','DI2_dg',
           'DI1_2','DI1_pt','DE1_32','HE_fh','HE_STRfh1','HE_HPfh1',
           'HE_HLfh2','DF2_dg','HE_ht_wc','DI4_dg']

In [99]:
df2 = pd.read_csv('data_model4_balancing.csv', usecols=my_model2)

In [100]:
df2.head()

Unnamed: 0,ID,year,sex,BP1,HE_sbp,DI1_2,DI1_pt,HE_ht_wc,DI2_dg,HE_glu,...,HE_crea,BS3_1,BE3_21,DE1_32,DF2_dg,HE_fh,HE_STRfh1,HE_HLfh2,HE_HPfh1,DI4_dg
0,A901963701,2010,1,3,138,0,0,0.499704,0,102,...,1.03,0,4,0,0,1,0,0,0,0
1,A901963702,2010,2,3,120,0,0,0.415679,0,85,...,0.69,0,1,0,0,1,0,0,1,0
2,A901963703,2010,2,3,108,0,0,0.431078,0,92,...,0.7,0,4,0,0,1,0,0,1,0
3,A901963802,2010,2,3,115,0,0,0.583607,0,94,...,0.75,0,1,0,0,0,0,0,0,0
4,A901964102,2010,2,3,110,0,0,0.472947,0,91,...,0.7,0,1,0,0,0,0,0,0,0


In [101]:
y = df2.DI4_dg
X = df2.drop(['DI4_dg','ID','year'], axis=1)

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=621)

In [103]:
X_train_scaled = StandardScaler().fit_transform(X_train)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [104]:
svm_rbf_reg = SVR(kernel="rbf", C=115)
svm_rbf_reg.fit(X_train_scaled, y_train)
svm_rbf_reg.score(X_train_scaled, y_train)

0.8541799123713545

In [105]:
from sklearn.model_selection import cross_val_score
y_val_pred = cross_val_score(svm_rbf_reg, X_train, y_train, cv=3)
y_val_pred



array([0.84767035, 0.84987347, 0.83903616])

In [106]:
X_test_scaled = StandardScaler().fit_transform(X_test) 
y_pred = svm_rbf_reg.predict(X_test_scaled)
#accuracy_score(y_test, y_pred.round())
accuracy_score(y_test, (y_pred>0.8).astype(int))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


0.9422598307615729

In [83]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

param_distributions = {"C": uniform(100, 500), }
svm_rbf_reg2 = SVR(kernel="rbf")
rnd_search_cv = RandomizedSearchCV(svm_rbf_reg2, param_distributions, cv=3, n_iter=10, verbose=2, n_jobs=-1)
rnd_search_cv.fit(X_train_scaled, y_train)
rnd_search_cv.best_score_

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  30 | elapsed:  1.6min remaining:  1.6min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.1min finished


0.6103185758330809

In [115]:
rnd_search_cv.best_params_

{'C': 114.52969238830687}

In [114]:
#X_test_scaled = StandardScaler().fit_transform(X_test) 
y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)
accuracy_score(y_test, y_pred.round())
#accuracy_score(y_test, (y_pred>0.8).astype(int))

0.9148830263812843