## Support Vector Machines (SVM) 

In [1]:
# No warnings
import warnings
warnings.filterwarnings('ignore') # Filter out warnings

# data analysis and wrangling
import pandas as pd
import numpy as np
from numpy import mean
import matplotlib.pyplot as plt

## modeling
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import balanced_accuracy_score

Import files below:

1. data1 = stepcount + demographic data in discrete categories
2. d1 = demographic data in discrete categories
3. data2 = stepcount + demographic data as continuous variables
4. d2 = demographic data as a continuous variable

In [2]:
file1 = 'no_band_steps.xlsx'
data1 = pd.read_excel(file1)
data1 = data1.set_index('studyID')

d1 = data1.drop(columns = 'Steps')

file2 = 'band_steps.xlsx'
data2 = pd.read_excel(file2)
data2 = data2.set_index('studyID')

d2 = data2.drop(columns = 'Steps_band')

Proportion of 1's to 0's

In [3]:
print('The number of 1s is: ', sum(d1.Weight_loss_band == 1))
print('The number of 0s is: ', sum(d1.Weight_loss_band == 0))
print('The proportion of positive cases is: ', sum(d1.Weight_loss_band == 1)/(sum(d1.Weight_loss_band == 1) + sum(d1.Weight_loss_band == 0)))

The number of 1s is:  19
The number of 0s is:  38
The proportion of positive cases is:  0.3333333333333333


The above results indicate that the dataset has a class imbalance in favor of participants who did not lose weight.

### Demographic: No Band

This section evaluates the performance of models utilizing only demographic information that is continuous.

In [6]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
d1_train_acc = []
d1_test_acc = []
d1_roc = []
d1_pr = []
d1_ba = []

for i in random_state :
    
    train_df, test_df = train_test_split(d1, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
        
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 3)
    clf.fit(X_train, Y_train)
    
    
    X_pred = clf.predict(X_test)
    #y_pred = cl.predict_proba(Y_test)

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    
    roc = roc_auc_score(Y_test, X_pred)
    pr = average_precision_score(Y_test, X_pred, average = 'weighted')
    ba = balanced_accuracy_score(Y_test, X_pred)
    
    d1_train_acc.append(svm_train_acc)
    d1_test_acc.append(svm_test_acc)
    d1_roc.append(roc)
    d1_pr.append(pr)
    d1_ba.append(ba)
    
    #print('Random State: ', i)
    
    #print('Training Accuracy:', svm_train_acc)
    
    #print('Test Accuracy:', svm_test_acc)
    #print('ROC AUC SCORE:', roc)
    #print('PR AUC SCORE:', pr)
    #print('BA score:', ba)

    #print('Mean Training Accuracy:', mean(d1_train_acc)) 
    #print('Mean Test Accuracy:', mean(d1_test_acc)) 
    #print('Mean ROC AUC score:', mean(d1_roc))
    #print('Mean PR AUC score:', mean(d1_pr))
    #print('Mean BA score:', mean(d1_ba))
    
    

In [7]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score


random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]

d1_roc = []
d1_ba = []
d1_p = []
d1_r = []
d1_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(d1, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
        
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 3)
    clf.fit(X_train, Y_train)

    X_pred = cross_val_predict(clf, X_train, Y_train, cv=5)
    
    conf = confusion_matrix(Y_train, X_pred)
    
    precision = precision_score(Y_train, X_pred)
    recall = recall_score(Y_train, X_pred)
    f1 = f1_score(Y_train, X_pred)
    
    roc = roc_auc_score(Y_train, X_pred)
    ba = balanced_accuracy_score(Y_train, X_pred)
    
    d1_roc.append(roc)
    d1_ba.append(ba)
    d1_p.append(precision)
    d1_r.append(recall)
    d1_f1.append(f1)
    
    #print('Random State: ', i)
    
    #print('Precision:', precision)
    #print('Recall:', recall)
    #print('f1:', f1)
    
    #print('ROC AUC SCORE:', roc)
    #print('BA score:', ba, '\n')


#print('Mean Precision:', mean(d1_p))
#print('Mean Recall:', mean(d1_r))
#print('Mean f1:', mean(d1_f1))
#print('Mean ROC AUC score:', mean(d1_roc))
#print('Mean BA score:', mean(d1_ba))       
    

### Demographic: Band

This section evaluates the performance of models utilizing only demographic information that is discretized.

In [8]:
from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
d2_train_acc = []
d2_test_acc = []
d2_roc = []
d2_pr = []


for i in random_state :
    
    train_df, test_df = train_test_split(d2, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 7)
    clf.fit(X_train, Y_train)  

    X_pred = clf.predict(X_test)

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    roc = roc_auc_score(Y_test, X_pred)
    pr = average_precision_score(Y_test, X_pred, average = 'weighted')
    
    d2_train_acc.append(svm_train_acc)
    d2_test_acc.append(svm_test_acc)
    d2_roc.append(roc)
    d2_pr.append(pr)
    
    #print('Random State: ', i)
    
    #print('Training Accuracy:', svm_train_acc)
    
    #print('Test Accuracy:', svm_test_acc)
    #print('ROC AUC SCORE:', roc)
    #print('PR AUC SCORE:', pr)



#print('Mean Training Accuracy:', mean(d2_train_acc)) 
#print('Mean Test Accuracy:', mean(d2_test_acc)) 
#print('Mean ROC AUC score:', mean(d2_roc))
#print('Mean PR AUC score:', mean(d2_pr))

In [9]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score


random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]

d2_roc = []
d2_ba = []
d2_p = []
d2_r = []
d2_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(d2, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
        
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 7)
    clf.fit(X_train, Y_train)

    X_pred = cross_val_predict(clf, X_train, Y_train, cv=5)
    
    conf = confusion_matrix(Y_train, X_pred)
    
    precision = precision_score(Y_train, X_pred)
    recall = recall_score(Y_train, X_pred)
    f1 = f1_score(Y_train, X_pred)
    
    roc = roc_auc_score(Y_train, X_pred)
    ba = balanced_accuracy_score(Y_train, X_pred)
    
    d2_roc.append(roc)
    d2_ba.append(ba)
    d2_p.append(precision)
    d2_r.append(recall)
    d2_f1.append(f1)
    
    #print('Random State: ', i)
    
    #print('Precision:', precision)
    #print('Recall:', recall)
    #print('f1:', f1)
    
    #print('ROC AUC SCORE:', roc)
    #print('BA score:', ba, '\n')


#print('Mean Precision:', mean(d2_p))
#print('Mean Recall:', mean(d2_r))
#print('Mean f1:', mean(d2_f1))
#print('Mean ROC AUC score:', mean(d2_roc))
#print('Mean BA score:', mean(d2_ba))    
    

Steps + Demographic: No Band

from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
data1_train_acc = []
data1_test_acc = []
data1_roc = []
data1_pr = []

for i in random_state :
    
    train_df, test_df = train_test_split(data1, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 3)
    clf.fit(X_train, Y_train)  
    
    X_pred = clf.predict(X_test)

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    roc = roc_auc_score(Y_test, X_pred)
    pr = average_precision_score(Y_test, X_pred, average = 'weighted')

    
    data1_train_acc.append(svm_train_acc)
    data1_test_acc.append(svm_test_acc)
    data1_roc.append(roc)
    data1_pr.append(pr)
    
    print('Random State: ', i)
    
    print('Training Accuracy:', svm_train_acc)
    
    print('Test Accuracy:', svm_test_acc)
    
    print('ROC AUC SCORE:', roc)
    print('PR AUC SCORE:', pr)


print('Mean Training Accuracy:', mean(data1_train_acc)) 
print('Mean Test Accuracy:', mean(data1_test_acc))
print('Mean ROC AUC score:', mean(data1_roc))
print('Mean PR AUC score:', mean(data1_pr))

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score


random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]

data1_roc = []
data1_ba = []
data1_p = []
data1_r = []
data1_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(data1, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
        
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 3)
    clf.fit(X_train, Y_train)

    X_pred = cross_val_predict(clf, X_train, Y_train, cv=5)
    
    conf = confusion_matrix(Y_train, X_pred)
    
    precision = precision_score(Y_train, X_pred)
    recall = recall_score(Y_train, X_pred)
    f1 = f1_score(Y_train, X_pred)
    
    roc = roc_auc_score(Y_train, X_pred)
    ba = balanced_accuracy_score(Y_train, X_pred)
    
    data1_roc.append(roc)
    data1_ba.append(ba)
    data1_p.append(precision)
    data1_r.append(recall)
    data1_f1.append(f1)
    
    print('Random State: ', i)
    
    print('Precision:', precision)
    print('Recall:', recall)
    print('f1:', f1)
    
    print('ROC AUC SCORE:', roc)
    print('BA score:', ba, '\n')


print('Mean Precision:', mean(data1_p))
print('Mean Recall:', mean(data1_r))
print('Mean f1:', mean(data1_f1))
print('Mean ROC AUC score:', mean(data1_roc))
print('Mean BA score:', mean(data1_ba))    
    

### Steps + Demographic: Band

This section evaluates the performance of models utilizing demographic and stepcount information that is discrete.

In [10]:
from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
data2_train_acc = []
data2_test_acc = []
data2_roc = []
data2_pr = []

for i in random_state :
    
    train_df, test_df = train_test_split(data2, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 4)
    clf.fit(X_train, Y_train)
    
    X_pred = clf.predict(X_test)

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    roc = roc_auc_score(Y_test, X_pred)
    pr = average_precision_score(Y_test, X_pred, average = 'weighted')

    
    data2_train_acc.append(svm_train_acc)
    data2_test_acc.append(svm_test_acc)
    data2_roc.append(roc)
    data2_pr.append(pr)

    
    #print('Random State: ', i)
    
    #print('Training Accuracy:', svm_train_acc)
    
    #print('Test Accuracy:', svm_test_acc)
    
    #print('ROC AUC SCORE:', roc)
    #print('PR AUC SCORE:', pr)



    #print('Mean Training Accuracy:', mean(data2_train_acc)) 
    #print('Mean Test Accuracy:', mean(data2_test_acc)) 
    #print('Mean ROC AUC score:', mean(data2_roc))
    #print('Mean PR AUC score:', mean(data2_pr))

In [11]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score


random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]

data2_roc = []
data2_ba = []
data2_p = []
data2_r = []
data2_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(data2, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
        
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 4)
    clf.fit(X_train, Y_train)

    X_pred = cross_val_predict(clf, X_train, Y_train, cv=5)
    
    conf = confusion_matrix(Y_train, X_pred)
    
    precision = precision_score(Y_train, X_pred)
    recall = recall_score(Y_train, X_pred)
    f1 = f1_score(Y_train, X_pred)
    
    roc = roc_auc_score(Y_train, X_pred)
    ba = balanced_accuracy_score(Y_train, X_pred)
    
    data2_roc.append(roc)
    data2_ba.append(ba)
    data2_p.append(precision)
    data2_r.append(recall)
    data2_f1.append(f1)
    
    #print('Random State: ', i)
    
    #print('Precision:', precision)
    #print('Recall:', recall)
    #print('f1:', f1)
    
    #print('ROC AUC SCORE:', roc)
    #print('BA score:', ba, '\n')


#print('Mean Precision:', mean(data2_p))
#print('Mean Recall:', mean(data2_r))
#print('Mean f1:', mean(data2_f1))
#print('Mean ROC AUC score:', mean(data2_roc))
#print('Mean BA score:', mean(data2_ba))    
    

Important files below:

1. no_band_final = KEGG + Demographic + stepcount data as continuous variables
2. norm_no_band_final = KEGG + Demographic + stepcount data as continuous variables
3. band_final = KEGG + Demographic + stepcount data as discrete variables
4. norm_band_final = KEGG + Demographic + stepcount data as discrete variables

In [3]:
no_band_final = pd.read_excel('final_no_band.xlsx')
no_band_final = no_band_final.set_index('studyID')

norm_no_band_final = pd.read_excel('norm_final_no_band.xlsx')
norm_no_band_final = norm_no_band_final.set_index('studyID')

band_final = pd.read_excel('final_band.xlsx')
band_final = band_final.set_index('studyID')

norm_band_final = pd.read_excel('norm_final_band.xlsx')
norm_band_final = norm_band_final.set_index('studyID')

### KEGG + Demographic: No Band

This section evaluates the performance of models utilizing KEGG features and demographic information that is continuous.

In [13]:
from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
no_band_train_acc = []
no_band_test_acc = []
no_band_roc = []
no_band_pr = []

for i in random_state :
    
    train_df, test_df = train_test_split(no_band_final, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']
        
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 7)
    clf.fit(X_train, Y_train) 
    
    X_pred = clf.predict(X_test)

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    roc = roc_auc_score(Y_test, X_pred)
    pr = average_precision_score(Y_test, X_pred, average = 'weighted')
    
    no_band_train_acc.append(svm_train_acc)
    no_band_test_acc.append(svm_test_acc)
    no_band_roc.append(roc)
    no_band_pr.append(pr)
    
    #print('Random State: ', i)
    
    #print('Training Accuracy:', svm_train_acc)
    
    #print('Test Accuracy:', svm_test_acc)
    
    #print('ROC AUC SCORE:', roc)
    #print('PR AUC SCORE:', pr)


#print('Mean Training Accuracy:', mean(no_band_train_acc)) 
#print('Mean Test Accuracy:', mean(no_band_test_acc)) 
#print('Mean ROC AUC score:', mean(no_band_roc))
#print('Mean PR AUC score:', mean(no_band_pr))

In [14]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score


random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]

no_band_roc = []
no_band_ba = []
no_band_p = []
no_band_r = []
no_band_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(no_band_final, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
        
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 7)
    clf.fit(X_train, Y_train)

    X_pred = cross_val_predict(clf, X_train, Y_train, cv=5)
    
    conf = confusion_matrix(Y_train, X_pred)
    
    precision = precision_score(Y_train, X_pred)
    recall = recall_score(Y_train, X_pred)
    f1 = f1_score(Y_train, X_pred)
    
    roc = roc_auc_score(Y_train, X_pred)
    ba = balanced_accuracy_score(Y_train, X_pred)
    
    no_band_roc.append(roc)
    no_band_ba.append(ba)
    no_band_p.append(precision)
    no_band_r.append(recall)
    no_band_f1.append(f1)
    
    #print('Random State: ', i)
    
    #print('Precision:', precision)
    #print('Recall:', recall)
    #print('f1:', f1)
    
    #print('ROC AUC SCORE:', roc)
    #print('BA score:', ba, '\n')


#print('Mean Precision:', mean(no_band_p))
#print('Mean Recall:', mean(no_band_r))
#print('Mean f1:', mean(no_band_f1))
#print('Mean ROC AUC score:', mean(no_band_roc))
#print('Mean BA score:', mean(no_band_ba))    
    

### KEGG + Demographic: No Band, Normalized

This section evaluates the performance of models utilizing KEGG features and demographic information that is continuous and normalized.

In [15]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score


random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]

norm_no_band_roc = []
norm_no_band_ba = []
norm_no_band_p = []
norm_no_band_r = []
norm_no_band_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(norm_no_band_final, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
        
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 7)
    clf.fit(X_train, Y_train)

    X_pred = cross_val_predict(clf, X_train, Y_train, cv=5)
    
    conf = confusion_matrix(Y_train, X_pred)
    
    precision = precision_score(Y_train, X_pred)
    recall = recall_score(Y_train, X_pred)
    f1 = f1_score(Y_train, X_pred)
    
    roc = roc_auc_score(Y_train, X_pred)
    ba = balanced_accuracy_score(Y_train, X_pred)
    
    norm_no_band_roc.append(roc)
    norm_no_band_ba.append(ba)
    norm_no_band_p.append(precision)
    norm_no_band_r.append(recall)
    norm_no_band_f1.append(f1)
    
    #print('Random State: ', i)
    
    #print('Precision:', precision)
    #print('Recall:', recall)
    #print('f1:', f1)
    
    #print('ROC AUC SCORE:', roc)
    #print('BA score:', ba, '\n')


#print('Mean Precision:', mean(norm_no_band_p))
#print('Mean Recall:', mean(norm_no_band_r))
#print('Mean f1:', mean(norm_no_band_f1))
#print('Mean ROC AUC score:', mean(norm_no_band_roc))
#print('Mean BA score:', mean(norm_no_band_ba))    
    

### KEGG + Demographic: Band

This section evaluates the performance of models utilizing KEGG features and demographic information that is discrete.

In [16]:
from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
band_train_acc = []
band_test_acc = []
band_roc = []
band_pr = []

for i in random_state :
    
    train_df, test_df = train_test_split(band_final, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 7)
    clf.fit(X_train, Y_train)  
    
    X_pred = clf.predict(X_test)

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    roc = roc_auc_score(Y_test, X_pred)
    pr = average_precision_score(Y_test, X_pred, average = 'weighted')
    
    band_train_acc.append(svm_train_acc)
    band_test_acc.append(svm_test_acc)
    band_roc.append(roc)
    band_pr.append(pr)
    
    #print('Random State: ', i)
    
    #print('Training Accuracy:', svm_train_acc)
    
    #print('Test Accuracy:', svm_test_acc)
    
    #print('ROC AUC SCORE:', roc)
    #print('PR AUC SCORE:', pr)


#print('Mean Training Accuracy:', mean(band_train_acc)) 
#print('Mean Test Accuracy:', mean(band_test_acc)) 
#print('Mean ROC AUC score:', mean(band_roc))
#print('Mean PR AUC score:', mean(band_pr))

In [17]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score


random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]

band_roc = []
band_ba = []
band_p = []
band_r = []
band_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(band_final, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
        
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 7)
    clf.fit(X_train, Y_train)

    X_pred = cross_val_predict(clf, X_train, Y_train, cv=5)
    
    conf = confusion_matrix(Y_train, X_pred)
    
    precision = precision_score(Y_train, X_pred)
    recall = recall_score(Y_train, X_pred)
    f1 = f1_score(Y_train, X_pred)
    
    roc = roc_auc_score(Y_train, X_pred)
    ba = balanced_accuracy_score(Y_train, X_pred)
    
    band_roc.append(roc)
    band_ba.append(ba)
    band_p.append(precision)
    band_r.append(recall)
    band_f1.append(f1)
    
    #print('Random State: ', i)
    
    #print('Precision:', precision)
    #print('Recall:', recall)
    #print('f1:', f1)
    
    #print('ROC AUC SCORE:', roc)
    #print('BA score:', ba, '\n')


#print('Mean Precision:', mean(band_p))
#print('Mean Recall:', mean(band_r))
#print('Mean f1:', mean(band_f1))
#print('Mean ROC AUC score:', mean(band_roc))
#print('Mean BA score:', mean(band_ba))    
    

### KEGG + Demographic: Band, Normalized

This section evaluates the performance of models utilizing KEGG features and demographic information that is discrete and normalized.

In [18]:
from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
norm_band_train_acc = []
norm_band_test_acc = []
norm_band_roc = []
norm_band_pr = []

for i in random_state :
    
    train_df, test_df = train_test_split(norm_band_final, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 7)
    clf.fit(X_train, Y_train)
    
    X_pred = clf.predict(X_test)

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    roc = roc_auc_score(Y_test, X_pred)
    pr = average_precision_score(Y_test, X_pred, average = 'weighted')

    norm_band_train_acc.append(svm_train_acc)
    norm_band_test_acc.append(svm_test_acc)
    norm_band_roc.append(roc)
    norm_band_pr.append(pr)
    
    #print('Random State: ', i)
    
    #print('Training Accuracy:', svm_train_acc)
    
    #print('Test Accuracy:', svm_test_acc)
    
    #print('ROC AUC SCORE:', roc)
    #print('PR AUC SCORE:', pr)


#print('Mean Training Accuracy:', mean(norm_band_train_acc)) 
#print('Mean Test Accuracy:', mean(norm_band_test_acc))
#print('Mean ROC AUC score:', mean(norm_band_roc))
#print('Mean PR AUC score:', mean(norm_band_pr))

In [19]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score


random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]

norm_band_roc = []
norm_band_ba = []
norm_band_p = []
norm_band_r = []
norm_band_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(norm_band_final, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
        
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 7)
    clf.fit(X_train, Y_train)

    X_pred = cross_val_predict(clf, X_train, Y_train, cv=5)
    
    conf = confusion_matrix(Y_train, X_pred)
    
    precision = precision_score(Y_train, X_pred)
    recall = recall_score(Y_train, X_pred)
    f1 = f1_score(Y_train, X_pred)
    
    roc = roc_auc_score(Y_train, X_pred)
    ba = balanced_accuracy_score(Y_train, X_pred)
    
    norm_band_roc.append(roc)
    norm_band_ba.append(ba)
    norm_band_p.append(precision)
    norm_band_r.append(recall)
    norm_band_f1.append(f1)
    
    #print('Random State: ', i)
    
    #print('Precision:', precision)
    #print('Recall:', recall)
    #print('f1:', f1)
    
    #print('ROC AUC SCORE:', roc)
    #print('BA score:', ba, '\n')


#print('Mean Precision:', mean(norm_band_p))
#print('Mean Recall:', mean(norm_band_r))
#print('Mean f1:', mean(norm_band_f1))
#print('Mean ROC AUC score:', mean(norm_band_roc))
#print('Mean BA score:', mean(norm_band_ba))

### Import PCA dataset and retain only first 6 PC's which account for 80% variance

In [5]:
PCA = pd.read_csv('PCA_final.csv')

In [6]:
PCA.index = data1.index

In [7]:
PCA = PCA.iloc[:, 1:7]
PCA.head(1)

Unnamed: 0_level_0,PC1,PC2,PC3,PC4,PC5,PC6
studyID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2001,-469.941805,0.049335,0.010597,-0.025762,0.020042,-0.004671


Merge PCA dataframe with demographic and step data

Important files below:

1. no_band_PCA = PCA + Demographic + stepcount as continuous variables
2. norm_no_band_PCA = PCA + Demographic + stepcount, Normalized as continuous variables
3. band_PCA = PCA + Demographic + stepcount as discrete variables
4. norm_band_PCA = PCA + Demographic + stepcount, Normalized as discrete variables


In [8]:
no_band_PCA = pd.concat([PCA, data1], axis = 1, join = 'outer')
band_PCA = pd.concat([PCA, data2], axis = 1, join = 'outer')


In [9]:
from sklearn.preprocessing import MinMaxScaler

norm = MinMaxScaler()

norm_no_band_PCA = pd.DataFrame(norm.fit_transform(no_band_PCA), index = no_band_PCA.index, columns = no_band_PCA.columns.values)
norm_band_PCA = pd.DataFrame(norm.fit_transform(band_PCA), index = band_PCA.index, columns = band_PCA.columns.values)




PCA + Demographic: No Band

from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
no_band_train_PCA = []
no_band_test_PCA = []
no_band_PCA_roc = []
no_band_PCA_pr = []

for i in random_state :
    
    train_df, test_df = train_test_split(no_band_PCA, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = SVC(kernel = 'poly', gamma = 'scale')
    clf.fit(X_train, Y_train)
    
    X_pred = clf.predict(X_test)

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    roc = roc_auc_score(Y_test, X_pred)
    pr = average_precision_score(Y_test, X_pred, average = 'weighted')
    
    no_band_train_PCA.append(svm_train_acc)
    no_band_test_PCA.append(svm_test_acc)
    no_band_PCA_roc.append(roc)
    no_band_PCA_pr.append(pr)
    
    print('Random State: ', i)
    
    print('Training Accuracy:', svm_train_acc)
    
    print('Test Accuracy:', svm_test_acc)
    
    print('ROC AUC SCORE:', roc)
    print('PR AUC SCORE:', pr)


print('Mean Training Accuracy:', mean(no_band_train_PCA)) 
print('Mean Test Accuracy:', mean(no_band_test_PCA))
print('Mean ROC AUC score:', mean(no_band_PCA_roc))
print('Mean PR AUC score:', mean(no_band_PCA_pr))

In [25]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score


random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]

no_band_PCA_roc = []
no_band_PCA_ba = []
no_band_PCA_p = []
no_band_PCA_r = []
no_band_PCA_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(no_band_PCA, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
        
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 1)
    clf.fit(X_train, Y_train)

    X_pred = cross_val_predict(clf, X_train, Y_train, cv=5)
    
    conf = confusion_matrix(Y_train, X_pred)
    
    precision = precision_score(Y_train, X_pred)
    recall = recall_score(Y_train, X_pred)
    f1 = f1_score(Y_train, X_pred)
    
    roc = roc_auc_score(Y_train, X_pred)
    ba = balanced_accuracy_score(Y_train, X_pred)
    
    no_band_PCA_roc.append(roc)
    no_band_PCA_ba.append(ba)
    no_band_PCA_p.append(precision)
    no_band_PCA_r.append(recall)
    no_band_PCA_f1.append(f1)
    
    #print('Random State: ', i)
    
    #print('Precision:', precision)
    #print('Recall:', recall)
    #print('f1:', f1)
    
    #print('ROC AUC SCORE:', roc)
    #print('BA score:', ba, '\n')


#print('Mean Precision:', mean(no_band_PCA_p))
#print('Mean Recall:', mean(no_band_PCA_r))
#print('Mean f1:', mean(no_band_PCA_f1))
#print('Mean ROC AUC score:', mean(no_band_PCA_roc))
#print('Mean BA score:', mean(no_band_PCA_ba))

### PCA + Demographic: No Band, Normalized

This section evaluates the performance of models utilizing PCA features and demographic information that is continuous and normalized.

In [26]:
from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
norm_no_band_train_PCA = []
norm_no_band_test_PCA = []
norm_no_band_PCA_roc = []
norm_no_band_PCA_pr = []

for i in random_state :
    
    train_df, test_df = train_test_split(norm_no_band_PCA, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 5)
    clf.fit(X_train, Y_train) 
    
    X_pred = clf.predict(X_test)

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    roc = roc_auc_score(Y_test, X_pred)
    pr = average_precision_score(Y_test, X_pred, average = 'weighted')

    norm_no_band_train_PCA.append(svm_train_acc)
    norm_no_band_test_PCA.append(svm_test_acc)
    norm_no_band_PCA_roc.append(roc)
    norm_no_band_PCA_pr.append(pr)
    
    #print('Random State: ', i)
    
    #print('Training Accuracy:', svm_train_acc)
    
    #print('Test Accuracy:', svm_test_acc)
    
    #print('ROC AUC SCORE:', roc)
    #print('PR AUC SCORE:', pr)

#print('Mean Training Accuracy:', mean(norm_no_band_train_PCA)) 
#print('Mean Test Accuracy:', mean(norm_no_band_test_PCA)) 
#print('Mean ROC AUC score:', mean(norm_no_band_PCA_roc))
#print('Mean PR AUC score:', mean(norm_no_band_PCA_pr))

In [27]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score


random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]

norm_no_band_PCA_roc = []
norm_no_band_PCA_ba = []
norm_no_band_PCA_p = []
norm_no_band_PCA_r = []
norm_no_band_PCA_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(norm_no_band_PCA, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
        
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 1)
    clf.fit(X_train, Y_train)

    X_pred = cross_val_predict(clf, X_train, Y_train, cv=5)
    
    conf = confusion_matrix(Y_train, X_pred)
    
    precision = precision_score(Y_train, X_pred)
    recall = recall_score(Y_train, X_pred)
    f1 = f1_score(Y_train, X_pred)
    
    roc = roc_auc_score(Y_train, X_pred)
    ba = balanced_accuracy_score(Y_train, X_pred)
    
    norm_no_band_PCA_roc.append(roc)
    norm_no_band_PCA_ba.append(ba)
    norm_no_band_PCA_p.append(precision)
    norm_no_band_PCA_r.append(recall)
    norm_no_band_PCA_f1.append(f1)
    
    #print('Random State: ', i)
    
    #print('Precision:', precision)
    #print('Recall:', recall)
    #print('f1:', f1)
    
    #print('ROC AUC SCORE:', roc)
    #print('BA score:', ba, '\n')


#print('Mean Precision:', mean(norm_no_band_PCA_p))
#print('Mean Recall:', mean(norm_no_band_PCA_r))
#print('Mean f1:', mean(norm_no_band_PCA_f1))
#print('Mean ROC AUC score:', mean(norm_no_band_PCA_roc))
#print('Mean BA score:', mean(norm_no_band_PCA_ba))

### PCA + Demographic: Band

This section evaluates the performance of models utilizing PCA features and demographic information that is discrete.

In [28]:
from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
band_train_PCA = []
band_test_PCA = []
band_PCA_roc = []
band_PCA_pr = []

for i in random_state :
    
    train_df, test_df = train_test_split(band_PCA, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 7)
    clf.fit(X_train, Y_train)
    
    X_pred = clf.predict(X_test)

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    roc = roc_auc_score(Y_test, X_pred)
    pr = average_precision_score(Y_test, X_pred, average = 'weighted')

    
    band_train_PCA.append(svm_train_acc)
    band_test_PCA.append(svm_test_acc)
    band_PCA_roc.append(roc)
    band_PCA_pr.append(pr)
    
    #print('Random State: ', i)
    
    #print('Training Accuracy:', svm_train_acc)
    
    #print('Test Accuracy:', svm_test_acc)
    
    #print('ROC AUC SCORE:', roc)
    #print('PR AUC SCORE:', pr)


#print('Mean Training Accuracy:', mean(band_train_PCA)) 
#print('Mean Test Accuracy:', mean(band_test_PCA)) 
#print('Mean ROC AUC score:', mean(band_PCA_roc))
#print('Mean PR AUC score:', mean(norm_no_band_PCA_pr))

In [29]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score


random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]

band_PCA_roc = []
band_PCA_ba = []
band_PCA_p = []
band_PCA_r = []
band_PCA_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(band_PCA, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
        
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 7)
    clf.fit(X_train, Y_train)

    X_pred = cross_val_predict(clf, X_train, Y_train, cv=5)
    
    conf = confusion_matrix(Y_train, X_pred)
    
    precision = precision_score(Y_train, X_pred)
    recall = recall_score(Y_train, X_pred)
    f1 = f1_score(Y_train, X_pred)
    
    roc = roc_auc_score(Y_train, X_pred)
    ba = balanced_accuracy_score(Y_train, X_pred)
    
    band_PCA_roc.append(roc)
    band_PCA_ba.append(ba)
    band_PCA_p.append(precision)
    band_PCA_r.append(recall)
    band_PCA_f1.append(f1)
    
    #print('Random State: ', i)
    
    #print('Precision:', precision)
    #print('Recall:', recall)
    #print('f1:', f1)
    
    #print('ROC AUC SCORE:', roc)
    #print('BA score:', ba, '\n')


#print('Mean Precision:', mean(band_PCA_p))
#print('Mean Recall:', mean(band_PCA_r))
#print('Mean f1:', mean(band_PCA_f1))
#print('Mean ROC AUC score:', mean(band_PCA_roc))
#print('Mean BA score:', mean(band_PCA_ba))

### PCA + Demographic: Band, Normalized

This section evaluates the performance of models utilizing PCA features and demographic information that is discrete and normalized.

In [30]:
from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
norm_band_train_PCA = []
norm_band_test_PCA = []
norm_band_PCA_roc = []
norm_band_PCA_pr = []

for i in random_state :
    
    train_df, test_df = train_test_split(norm_band_PCA, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 4)
    clf.fit(X_train, Y_train)  
    
    X_pred = clf.predict(X_test)

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    roc = roc_auc_score(Y_test, X_pred)
    pr = average_precision_score(Y_test, X_pred, average = 'weighted')
    
    norm_band_train_PCA.append(svm_train_acc)
    norm_band_test_PCA.append(svm_test_acc)
    norm_band_PCA_roc.append(roc)
    norm_band_PCA_pr.append(pr)
    
    #print('Random State: ', i)
    
    #print('Training Accuracy:', svm_train_acc)
    
    #print('Test Accuracy:', svm_test_acc)
    
    #print('ROC AUC SCORE:', roc)
    #print('PR AUC SCORE:', pr)


#print('Mean Training Accuracy:', mean(norm_band_train_PCA)) 
#print('Mean Test Accuracy:', mean(norm_band_test_PCA)) 
#print('Mean ROC AUC score:', mean(norm_band_PCA_roc))
#print('Mean PR AUC score:', mean(norm_band_PCA_pr))

In [31]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score


random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]

norm_band_PCA_roc = []
norm_band_PCA_ba = []
norm_band_PCA_p = []
norm_band_PCA_r = []
norm_band_PCA_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(norm_band_PCA, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
        
    clf = SVC(kernel = 'poly', gamma = 'scale', degree = 7)
    clf.fit(X_train, Y_train)

    X_pred = cross_val_predict(clf, X_train, Y_train, cv=5)
    
    conf = confusion_matrix(Y_train, X_pred)
    
    precision = precision_score(Y_train, X_pred)
    recall = recall_score(Y_train, X_pred)
    f1 = f1_score(Y_train, X_pred)
    
    roc = roc_auc_score(Y_train, X_pred)
    ba = balanced_accuracy_score(Y_train, X_pred)
    
    norm_band_PCA_roc.append(roc)
    norm_band_PCA_ba.append(ba)
    norm_band_PCA_p.append(precision)
    norm_band_PCA_r.append(recall)
    norm_band_PCA_f1.append(f1)
    
    #print('Random State: ', i)
    
    #print('Precision:', precision)
    #print('Recall:', recall)
    #print('f1:', f1)
    
    #print('ROC AUC SCORE:', roc)
    #print('BA score:', ba, '\n')


#print('Mean Precision:', mean(norm_band_PCA_p))
#print('Mean Recall:', mean(norm_band_PCA_r))
#print('Mean f1:', mean(norm_band_PCA_f1))
#print('Mean ROC AUC score:', mean(norm_band_PCA_roc))
#print('Mean BA score:', mean(norm_band_PCA_ba))

The following code creates the model results dataframe

In [35]:
results = [d1_train_acc, d1_test_acc, #excludes data1, norm_no_band, no_band_PCA
                 d2_train_acc, d2_test_acc,
                 data2_train_acc, data2_test_acc,
                 no_band_train_acc, no_band_test_acc,
                 band_train_acc, band_test_acc,
                 norm_band_train_acc, norm_band_test_acc,
                 norm_no_band_train_PCA, norm_no_band_test_PCA,
                 band_train_PCA, band_test_PCA,
                 norm_band_train_PCA, norm_band_test_PCA]

results_roc = [d1_roc, d2_roc,
                data2_roc, no_band_roc,
                band_roc, norm_band_roc,
                norm_no_band_PCA_roc, band_PCA_roc, norm_band_PCA_roc] ##excludes data1

#results_p = [d1_p, d2_p,
#                data2_p, no_band_p,
#                band_p, norm_band_p,
#                norm_no_band_PCA_p, band_PCA_p, norm_band_PCA_p]

#results_r = [d1_r, d2_r,
#                data2_r, no_band_r,
#                band_r, norm_band_r,
#                norm_no_band_PCA_r, band_PCA_r, norm_band_PCA_r]

#results_f1 = [d1_f1, d2_f1,
#                data2_f1, no_band_f1,
#                band_f1, norm_band_f1,
#                norm_no_band_PCA_f1, band_PCA_f1, norm_band_PCA_f1]



results_mean = [mean(d1_train_acc), mean(d1_test_acc),
                 mean(d2_train_acc), mean(d2_test_acc),
                 mean(data2_train_acc), mean(data2_test_acc),
                 mean(no_band_train_acc), mean(no_band_test_acc),
                 mean(band_train_acc), mean(band_test_acc),
                 mean(norm_band_train_acc), mean(norm_band_test_acc),
                 mean(norm_no_band_train_PCA), mean(norm_no_band_test_PCA),
                 mean(band_train_PCA), mean(band_test_PCA),
                 mean(norm_band_train_PCA), mean(norm_band_test_PCA)]

results_roc_mean = [mean(d1_roc), mean(d2_roc),
                      mean(data2_roc), mean(no_band_roc),
                      mean(band_roc), mean(norm_band_roc),
                      mean(norm_no_band_PCA_roc), mean(band_PCA_roc), mean(norm_band_PCA_roc)]


#results_p_mean = [mean(d1_p), mean(d2_p),
#                    mean(data2_p), mean(no_band_p),
#                    mean(band_p), mean(norm_band_p),
#                    mean(norm_no_band_PCA_p), mean(band_PCA_p), mean(norm_band_PCA_p)]

#results_r_mean = [mean(d1_r), mean(d2_r),
#                    mean(data2_r), mean(no_band_r),
#                    mean(band_r), mean(norm_band_r),
#                    mean(norm_no_band_PCA_r), mean(band_PCA_r), mean(norm_band_PCA_r)]

#results_f1_mean = [mean(d1_f1), mean(d2_f1), 
#                    mean(data2_f1), mean(no_band_f1),
#                    mean(band_f1), mean(norm_band_f1),
#                    mean(norm_no_band_PCA_f1), mean(band_PCA_f1), mean(norm_band_PCA_f1)]



index = ['Dem: No Band', '* Dem: No Band',
         'Dem: Band', '* Dem: Band', 
         'Dem + Step: Band', '* Dem + Step: Band',
         'Dem + Step + KEGG: No Band', '* Dem + Step + KEGG: No Band',
         'Dem + Step + KEGG: Band', '* Dem + Step + KEGG: Band',
         'Dem + Step + KEGG: Band, Normalized', '* Dem + Step + KEGG: Band, Normalized',
         'Dem + Step + PCA: No Band, Normalized', '* Dem + Step + PCA: No Band, Normalized',
         'Dem + Step + PCA: Band', '* Dem + Step + PCA: Band',
         'Dem + Step + PCA: Band, Normalized', '* Dem + Step + PCA: Band, Normalized']

roc_index = ['Dem: No Band', 
                'Dem: Band', 
                'Dem + Step: Band',
                'Dem + Step + KEGG: No Band', 
                'Dem + Step + KEGG: Band', 
                'Dem + Step + KEGG: Band, Normalized',
               'Dem + Step + PCA: No Band, Normalized',
               'Dem + Step + PCA: Band', 
               'Dem + Step + PCA: Band, Normalized']

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]

svm_table = pd.DataFrame(results, columns = random_state, index = index)
svm_table['Average Model Performance'] = results_mean

svm_roc_table = pd.DataFrame(results_roc, columns = random_state, index = roc_index)
svm_roc_table['Average ROC'] = results_roc_mean

#svm_p_table = pd.DataFrame(results_p, columns = random_state, index = roc_index)
#svm_p_table['Average Precision'] = results_p_mean

#svm_r_table = pd.DataFrame(results_r, columns = random_state, index = roc_index)
#svm_r_table['Average Recall'] = results_r_mean

#svm_f1_table = pd.DataFrame(results_f1, columns = random_state, index = roc_index)
#svm_f1_table['Average f1'] = results_f1_mean

In [36]:
svm_table = svm_table.T
svm_roc_table = svm_roc_table.T
#svm_p_table = svm_p_table.T
#svm_r_table = svm_r_table.T
#svm_f1_table = svm_f1_table.T

svm_table shows the results for all models, the code below selects the top from each category

In [37]:
pd.set_option('display.max_columns', 24)
svm_table = svm_table.copy()
svm_roc_table = svm_roc_table.copy()

result = svm_table.loc[:, ['* Dem: No Band',
         '* Dem: Band', 
        '* Dem + Step: No Band',
        '* Dem + Step: Band',
        '* Dem + Step + KEGG: No Band',
        '* Dem + Step + KEGG: No Band, Normalized',
        '* Dem + Step + KEGG: Band',
        '* Dem + Step + KEGG: Band, Normalized', '* Dem + Step + PCA: No Band',
        '* Dem + Step + PCA: No Band, Normalized',
        '* Dem + Step + PCA: Band',
        '* Dem + Step + PCA: Band, Normalized']]

result_roc = svm_roc_table.loc[:, roc_index]

In [101]:
result.to_excel("svm_model_results.xlsx")
result_roc.to_excel("svm_roc_results.xlsx")

## Extension from original work 

Uses GridSearchCV for hyperparameter tuning

In [40]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import statistics


### Demographic - Discretized

In [42]:
train_df, test_df = train_test_split(d1, test_size=0.2, random_state= 42)
    
X_train = train_df.drop('Weight_loss_band', axis=1)
Y_train = train_df['Weight_loss_band']  
    
X_test  = test_df.drop('Weight_loss_band', axis=1)
Y_test = test_df['Weight_loss_band']  

Grid for Randomized Search CV - Hyperparameter Tuning

In [43]:
# Penalty parameter C of the error term.

C = [int(x) for x in np.linspace(start = 1, stop = 350, num = 3)]

# Specifies the kernel type to be used in the algorithm

kernel = ['linear', 'poly', 'rbf', 'sigmoid']

# Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.

degree = [int(x) for x in np.linspace(start = 2, stop = 7, num = 1)]

# Gamma: Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.

gamma = ['scale', 'auto']

In [44]:
#Import svm model
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC()

random_grid = {'C': C,
               'kernel': kernel,
               'degree': degree,
               'gamma': gamma}

svm_random= RandomizedSearchCV(estimator = clf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, 
                               random_state=42, iid = True)

svm_random.fit(X_train, Y_train);

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] kernel=linear, gamma=scale, degree=2, C=1 .......................
[CV] ........ kernel=linear, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=linear, gamma=scale, degree=2, C=1 .......................
[CV] ........ kernel=linear, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=linear, gamma=scale, degree=2, C=1 .......................
[CV] ........ kernel=linear, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=poly, gamma=scale, degree=2, C=1 .........................
[CV] .......... kernel=poly, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=poly, gamma=scale, degree=2, C=1 .........................
[CV] .......... kernel=poly, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=poly, gamma=scale, degree=2, C=1 .........................
[CV] .......... kernel=poly, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=rbf, gamma=scale, degree=2, C=1 ..........................
[CV] ...........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ........... kernel=poly, gamma=auto, degree=2, C=1, total=   0.2s
[CV] kernel=poly, gamma=auto, degree=2, C=1 ..........................
[CV] ........... kernel=poly, gamma=auto, degree=2, C=1, total=   0.5s
[CV] kernel=rbf, gamma=auto, degree=2, C=1 ...........................
[CV] ............ kernel=rbf, gamma=auto, degree=2, C=1, total=   0.0s
[CV] kernel=rbf, gamma=auto, degree=2, C=1 ...........................
[CV] ............ kernel=rbf, gamma=auto, degree=2, C=1, total=   0.0s
[CV] kernel=rbf, gamma=auto, degree=2, C=1 ...........................
[CV] ............ kernel=rbf, gamma=auto, degree=2, C=1, total=   0.0s
[CV] kernel=sigmoid, gamma=auto, degree=2, C=1 .......................
[CV] ........ kernel=sigmoid, gamma=auto, degree=2, C=1, total=   0.0s
[CV] kernel=sigmoid, gamma=auto, degree=2, C=1 .......................
[CV] ........ kernel=sigmoid, gamma=auto, degree=2, C=1, total=   0.0s
[CV] kernel=sigmoid, gamma=auto, degree=2, C=1 .......................
[CV] .

[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:  1.4min finished


In [45]:
svm_random.best_params_

{'kernel': 'rbf', 'gamma': 'auto', 'degree': 2, 'C': 175}

In [46]:
#Import svm model
from sklearn import svm
from sklearn import metrics


clf = svm.SVC(C = 175, kernel = 'rbf', gamma = 'auto', degree = 2)
clf.fit(X_train,Y_train)

y_pred = clf.predict(X_test)

# Model Accuracy: how often is the classifier correct?
#print("Accuracy:", metrics.accuracy_score(Y_test, y_pred))
#print("Precision:",metrics.precision_score(Y_test, y_pred))
#print("Recall:",metrics.recall_score(Y_test, y_pred))
#print("f1:",metrics.f1_score(Y_test, y_pred))
#print("ROC AUC:",metrics.roc_auc_score(Y_test, y_pred))

In [47]:
random_state = list(range(1,300))


d1_accuracy = []
d1_roc_auc = []

d1_precision = []
d1_recall = []
d1_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(d1, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(C = 175, kernel = 'rbf', gamma = 'auto', degree = 2)
    clf.fit(X_train,Y_train)

    y_pred = clf.predict(X_test)
    
    
    d1_accuracy.append(metrics.accuracy_score(Y_test, y_pred))
    d1_precision.append(metrics.precision_score(Y_test, y_pred))
    d1_recall.append(metrics.recall_score(Y_test, y_pred))
    d1_f1.append(metrics.f1_score(Y_test, y_pred))
    #d1_roc_auc.append(metrics.roc_auc_score(Y_test, y_pred))

#print('Average Accuracy:', str(round(np.mean(d1_accuracy), 2)), '+/-', str(round(statistics.stdev(d1_accuracy), 2))) 

#print('Average ROC AUC Score:', np.mean(d1_roc_auc), '+/-', statistics.stdev(d1_roc_auc))

#print('Average Precision:', str(round(np.mean(d1_precision), 2)), '+/-', str(round(statistics.stdev(d1_precision), 2)))

#print('Average Recall:', str(round(np.mean(d1_recall), 2)), '+/-', str(round(statistics.stdev(d1_recall), 2)))

#print('Average f1 Score:', str(round(np.mean(d1_f1), 2)), '+/-', str(round(statistics.stdev(d1_f1), 2)))


### Demographic - continuous

In [48]:
train_df, test_df = train_test_split(d2, test_size=0.2, random_state= 42)
    
X_train = train_df.drop('Weight_loss_band', axis=1)
Y_train = train_df['Weight_loss_band']  
    
X_test  = test_df.drop('Weight_loss_band', axis=1)
Y_test = test_df['Weight_loss_band']  

In [49]:
# Penalty parameter C of the error term.

C = [int(x) for x in np.linspace(start = 1, stop = 350, num = 3)]

# Specifies the kernel type to be used in the algorithm

kernel = ['linear', 'poly', 'rbf', 'sigmoid']

# Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.

degree = [int(x) for x in np.linspace(start = 2, stop = 7, num = 1)]

# Gamma: Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.

gamma = ['scale', 'auto']

In [50]:
#Import svm model
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC()

random_grid = {'C': C,
               'kernel': kernel,
               'degree': degree,
               'gamma': gamma}

svm_random= RandomizedSearchCV(estimator = clf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, 
                               random_state=42, iid = True)

svm_random.fit(X_train, Y_train);

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] kernel=linear, gamma=scale, degree=2, C=1 .......................
[CV] ........ kernel=linear, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=linear, gamma=scale, degree=2, C=1 .......................
[CV] ........ kernel=linear, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=linear, gamma=scale, degree=2, C=1 .......................
[CV] ........ kernel=linear, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=poly, gamma=scale, degree=2, C=1 .........................
[CV] .......... kernel=poly, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=poly, gamma=scale, degree=2, C=1 .........................
[CV] .......... kernel=poly, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=poly, gamma=scale, degree=2, C=1 .........................
[CV] .......... kernel=poly, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=rbf, gamma=scale, degree=2, C=1 ..........................
[CV] ...........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ..... kernel=sigmoid, gamma=scale, degree=2, C=350, total=   0.0s
[CV] kernel=sigmoid, gamma=scale, degree=2, C=350 ....................
[CV] ..... kernel=sigmoid, gamma=scale, degree=2, C=350, total=   0.0s
[CV] kernel=linear, gamma=auto, degree=2, C=350 ......................
[CV] ....... kernel=linear, gamma=auto, degree=2, C=350, total=   0.0s
[CV] kernel=linear, gamma=auto, degree=2, C=350 ......................
[CV] ....... kernel=linear, gamma=auto, degree=2, C=350, total=   0.0s
[CV] kernel=linear, gamma=auto, degree=2, C=350 ......................
[CV] ....... kernel=linear, gamma=auto, degree=2, C=350, total=   0.0s
[CV] kernel=poly, gamma=auto, degree=2, C=350 ........................
[CV] ......... kernel=poly, gamma=auto, degree=2, C=350, total=   0.0s
[CV] kernel=poly, gamma=auto, degree=2, C=350 ........................
[CV] ......... kernel=poly, gamma=auto, degree=2, C=350, total=   0.0s
[CV] kernel=poly, gamma=auto, degree=2, C=350 ........................
[CV] .

[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:    0.3s finished


In [51]:
svm_random.best_params_

{'kernel': 'poly', 'gamma': 'scale', 'degree': 2, 'C': 1}

In [52]:
#Import svm model
from sklearn import svm
from sklearn import metrics


clf = svm.SVC(C = 1, kernel = 'poly', gamma = 'scale', degree = 2)
clf.fit(X_train,Y_train)

y_pred = clf.predict(X_test)

# Model Accuracy: how often is the classifier correct?
#print("Accuracy:", metrics.accuracy_score(Y_test, y_pred))
#print("Precision:",metrics.precision_score(Y_test, y_pred))
#print("Recall:",metrics.recall_score(Y_test, y_pred))
#print("f1:",metrics.f1_score(Y_test, y_pred))
#print("ROC AUC:",metrics.roc_auc_score(Y_test, y_pred))

In [53]:
random_state = list(range(1,300))


d2_accuracy = []
d2_roc_auc = []

d2_precision = []
d2_recall = []
d2_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(d2, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(C = 1, kernel = 'poly', gamma = 'scale', degree = 2)
    clf.fit(X_train,Y_train)

    y_pred = clf.predict(X_test)
    
    
    d2_accuracy.append(metrics.accuracy_score(Y_test, y_pred))
    d2_precision.append(metrics.precision_score(Y_test, y_pred))
    d2_recall.append(metrics.recall_score(Y_test, y_pred))
    d2_f1.append(metrics.f1_score(Y_test, y_pred))
    #d2_roc_auc.append(metrics.roc_auc_score(Y_test, y_pred))

#print('Average Accuracy:', str(round(np.mean(d2_accuracy), 2)), '+/-', str(round(statistics.stdev(d2_accuracy), 2))) 

#print('Average ROC AUC Score:', np.mean(d1_roc_auc), '+/-', statistics.stdev(d1_roc_auc))

#print('Average Precision:', str(round(np.mean(d2_precision), 2)), '+/-', str(round(statistics.stdev(d2_precision), 2)))

#print('Average Recall:', str(round(np.mean(d2_recall), 2)), '+/-', str(round(statistics.stdev(d2_recall), 2)))

#print('Average f1 Score:', str(round(np.mean(d2_f1), 2)), '+/-', str(round(statistics.stdev(d2_f1), 2)))


### Demographic + Stepcount - discretized

train_df, test_df = train_test_split(data1, test_size=0.2, random_state= 42)
    
X_train = train_df.drop('Weight_loss_band', axis=1)
Y_train = train_df['Weight_loss_band']  
    
X_test  = test_df.drop('Weight_loss_band', axis=1)
Y_test = test_df['Weight_loss_band']  

# Penalty parameter C of the error term.

C = [int(x) for x in np.linspace(start = 1, stop = 350, num = 3)]

# Specifies the kernel type to be used in the algorithm

kernel = ['linear', 'poly', 'rbf', 'sigmoid']

# Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.

degree = [int(x) for x in np.linspace(start = 2, stop = 7, num = 1)]

# Gamma: Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.

gamma = ['scale', 'auto']

#Import svm model
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC()

random_grid = {'C': C,
               'kernel': kernel,
               'degree': degree,
               'gamma': gamma}

svm_random= RandomizedSearchCV(estimator = clf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, 
                               random_state=42, iid = True)

svm_random.fit(X_train, Y_train)

svm_random.best_params_

random_state = list(range(1,300))


data1_accuracy = []
data1_roc_auc = []

data1_precision = []
data1_recall = []
data1_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(data1, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(C = 1, kernel = 'poly', gamma = 'scale', degree = 2)
    clf.fit(X_train,Y_train)

    y_pred = clf.predict(X_test)
    
    
    data1_accuracy.append(metrics.accuracy_score(Y_test, y_pred))
    data1_precision.append(metrics.precision_score(Y_test, y_pred))
    data1_recall.append(metrics.recall_score(Y_test, y_pred))
    data1_f1.append(metrics.f1_score(Y_test, y_pred))
    #d2_roc_auc.append(metrics.roc_auc_score(Y_test, y_pred))

print('Average Accuracy:', str(round(np.mean(data1_accuracy), 2)), '+/-', str(round(statistics.stdev(data1_accuracy), 2))) 

#print('Average ROC AUC Score:', np.mean(d1_roc_auc), '+/-', statistics.stdev(d1_roc_auc))

print('Average Precision:', str(round(np.mean(data1_precision), 2)), '+/-', str(round(statistics.stdev(data1_precision), 2)))

print('Average Recall:', str(round(np.mean(data1_recall), 2)), '+/-', str(round(statistics.stdev(data1_recall), 2)))

print('Average f1 Score:', str(round(np.mean(data1_f1), 2)), '+/-', str(round(statistics.stdev(data1_f1), 2)))


### Demographic + Stepcount - continuous

In [54]:
train_df, test_df = train_test_split(data2, test_size=0.2, random_state= 42)
    
X_train = train_df.drop('Weight_loss_band', axis=1)
Y_train = train_df['Weight_loss_band']  
    
X_test  = test_df.drop('Weight_loss_band', axis=1)
Y_test = test_df['Weight_loss_band']  

In [55]:
# Penalty parameter C of the error term.

C = [int(x) for x in np.linspace(start = 1, stop = 350, num = 3)]

# Specifies the kernel type to be used in the algorithm

kernel = ['linear', 'poly', 'rbf', 'sigmoid']

# Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.

degree = [int(x) for x in np.linspace(start = 2, stop = 7, num = 1)]

# Gamma: Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.

gamma = ['scale', 'auto']

In [56]:
#Import svm model
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC()

random_grid = {'C': C,
               'kernel': kernel,
               'degree': degree,
               'gamma': gamma}

svm_random= RandomizedSearchCV(estimator = clf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, 
                               random_state=42, iid = True)

svm_random.fit(X_train, Y_train);

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] kernel=linear, gamma=scale, degree=2, C=1 .......................
[CV] ........ kernel=linear, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=linear, gamma=scale, degree=2, C=1 .......................
[CV] ........ kernel=linear, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=linear, gamma=scale, degree=2, C=1 .......................
[CV] ........ kernel=linear, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=poly, gamma=scale, degree=2, C=1 .........................
[CV] .......... kernel=poly, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=poly, gamma=scale, degree=2, C=1 .........................
[CV] .......... kernel=poly, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=poly, gamma=scale, degree=2, C=1 .........................
[CV] .......... kernel=poly, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=rbf, gamma=scale, degree=2, C=1 ..........................
[CV] ...........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ........ kernel=poly, gamma=scale, degree=2, C=350, total=   0.0s
[CV] kernel=poly, gamma=scale, degree=2, C=350 .......................
[CV] ........ kernel=poly, gamma=scale, degree=2, C=350, total=   0.0s
[CV] kernel=poly, gamma=scale, degree=2, C=350 .......................
[CV] ........ kernel=poly, gamma=scale, degree=2, C=350, total=   0.0s
[CV] kernel=rbf, gamma=scale, degree=2, C=350 ........................
[CV] ......... kernel=rbf, gamma=scale, degree=2, C=350, total=   0.0s
[CV] kernel=rbf, gamma=scale, degree=2, C=350 ........................
[CV] ......... kernel=rbf, gamma=scale, degree=2, C=350, total=   0.0s
[CV] kernel=rbf, gamma=scale, degree=2, C=350 ........................
[CV] ......... kernel=rbf, gamma=scale, degree=2, C=350, total=   0.0s
[CV] kernel=sigmoid, gamma=scale, degree=2, C=350 ....................
[CV] ..... kernel=sigmoid, gamma=scale, degree=2, C=350, total=   0.0s
[CV] kernel=sigmoid, gamma=scale, degree=2, C=350 ....................
[CV] .

[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:    0.3s finished


In [57]:
svm_random.best_params_

{'kernel': 'poly', 'gamma': 'scale', 'degree': 2, 'C': 1}

In [58]:
random_state = list(range(1,300))


data2_accuracy = []
data2_roc_auc = []

data2_precision = []
data2_recall = []
data2_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(data2, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(C = 1, kernel = 'poly', gamma = 'scale', degree = 2)
    clf.fit(X_train,Y_train)

    y_pred = clf.predict(X_test)
    
    
    data2_accuracy.append(metrics.accuracy_score(Y_test, y_pred))
    data2_precision.append(metrics.precision_score(Y_test, y_pred))
    data2_recall.append(metrics.recall_score(Y_test, y_pred))
    data2_f1.append(metrics.f1_score(Y_test, y_pred))
    #d2_roc_auc.append(metrics.roc_auc_score(Y_test, y_pred))

#print('Average Accuracy:', str(round(np.mean(data2_accuracy), 2)), '+/-', str(round(statistics.stdev(data2_accuracy), 2))) 

#print('Average ROC AUC Score:', np.mean(d1_roc_auc), '+/-', statistics.stdev(d1_roc_auc))

#print('Average Precision:', str(round(np.mean(data2_precision), 2)), '+/-', str(round(statistics.stdev(data2_precision), 2)))

#print('Average Recall:', str(round(np.mean(data2_recall), 2)), '+/-', str(round(statistics.stdev(data2_recall), 2)))

#print('Average f1 Score:', str(round(np.mean(data2_f1), 2)), '+/-', str(round(statistics.stdev(data2_f1), 2)))


### KEGG (Demographic + Stepcounts + Pathways) - discretized

In [59]:
train_df, test_df = train_test_split(band_final, test_size=0.2, random_state= 42)
    
X_train = train_df.drop('Weight_loss_band', axis=1)
Y_train = train_df['Weight_loss_band']  
    
X_test  = test_df.drop('Weight_loss_band', axis=1)
Y_test = test_df['Weight_loss_band']  

In [60]:
# Penalty parameter C of the error term.

C = [int(x) for x in np.linspace(start = 1, stop = 350, num = 3)]

# Specifies the kernel type to be used in the algorithm

kernel = ['linear', 'poly', 'rbf', 'sigmoid']

# Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.

degree = [int(x) for x in np.linspace(start = 2, stop = 7, num = 1)]

# Gamma: Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.

gamma = ['scale', 'auto']

In [61]:
#Import svm model
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC()

random_grid = {'C': C,
               'kernel': kernel,
               'degree': degree,
               'gamma': gamma}

svm_random= RandomizedSearchCV(estimator = clf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, 
                               random_state=42, iid = True)

svm_random.fit(X_train, Y_train);

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] kernel=linear, gamma=scale, degree=2, C=1 .......................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........ kernel=linear, gamma=scale, degree=2, C=1, total=  11.5s
[CV] kernel=linear, gamma=scale, degree=2, C=1 .......................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.5s remaining:    0.0s


[CV] ........ kernel=linear, gamma=scale, degree=2, C=1, total=   5.5s
[CV] kernel=linear, gamma=scale, degree=2, C=1 .......................
[CV] ........ kernel=linear, gamma=scale, degree=2, C=1, total=   4.9s
[CV] kernel=poly, gamma=scale, degree=2, C=1 .........................
[CV] .......... kernel=poly, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=poly, gamma=scale, degree=2, C=1 .........................
[CV] .......... kernel=poly, gamma=scale, degree=2, C=1, total=   0.7s
[CV] kernel=poly, gamma=scale, degree=2, C=1 .........................
[CV] .......... kernel=poly, gamma=scale, degree=2, C=1, total=   0.1s
[CV] kernel=rbf, gamma=scale, degree=2, C=1 ..........................
[CV] ........... kernel=rbf, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=rbf, gamma=scale, degree=2, C=1 ..........................
[CV] ........... kernel=rbf, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=rbf, gamma=scale, degree=2, C=1 ..........................
[CV] .

[CV] ....... kernel=linear, gamma=auto, degree=2, C=350, total=  15.4s
[CV] kernel=linear, gamma=auto, degree=2, C=350 ......................
[CV] ....... kernel=linear, gamma=auto, degree=2, C=350, total=   8.6s
[CV] kernel=linear, gamma=auto, degree=2, C=350 ......................
[CV] ....... kernel=linear, gamma=auto, degree=2, C=350, total=   7.5s
[CV] kernel=poly, gamma=auto, degree=2, C=350 ........................
[CV] ......... kernel=poly, gamma=auto, degree=2, C=350, total=   0.0s
[CV] kernel=poly, gamma=auto, degree=2, C=350 ........................
[CV] ......... kernel=poly, gamma=auto, degree=2, C=350, total=   0.5s
[CV] kernel=poly, gamma=auto, degree=2, C=350 ........................
[CV] ......... kernel=poly, gamma=auto, degree=2, C=350, total=   0.1s
[CV] kernel=rbf, gamma=auto, degree=2, C=350 .........................
[CV] .......... kernel=rbf, gamma=auto, degree=2, C=350, total=   0.0s
[CV] kernel=rbf, gamma=auto, degree=2, C=350 .........................
[CV] .

[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:  3.0min finished


In [65]:
svm_random.best_params_

{'kernel': 'rbf', 'gamma': 'scale', 'degree': 2, 'C': 1}

In [66]:
random_state = list(range(1,300))


band_accuracy = []
band_roc_auc = []

band_precision = []
band_recall = []
band_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(band_final, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(C = 1, kernel = 'rbf', gamma = 'scale', degree = 2)
    clf.fit(X_train,Y_train)

    y_pred = clf.predict(X_test)
    
    
    band_accuracy.append(metrics.accuracy_score(Y_test, y_pred))
    band_precision.append(metrics.precision_score(Y_test, y_pred))
    band_recall.append(metrics.recall_score(Y_test, y_pred))
    band_f1.append(metrics.f1_score(Y_test, y_pred))
    #d2_roc_auc.append(metrics.roc_auc_score(Y_test, y_pred))

#print('Average Accuracy:', str(round(np.mean(band_accuracy), 2)), '+/-', str(round(statistics.stdev(band_accuracy), 2))) 

#print('Average ROC AUC Score:', np.mean(d1_roc_auc), '+/-', statistics.stdev(d1_roc_auc))

#print('Average Precision:', str(round(np.mean(band_precision), 2)), '+/-', str(round(statistics.stdev(band_precision), 2)))

#print('Average Recall:', str(round(np.mean(band_recall), 2)), '+/-', str(round(statistics.stdev(band_recall), 2)))

#print('Average f1 Score:', str(round(np.mean(band_f1), 2)), '+/-', str(round(statistics.stdev(band_f1), 2)))


### KEGG (Demographic + Stepcounts + Pathways) - discretized, normalized

In [67]:
train_df, test_df = train_test_split(norm_band_final, test_size=0.2, random_state= 42)
    
X_train = train_df.drop('Weight_loss_band', axis=1)
Y_train = train_df['Weight_loss_band']  
    
X_test  = test_df.drop('Weight_loss_band', axis=1)
Y_test = test_df['Weight_loss_band']  

In [68]:
# Penalty parameter C of the error term.

C = [int(x) for x in np.linspace(start = 1, stop = 350, num = 3)]

# Specifies the kernel type to be used in the algorithm

kernel = ['linear', 'poly', 'rbf', 'sigmoid']

# Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.

degree = [int(x) for x in np.linspace(start = 2, stop = 7, num = 1)]

# Gamma: Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.

gamma = ['scale', 'auto']

In [69]:
#Import svm model
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC()

random_grid = {'C': C,
               'kernel': kernel,
               'degree': degree,
               'gamma': gamma}

svm_random= RandomizedSearchCV(estimator = clf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, 
                               random_state=42, iid = True)

svm_random.fit(X_train, Y_train);

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] kernel=linear, gamma=scale, degree=2, C=1 .......................
[CV] ........ kernel=linear, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=linear, gamma=scale, degree=2, C=1 .......................
[CV] ........ kernel=linear, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=linear, gamma=scale, degree=2, C=1 .......................
[CV] ........ kernel=linear, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=poly, gamma=scale, degree=2, C=1 .........................
[CV] .......... kernel=poly, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=poly, gamma=scale, degree=2, C=1 .........................
[CV] .......... kernel=poly, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=poly, gamma=scale, degree=2, C=1 .........................
[CV] .......... kernel=poly, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=rbf, gamma=scale, degree=2, C=1 ..........................
[CV] ...........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ...... kernel=sigmoid, gamma=auto, degree=2, C=175, total=   0.0s
[CV] kernel=sigmoid, gamma=auto, degree=2, C=175 .....................
[CV] ...... kernel=sigmoid, gamma=auto, degree=2, C=175, total=   0.0s
[CV] kernel=sigmoid, gamma=auto, degree=2, C=175 .....................
[CV] ...... kernel=sigmoid, gamma=auto, degree=2, C=175, total=   0.0s
[CV] kernel=linear, gamma=scale, degree=2, C=350 .....................
[CV] ...... kernel=linear, gamma=scale, degree=2, C=350, total=   0.0s
[CV] kernel=linear, gamma=scale, degree=2, C=350 .....................
[CV] ...... kernel=linear, gamma=scale, degree=2, C=350, total=   0.0s
[CV] kernel=linear, gamma=scale, degree=2, C=350 .....................
[CV] ...... kernel=linear, gamma=scale, degree=2, C=350, total=   0.0s
[CV] kernel=poly, gamma=scale, degree=2, C=350 .......................
[CV] ........ kernel=poly, gamma=scale, degree=2, C=350, total=   0.0s
[CV] kernel=poly, gamma=scale, degree=2, C=350 .......................
[CV] .

[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:    0.3s finished


In [70]:
svm_random.best_params_

{'kernel': 'poly', 'gamma': 'scale', 'degree': 2, 'C': 1}

In [71]:
random_state = list(range(1,300))


norm_band_accuracy = []
norm_band_roc_auc = []

norm_band_precision = []
norm_band_recall = []
norm_band_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(norm_band_final, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(C = 1, kernel = 'poly', gamma = 'scale', degree = 2)
    clf.fit(X_train,Y_train)

    y_pred = clf.predict(X_test)
    
    
    norm_band_accuracy.append(metrics.accuracy_score(Y_test, y_pred))
    norm_band_precision.append(metrics.precision_score(Y_test, y_pred))
    norm_band_recall.append(metrics.recall_score(Y_test, y_pred))
    norm_band_f1.append(metrics.f1_score(Y_test, y_pred))
    #d2_roc_auc.append(metrics.roc_auc_score(Y_test, y_pred))

#print('Average Accuracy:', str(round(np.mean(norm_band_accuracy), 2)), '+/-', str(round(statistics.stdev(norm_band_accuracy), 2))) 

#print('Average ROC AUC Score:', np.mean(d1_roc_auc), '+/-', statistics.stdev(d1_roc_auc))

#print('Average Precision:', str(round(np.mean(norm_band_precision), 2)), '+/-', str(round(statistics.stdev(norm_band_precision), 2)))

#print('Average Recall:', str(round(np.mean(norm_band_recall), 2)), '+/-', str(round(statistics.stdev(norm_band_recall), 2)))

#print('Average f1 Score:', str(round(np.mean(norm_band_f1), 2)), '+/-', str(round(statistics.stdev(norm_band_f1), 2)))


### KEGG (Demographic + Stepcounts + Pathways) - continuous

In [72]:
train_df, test_df = train_test_split(no_band_final, test_size=0.2, random_state= 42)
    
X_train = train_df.drop('Weight_loss_band', axis=1)
Y_train = train_df['Weight_loss_band']  
    
X_test  = test_df.drop('Weight_loss_band', axis=1)
Y_test = test_df['Weight_loss_band']  

In [73]:
# Penalty parameter C of the error term.

C = [int(x) for x in np.linspace(start = 1, stop = 350, num = 3)]

# Specifies the kernel type to be used in the algorithm

kernel = ['linear', 'poly', 'rbf', 'sigmoid']

# Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.

degree = [int(x) for x in np.linspace(start = 2, stop = 7, num = 1)]

# Gamma: Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.

gamma = ['scale', 'auto']

In [74]:
#Import svm model
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC()

random_grid = {'C': C,
               'kernel': kernel,
               'degree': degree,
               'gamma': gamma}

svm_random= RandomizedSearchCV(estimator = clf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, 
                               random_state=42, iid = True)

svm_random.fit(X_train, Y_train);

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] kernel=linear, gamma=scale, degree=2, C=1 .......................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........ kernel=linear, gamma=scale, degree=2, C=1, total=  19.4s
[CV] kernel=linear, gamma=scale, degree=2, C=1 .......................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   19.4s remaining:    0.0s


[CV] ........ kernel=linear, gamma=scale, degree=2, C=1, total=  20.9s
[CV] kernel=linear, gamma=scale, degree=2, C=1 .......................
[CV] ........ kernel=linear, gamma=scale, degree=2, C=1, total=   3.7s
[CV] kernel=poly, gamma=scale, degree=2, C=1 .........................
[CV] .......... kernel=poly, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=poly, gamma=scale, degree=2, C=1 .........................
[CV] .......... kernel=poly, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=poly, gamma=scale, degree=2, C=1 .........................
[CV] .......... kernel=poly, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=rbf, gamma=scale, degree=2, C=1 ..........................
[CV] ........... kernel=rbf, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=rbf, gamma=scale, degree=2, C=1 ..........................
[CV] ........... kernel=rbf, gamma=scale, degree=2, C=1, total=   0.0s
[CV] kernel=rbf, gamma=scale, degree=2, C=1 ..........................
[CV] .

[CV] ....... kernel=linear, gamma=auto, degree=2, C=350, total= 2.0min
[CV] kernel=linear, gamma=auto, degree=2, C=350 ......................
[CV] ....... kernel=linear, gamma=auto, degree=2, C=350, total=  36.7s
[CV] kernel=linear, gamma=auto, degree=2, C=350 ......................
[CV] ....... kernel=linear, gamma=auto, degree=2, C=350, total=   6.0s
[CV] kernel=poly, gamma=auto, degree=2, C=350 ........................
[CV] ......... kernel=poly, gamma=auto, degree=2, C=350, total=   0.0s
[CV] kernel=poly, gamma=auto, degree=2, C=350 ........................
[CV] ......... kernel=poly, gamma=auto, degree=2, C=350, total=   0.0s
[CV] kernel=poly, gamma=auto, degree=2, C=350 ........................
[CV] ......... kernel=poly, gamma=auto, degree=2, C=350, total=   0.0s
[CV] kernel=rbf, gamma=auto, degree=2, C=350 .........................
[CV] .......... kernel=rbf, gamma=auto, degree=2, C=350, total=   0.0s
[CV] kernel=rbf, gamma=auto, degree=2, C=350 .........................
[CV] .

[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed: 11.8min finished


In [None]:
svm_random.best_params_

In [None]:
random_state = list(range(1,300))


no_band_accuracy = []
no_band_roc_auc = []

no_band_precision = []
no_band_recall = []
no_band_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(no_band_final, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(C = 1, kernel = 'linear', gamma = 'scale', degree = 2)
    clf.fit(X_train,Y_train)

    y_pred = clf.predict(X_test)
    
    
    no_band_accuracy.append(metrics.accuracy_score(Y_test, y_pred))
    no_band_precision.append(metrics.precision_score(Y_test, y_pred))
    no_band_recall.append(metrics.recall_score(Y_test, y_pred))
    no_band_f1.append(metrics.f1_score(Y_test, y_pred))
    #d2_roc_auc.append(metrics.roc_auc_score(Y_test, y_pred))

#print('Average Accuracy:', str(round(np.mean(no_band_accuracy), 2)), '+/-', str(round(statistics.stdev(no_band_accuracy), 2))) 

#print('Average ROC AUC Score:', np.mean(d1_roc_auc), '+/-', statistics.stdev(d1_roc_auc))

#print('Average Precision:', str(round(np.mean(no_band_precision), 2)), '+/-', str(round(statistics.stdev(no_band_precision), 2)))

#print('Average Recall:', str(round(np.mean(no_band_recall), 2)), '+/-', str(round(statistics.stdev(no_band_recall), 2)))

#print('Average f1 Score:', str(round(np.mean(no_band_f1), 2)), '+/-', str(round(statistics.stdev(no_band_f1), 2)))


### KEGG (Demographic + Stepcounts + Pathways) - continuous, normalized

In [None]:
train_df, test_df = train_test_split(norm_no_band_final, test_size=0.2, random_state= 42)
    
X_train = train_df.drop('Weight_loss_band', axis=1)
Y_train = train_df['Weight_loss_band']  
    
X_test  = test_df.drop('Weight_loss_band', axis=1)
Y_test = test_df['Weight_loss_band']  

In [None]:
# Penalty parameter C of the error term.

C = [int(x) for x in np.linspace(start = 1, stop = 350, num = 3)]

# Specifies the kernel type to be used in the algorithm

kernel = ['linear', 'poly', 'rbf', 'sigmoid']

# Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.

degree = [int(x) for x in np.linspace(start = 2, stop = 7, num = 1)]

# Gamma: Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.

gamma = ['scale', 'auto']

In [None]:
#Import svm model
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC()

random_grid = {'C': C,
               'kernel': kernel,
               'degree': degree,
               'gamma': gamma}

svm_random= RandomizedSearchCV(estimator = clf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, 
                               random_state=42, iid = True)

svm_random.fit(X_train, Y_train);

In [None]:
svm_random.best_params_

In [None]:
random_state = list(range(1,300))


norm_no_band_accuracy = []
norm_no_band_roc_auc = []

norm_no_band_precision = []
norm_no_band_recall = []
norm_no_band_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(norm_no_band_final, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(C = 1, kernel = 'rbf', gamma = 'scale', degree = 2)
    clf.fit(X_train,Y_train)

    y_pred = clf.predict(X_test)
    
    
    norm_no_band_accuracy.append(metrics.accuracy_score(Y_test, y_pred))
    norm_no_band_precision.append(metrics.precision_score(Y_test, y_pred))
    norm_no_band_recall.append(metrics.recall_score(Y_test, y_pred))
    norm_no_band_f1.append(metrics.f1_score(Y_test, y_pred))
    #d2_roc_auc.append(metrics.roc_auc_score(Y_test, y_pred))

#print('Average Accuracy:', str(round(np.mean(norm_no_band_accuracy), 2)), '+/-', str(round(statistics.stdev(norm_no_band_accuracy), 2))) 

#print('Average ROC AUC Score:', np.mean(d1_roc_auc), '+/-', statistics.stdev(d1_roc_auc))

#print('Average Precision:', str(round(np.mean(norm_no_band_precision), 2)), '+/-', str(round(statistics.stdev(norm_no_band_precision), 2)))

#print('Average Recall:', str(round(np.mean(norm_no_band_recall), 2)), '+/-', str(round(statistics.stdev(norm_no_band_recall), 2)))

#print('Average f1 Score:', str(round(np.mean(norm_no_band_f1), 2)), '+/-', str(round(statistics.stdev(norm_no_band_f1), 2)))


### PCA (Demographic + Stepcounts + Pathways) - discretized

band_PCA = PCA + Demographic + stepcount as discrete variables
norm_band_PCA = PCA + Demographic + stepcount, Normalized as discrete variables

stalled

### PCA (Demographic + Stepcounts + Pathways) - discretized, normalized

In [None]:
train_df, test_df = train_test_split(norm_band_PCA, test_size=0.2, random_state= 42)
    
X_train = train_df.drop('Weight_loss_band', axis=1)
Y_train = train_df['Weight_loss_band']  
    
X_test  = test_df.drop('Weight_loss_band', axis=1)
Y_test = test_df['Weight_loss_band']  

In [None]:
# Penalty parameter C of the error term.

C = [int(x) for x in np.linspace(start = 1, stop = 350, num = 3)]

# Specifies the kernel type to be used in the algorithm

kernel = ['linear', 'poly', 'rbf', 'sigmoid']

# Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.

degree = [int(x) for x in np.linspace(start = 2, stop = 7, num = 1)]

# Gamma: Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.

gamma = ['scale', 'auto']

In [None]:
#Import svm model
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC()

random_grid = {'C': C,
               'kernel': kernel,
               'degree': degree,
               'gamma': gamma}

svm_random= RandomizedSearchCV(estimator = clf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, 
                               random_state=42, iid = True)

svm_random.fit(X_train, Y_train);

In [None]:
svm_random.best_params_

In [None]:
random_state = list(range(1,300))


norm_band_PCA_accuracy = []
norm_band_PCA_roc_auc = []

norm_band_PCA_precision = []
norm_band_PCA_recall = []
norm_band_PCA_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(norm_band_PCA, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(C = 1, kernel = 'poly', gamma = 'scale', degree = 2)
    clf.fit(X_train,Y_train)

    y_pred = clf.predict(X_test)
    
    
    norm_band_PCA_accuracy.append(metrics.accuracy_score(Y_test, y_pred))
    norm_band_PCA_precision.append(metrics.precision_score(Y_test, y_pred))
    norm_band_PCA_recall.append(metrics.recall_score(Y_test, y_pred))
    norm_band_PCA_f1.append(metrics.f1_score(Y_test, y_pred))
    #d2_roc_auc.append(metrics.roc_auc_score(Y_test, y_pred))

#print('Average Accuracy:', str(round(np.mean(norm_band_PCA_accuracy), 2)), '+/-', str(round(statistics.stdev(norm_band_PCA_accuracy), 2))) 

#print('Average ROC AUC Score:', np.mean(d1_roc_auc), '+/-', statistics.stdev(d1_roc_auc))

#print('Average Precision:', str(round(np.mean(norm_band_PCA_precision), 2)), '+/-', str(round(statistics.stdev(norm_band_PCA_precision), 2)))

#print('Average Recall:', str(round(np.mean(norm_band_PCA_recall), 2)), '+/-', str(round(statistics.stdev(norm_band_PCA_recall), 2)))

#print('Average f1 Score:', str(round(np.mean(norm_band_PCA_f1), 2)), '+/-', str(round(statistics.stdev(norm_band_PCA_f1), 2)))



### PCA (Demographic + Stepcounts + Pathways) - continuous

train_df, test_df = train_test_split(no_band_PCA, test_size=0.2, random_state= 42)
    
X_train = train_df.drop('Weight_loss_band', axis=1)
Y_train = train_df['Weight_loss_band']  
    
X_test  = test_df.drop('Weight_loss_band', axis=1)
Y_test = test_df['Weight_loss_band']  

# Penalty parameter C of the error term.

C = [int(x) for x in np.linspace(start = 1, stop = 350, num = 3)]

# Specifies the kernel type to be used in the algorithm

kernel = ['linear', 'poly', 'rbf', 'sigmoid']

# Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.

degree = [int(x) for x in np.linspace(start = 2, stop = 7, num = 1)]

# Gamma: Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.

gamma = ['scale', 'auto']

#Import svm model
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC()

random_grid = {'C': C,
               'kernel': kernel,
               'degree': degree,
               'gamma': gamma}

svm_random= RandomizedSearchCV(estimator = clf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, 
                               random_state=42, iid = True)

svm_random.fit(X_train, Y_train)

svm_random.best_params_

random_state = list(range(1,300))


no_band_PCA_accuracy = []
no_band_PCA_roc_auc = []

no_band_PCA_precision = []
no_band_PCA_recall = []
no_band_PCA_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(no_band_PCA, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(C = 1, kernel = 'rbf', gamma = 'scale', degree = 2)
    clf.fit(X_train,Y_train)

    y_pred = clf.predict(X_test)
    
    
    no_band_PCA_accuracy.append(metrics.accuracy_score(Y_test, y_pred))
    no_band_PCA_precision.append(metrics.precision_score(Y_test, y_pred))
    no_band_PCA_recall.append(metrics.recall_score(Y_test, y_pred))
    no_band_PCA_f1.append(metrics.f1_score(Y_test, y_pred))
    #d2_roc_auc.append(metrics.roc_auc_score(Y_test, y_pred))

print('Average Accuracy:', str(round(np.mean(no_band_PCA_accuracy), 2)), '+/-', str(round(statistics.stdev(no_band_PCA_accuracy), 2))) 

#print('Average ROC AUC Score:', np.mean(band_PCA_roc_auc), '+/-', statistics.stdev(d1_roc_auc))

print('Average Precision:', str(round(np.mean(no_band_PCA_precision), 2)), '+/-', str(round(statistics.stdev(no_band_PCA_precision), 2)))

print('Average Recall:', str(round(np.mean(no_band_PCA_recall), 2)), '+/-', str(round(statistics.stdev(no_band_PCA_recall), 2)))

print('Average f1 Score:', str(round(np.mean(no_band_PCA_f1), 2)), '+/-', str(round(statistics.stdev(no_band_PCA_f1), 2)))


### PCA (Demographic + Stepcounts + Pathways) - continuous, normalized

In [None]:
train_df, test_df = train_test_split(norm_no_band_PCA, test_size=0.2, random_state= 42)
    
X_train = train_df.drop('Weight_loss_band', axis=1)
Y_train = train_df['Weight_loss_band']  
    
X_test  = test_df.drop('Weight_loss_band', axis=1)
Y_test = test_df['Weight_loss_band']  

In [None]:
# Penalty parameter C of the error term.

C = [int(x) for x in np.linspace(start = 1, stop = 350, num = 3)]

# Specifies the kernel type to be used in the algorithm

kernel = ['linear', 'poly', 'rbf', 'sigmoid']

# Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.

degree = [int(x) for x in np.linspace(start = 2, stop = 7, num = 1)]

# Gamma: Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.

gamma = ['scale', 'auto']

In [None]:
#Import svm model
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC()

random_grid = {'C': C,
               'kernel': kernel,
               'degree': degree,
               'gamma': gamma}

svm_random= RandomizedSearchCV(estimator = clf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, 
                               random_state=42, iid = True)

svm_random.fit(X_train, Y_train);

In [None]:
svm_random.best_params_

In [None]:
random_state = list(range(1,300))


norm_no_band_PCA_accuracy = []
norm_no_band_PCA_roc_auc = []

norm_no_band_PCA_precision = []
norm_no_band_PCA_recall = []
norm_no_band_PCA_f1 = []

for i in random_state :
    
    train_df, test_df = train_test_split(norm_no_band_PCA, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(C = 175, kernel = 'poly', gamma = 'auto', degree = 2)
    clf.fit(X_train,Y_train)

    y_pred = clf.predict(X_test)
    
    
    norm_no_band_PCA_accuracy.append(metrics.accuracy_score(Y_test, y_pred))
    norm_no_band_PCA_precision.append(metrics.precision_score(Y_test, y_pred))
    norm_no_band_PCA_recall.append(metrics.recall_score(Y_test, y_pred))
    norm_no_band_PCA_f1.append(metrics.f1_score(Y_test, y_pred))
    #d2_roc_auc.append(metrics.roc_auc_score(Y_test, y_pred))

#print('Average Accuracy:', str(round(np.mean(norm_no_band_PCA_accuracy), 2)), '+/-', str(round(statistics.stdev(norm_no_band_PCA_accuracy), 2))) 

#print('Average ROC AUC Score:', np.mean(d1_roc_auc), '+/-', statistics.stdev(d1_roc_auc))

#print('Average Precision:', str(round(np.mean(norm_no_band_PCA_precision), 2)), '+/-', str(round(statistics.stdev(norm_no_band_PCA_precision), 2)))

#print('Average Recall:', str(round(np.mean(norm_no_band_PCA_recall), 2)), '+/-', str(round(statistics.stdev(norm_no_band_PCA_recall), 2)))

#print('Average f1 Score:', str(round(np.mean(norm_no_band_PCA_f1), 2)), '+/-', str(round(statistics.stdev(norm_no_band_PCA_f1), 2)))


### Accuracy Tables

In [70]:
results_mean = [str(round(np.mean(d1_accuracy), 2)), #excludes data1, norm_no_band, no_band_PCA
                 str(round(np.mean(d2_accuracy), 2)),
                 str(round(np.mean(data2_accuracy), 2)),
                 str(round(np.mean(band_accuracy), 2)),
                 str(round(np.mean(no_band_accuracy), 2)),
                 str(round(np.mean(norm_no_band_accuracy), 2)),
                 str(round(np.mean(norm_no_band_PCA_accuracy), 2)),
                 str(round(np.mean(norm_band_PCA_accuracy), 2))]

#results_roc = [d1_roc, d2_roc,
#                data2_roc, no_band_roc,
#                band_roc, norm_band_roc,
 #               norm_no_band_PCA_roc, band_PCA_roc, norm_band_PCA_roc] ##excludes data1

results_precision_mean = [str(round(np.mean(d1_precision), 2)), 
                         str(round(np.mean(d2_precision), 2)),
                         str(round(np.mean(data2_precision), 2)),
                         str(round(np.mean(band_precision), 2)),
                         str(round(np.mean(no_band_precision), 2)),
                         str(round(np.mean(norm_no_band_precision), 2)),
                         #str(round(np.mean(norm_band_precision), 2)),
                         str(round(np.mean(norm_no_band_PCA_precision), 2)),
                         str(round(np.mean(norm_band_PCA_precision), 2))]


#results_r = [d1_r, d2_r,
#                data2_r, no_band_r,
#                band_r, norm_band_r,
#                norm_no_band_PCA_r, band_PCA_r, norm_band_PCA_r]

results_recall_mean = [str(round(np.mean(d1_recall), 2)), 
                         str(round(np.mean(d2_recall), 2)),
                         str(round(np.mean(data2_recall), 2)),
                         str(round(np.mean(band_recall), 2)),
                         str(round(np.mean(no_band_recall), 2)),
                         #str(round(np.mean(norm_no_band_recall), 2)),
                         str(round(np.mean(norm_band_recall), 2)),
                         str(round(np.mean(norm_no_band_PCA_recall), 2)),
                         str(round(np.mean(norm_band_PCA_recall), 2))]

results_f1_mean = [str(round(np.mean(d1_f1), 2)), 
                         str(round(np.mean(d2_f1), 2)),
                         str(round(np.mean(data2_f1), 2)),
                         str(round(np.mean(band_f1), 2)),
                         str(round(np.mean(no_band_f1), 2)),
                         #str(round(np.mean(norm_no_band_f1), 2)),
                         str(round(np.mean(norm_band_f1), 2)),
                         str(round(np.mean(norm_no_band_PCA_f1), 2)),
                         str(round(np.mean(norm_band_PCA_f1), 2))]



#results_f1 = [d1_f1, d2_f1,
#                data2_f1, no_band_f1,
#                band_f1, norm_band_f1,
#                norm_no_band_PCA_f1, band_PCA_f1, norm_band_PCA_f1]



#results_mean = [mean(d1_train_acc), mean(d1_test_acc),
##                 mean(d2_train_acc), mean(d2_test_acc),
#                 mean(data2_train_acc), mean(data2_test_acc),
#                 mean(no_band_train_acc), mean(no_band_test_acc),
#                 mean(band_train_acc), mean(band_test_acc),
 #                mean(norm_band_train_acc), mean(norm_band_test_acc),
#                 mean(norm_no_band_train_PCA), mean(norm_no_band_test_PCA),
#                 mean(band_train_PCA), mean(band_test_PCA),
#                 mean(norm_band_train_PCA), mean(norm_band_test_PCA)]

#results_roc_mean = [mean(d1_roc), mean(d2_roc),
#                      mean(data2_roc), mean(no_band_roc), mean(norm_no_band_roc),
#                      mean(band_roc), mean(norm_band_roc),
#                      mean(norm_no_band_PCA_roc), mean(band_PCA_roc), mean(norm_band_PCA_roc)]


#results_p_mean = [mean(d1_p), mean(d2_p),
#                    mean(data2_p), mean(no_band_p),
#                    mean(band_p), mean(norm_band_p),
#                    mean(norm_no_band_PCA_p), mean(band_PCA_p), mean(norm_band_PCA_p)]

#results_r_mean = [mean(d1_r), mean(d2_r),
#                    mean(data2_r), mean(no_band_r),
#                    mean(band_r), mean(norm_band_r),
#                    mean(norm_no_band_PCA_r), mean(band_PCA_r), mean(norm_band_PCA_r)]

#results_f1_mean = [mean(d1_f1), mean(d2_f1), 
#                    mean(data2_f1), mean(no_band_f1),
#                    mean(band_f1), mean(norm_band_f1),
#                    mean(norm_no_band_PCA_f1), mean(band_PCA_f1), mean(norm_band_PCA_f1)]



index = ['Dem: No Band',
         'Dem: Band', 
         'Dem + Step: Band',
         'Dem + Step + KEGG: No Band',
         'Dem + Step + KEGG: Band',
         'Dem + Step + KEGG: Band, Normalized',
         'Dem + Step + PCA: No Band, Normalized',
         'Dem + Step + PCA: Band, Normalized']

#roc_index = ['Dem: No Band', 
#                'Dem: Band', 
#                'Dem + Step: Band',
#                'Dem + Step + KEGG: No Band', 
#                'Dem + Step + KEGG: Band', 
#                'Dem + Step + KEGG: Band, Normalized',
#               'Dem + Step + PCA: No Band, Normalized',
#               'Dem + Step + PCA: Band', 
#               'Dem + Step + PCA: Band, Normalized']

measure = ['Accuracy', 'Precision', 'Recall', 'f1']

svm_table = pd.DataFrame([results_mean, results_precision_mean, results_recall_mean, results_f1_mean], columns = index, index = measure)
#svm_table['Average Model Performance'] = results_mean

#svm_roc_table = pd.DataFrame(results_roc, columns = random_state, index = roc_index)
#svm_roc_table['Average ROC'] = results_roc_mean

#svm_p_table = pd.DataFrame(results_p, columns = random_state, index = roc_index)
#svm_p_table['Average Precision'] = results_p_mean

#svm_r_table = pd.DataFrame(results_r, columns = random_state, index = roc_index)
#svm_r_table['Average Recall'] = results_r_mean

#svm_f1_table = pd.DataFrame(results_f1, columns = random_state, index = roc_index)
#svm_f1_table['Average f1'] = results_f1_mean

In [74]:
svm_table = svm_table.T
svm_table

Unnamed: 0,Accuracy,Precision,Recall,f1
Dem: No Band,0.68,0.53,0.33,0.38
Dem: Band,0.71,0.6,0.26,0.34
Dem + Step: Band,0.67,0.52,0.35,0.38
Dem + Step + KEGG: No Band,0.67,0.0,0.0,0.0
Dem + Step + KEGG: Band,0.67,0.0,0.0,0.0
"Dem + Step + KEGG: Band, Normalized",0.64,0.08,0.09,0.09
"Dem + Step + PCA: No Band, Normalized",0.63,0.44,0.35,0.36
"Dem + Step + PCA: Band, Normalized",0.66,0.37,0.15,0.19


In [72]:
svm_table.to_excel("svm_results.xlsx")
