In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('/content/drive/MyDrive/Redback_B/cardio_data_processed.csv')

In [4]:
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi,bp_category,bp_category_encoded
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50,21.96712,Hypertension Stage 1,Hypertension Stage 1
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55,34.927679,Hypertension Stage 2,Hypertension Stage 2
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,51,23.507805,Hypertension Stage 1,Hypertension Stage 1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48,28.710479,Hypertension Stage 2,Hypertension Stage 2
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,47,23.011177,Normal,Normal


In [5]:
df = df.drop(["id", "age", "bp_category", "smoke", "alco", "height", "weight"], axis=1)
# SVM is slow:
# Drop id: not meaningful
# Drop age in days, b/c there is age_years
# Drop bp_category, b/c bp_category_encoded
# Drop "smoke", "alco": no much difference in cardiovascular diseases in LR
# Drop "height", "weight": b/c bmi

In [7]:
# Convert categorical variable to numeric variable
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['bp_category_encoded'] = label_encoder.fit_transform(df['bp_category_encoded'])

In [8]:
df.head()

Unnamed: 0,gender,ap_hi,ap_lo,cholesterol,gluc,active,cardio,age_years,bmi,bp_category_encoded
0,2,110,80,1,1,1,0,50,21.96712,1
1,1,140,90,3,1,1,1,55,34.927679,2
2,1,130,70,3,1,0,1,51,23.507805,1
3,2,150,100,1,1,1,1,48,28.710479,2
4,1,100,60,1,1,0,0,47,23.011177,3


In [9]:
import time

# Start the timer
start_time = time.time()

In [10]:
# Import the Support Vector Machine
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix

# Define X and y
y = df.cardio
X = df.drop('cardio', axis=1)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=456)

In [11]:
from sklearn.preprocessing import StandardScaler

# List of numeric columns
numeric_columns = ['age_years', 'bmi', 'ap_hi', 'ap_lo']

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler on the training set and transform both the training and test sets
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

In [12]:
# Create a SVM classifier
svm = SVC(kernel='linear', C=1.0)
scores = cross_val_score(svm, X_train, y_train, cv=5)

# Evaluate the performance of the model using cross-validation
print('Cross-validation scores: {}'.format(scores))
print('Average cross-validation score: {:.2f}'.format(scores.mean()))

Cross-validation scores: [0.72101791 0.72667295 0.71682899 0.71920821 0.72360704]
Average cross-validation score: 0.72


In [None]:
# Use grid_search to find the optimal C
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for the SVM model
param_grid = {'C': [0.1, 1, 5]}

# Define the SVM model with a linear kernel
svm = SVC(kernel='linear')

# Perform grid search cross-validation to find the optimal value of C
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=3)
grid_search.fit(X_train, y_train)

# Print the results
print("Best C:", grid_search.best_params_['C'])
print("Best score:", grid_search.best_score_)

Best C: 5
Best score: 0.7255304466450095


In [None]:
# Fit the classifier to the training data, using the best C value
svm = SVC(kernel='linear', C=0.1)
svm.fit(X_train, y_train)

# Create the predicted tags: pred
pred = svm.predict(X_test)

# Print accuracy score and confusion matrix on test set
print('Accuracy on the test set: ', accuracy_score(y_test, pred))
print(confusion_matrix(y_test, pred)/len(y_test))

Accuracy on the test set:  0.7234385690548334
[[0.43167823 0.07462614]
 [0.20193529 0.29176034]]


In [None]:
# Calculate the elapsed time
elapsed_time = time.time() - start_time

# Print the elapsed time
print("Elapsed time: {:.2f} seconds".format(elapsed_time))

Elapsed time: 4453.37 seconds
