In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the dataset from a CSV file in the current working directory
data = pd.read_csv("drug200.csv")

In [3]:
# Preview the first rows to check the column names and raw value formats
data.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [4]:
# (rows, columns) of the loaded frame
data.shape

(200, 6)

In [5]:
# Distinct values of the 'Drug' column, i.e. the classes to be predicted
set(data['Drug'])

{'drugA', 'drugB', 'drugC', 'drugX', 'drugY'}

In [6]:
# Names of the columns stored with dtype 'object' (the string-valued columns)
object_columns = data.select_dtypes(include='object').columns

In [7]:
# Show which columns were selected for encoding
object_columns

Index(['Sex', 'BP', 'Cholesterol', 'Drug'], dtype='object')

In [8]:
# Convert categorical labels to numerical using LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Keep every fitted encoder, keyed by column name. Without this the mapping
# from integer code back to the original label is lost, and predictions of the
# encoded 'Drug' target could not be turned back into drug names, e.g.
#   label_encoders['Drug'].inverse_transform(ypred)
label_encoders = {}

for col in object_columns:
    le = LabelEncoder()
    # LabelEncoder assigns codes in sorted label order, so the numeric order is
    # alphabetical rather than semantic.
    # NOTE(review): for 'BP' this yields HIGH=0, LOW=1, NORMAL=2, which is not
    # an ordinal scale — consider an explicit mapping or one-hot encoding for
    # the feature columns.
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

In [9]:
# Inspect the frame after encoding: the former string columns now hold integer codes
data

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0,0,0,25.355,4
1,47,1,1,0,13.093,2
2,47,1,1,0,10.114,2
3,28,0,2,0,7.798,3
4,61,0,1,0,18.043,4
...,...,...,...,...,...,...
195,56,0,1,0,11.567,2
196,16,1,1,0,12.006,2
197,52,1,2,0,9.894,3
198,23,1,2,1,14.020,3


In [10]:
# Separate the label column (y) from the predictor columns (X)
y = data['Drug']
X = data.drop(columns='Drug')

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# random_state pins the shuffle so the split, and every score computed from it,
# is the same on each Restart & Run All. stratify=y keeps the drug classes in
# the same proportions in the train and test partitions.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [12]:
# Sanity check: size of the training partition (rows, feature columns)
xtrain.shape

(160, 5)

In [13]:
# Sanity check: size of the held-out test partition (rows, feature columns)
xtest.shape

(40, 5)

In [14]:
from sklearn.svm import SVC

# Create an SVM model with default settings; C, gamma and kernel are chosen by
# the grid search in the following cells.
# NOTE(review): the features reach the SVM unscaled — Age and Na_to_K take
# values in the tens while the encoded columns are 0-2 — so the larger-valued
# columns will dominate the RBF kernel distance. Consider a Pipeline with
# StandardScaler in front of the SVC.
svm_model = SVC()

In [15]:
# Candidate hyperparameter values for Grid Search (3 x 3 x 1 = 9 combinations)
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1],
    'kernel': ['rbf'],
}

In [16]:
from sklearn.model_selection import GridSearchCV

# Score every combination in param_grid with 3-fold cross-validation on the
# training partition, ranking the candidates by accuracy.
grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=2,  # logs one line per fit
)
grid_search.fit(xtrain, ytrain)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END ........................C=1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ........................C=1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ........................C=1, gamma=0.01,

In [17]:
# Report the hyperparameter combination that scored best in cross-validation
print("Best Parameters:", grid_search.best_params_)

# Keep a handle on the estimator built with those parameters
best_svm_model = grid_search.best_estimator_

Best Parameters: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}


In [18]:
# Train the best model on the entire training set.
# NOTE(review): the grid search above does not override GridSearchCV's default
# refit=True, so best_estimator_ should already be fitted on xtrain/ytrain and
# this call repeats that fit — confirm and consider removing it.
best_svm_model.fit(xtrain, ytrain)

In [19]:
# Make predictions on the test set. The target was label-encoded earlier, so
# the predictions are integer class codes rather than drug names.
ypred = best_svm_model.predict(xtest)

In [20]:
from sklearn.metrics import accuracy_score

# Share of held-out samples whose predicted class equals the true class
accuracy = accuracy_score(y_true=ytest, y_pred=ypred)
print("Accuracy:", accuracy)

Accuracy: 0.875


In [21]:
import pickle

# Serialize the fitted model to disk in binary mode. The file should only ever
# be unpickled from a trusted source, since loading a pickle can execute code.
with open("svc_model.pkl", mode="wb") as model_file:
    pickle.dump(best_svm_model, model_file)