# Practical 6

### 1. Load the Dataset

In [1]:
import pandas as pd

df = pd.read_csv('./Datasets/drug200.csv')
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


### 2. Cleaning

In [2]:
df.isna().sum()

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64

### 3. Encoding

In [3]:
from sklearn.preprocessing import LabelEncoder

# encoding categorical data
# Initialize the LabelEncoder
le = LabelEncoder()

# Encode categorical columns
# One-hot encode the 'Sex' column
df = pd.get_dummies(df, columns=['Sex'], drop_first=True)
df['BP'] = le.fit_transform(df['BP'])
df['Cholesterol'] = le.fit_transform(df['Cholesterol'])
df['Drug'] = le.fit_transform(df['Drug'])

df.head()

Unnamed: 0,Age,BP,Cholesterol,Na_to_K,Drug,Sex_M
0,23,0,0,25.355,4,False
1,47,1,0,13.093,2,True
2,47,1,0,10.114,2,True
3,28,2,0,7.798,3,False
4,61,1,0,18.043,4,False


### 4. Standardization

In [4]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Select the numerical columns
numerical_cols = ['Age', 'Na_to_K']

# Apply standardization only to numerical columns
data_standardized = df.copy()
data_standardized[numerical_cols] = scaler.fit_transform(data_standardized[numerical_cols])

# Display the first few rows of the standardized data
print(data_standardized.head())

        Age  BP  Cholesterol   Na_to_K  Drug  Sex_M
0 -1.291591   0            0  1.286522     4  False
1  0.162699   1            0 -0.415145     2   True
2  0.162699   1            0 -0.828558     2   True
3 -0.988614   2            0 -1.149963     3  False
4  1.011034   1            0  0.271794     4  False


### 5. Train - Test Split

In [5]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = df.drop('Drug', axis=1)
y = df['Drug']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 6. Training the Data (Applying Models)

#### (i) KNN

In [6]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier

# Initialize the KNN Classifier model
model_knn = KNeighborsClassifier(n_neighbors=5)

# Train the model on the training data
model_knn.fit(X_train, y_train)

# Make predictions on the test data
knn_pred = model_knn.predict(X_test)

# Evaluate the model
knn_report = classification_report(y_test, knn_pred)

# Accuracy
knn_accuracy = accuracy_score(y_test, knn_pred)


print("KNN Classification Report:\n", knn_report)
print(f"KNN Accuracy: {knn_accuracy * 100:.2f}%")

KNN Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.50      0.55         6
           1       0.33      0.67      0.44         3
           2       1.00      0.20      0.33         5
           3       0.54      0.64      0.58        11
           4       1.00      1.00      1.00        15

    accuracy                           0.70        40
   macro avg       0.69      0.60      0.58        40
weighted avg       0.76      0.70      0.69        40

KNN Accuracy: 70.00%


#### (ii) Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree Classifier
decision_tree = DecisionTreeClassifier(random_state=42)

# Train the model on the training data
decision_tree.fit(X_train, y_train)

# Make predictions on the test data
dt_preds = decision_tree.predict(X_test)

# Evaluate the model
dt_report = classification_report(y_test, dt_preds)

# Accuracy
dt_accuracy = accuracy_score(y_test, dt_preds)

print("Decision Tree Classification Report:\n", dt_report)
print(f"Decision Tree Accuracy: {dt_accuracy * 100:.2f}%")

Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00         3
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00        11
           4       1.00      1.00      1.00        15

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40

Decision Tree Accuracy: 100.00%


#### (iii) SVM

In [8]:
from sklearn.svm import SVC

# Initialize the Support Vector Classifier
svm = SVC(random_state=42)

# Train the model on the training data
svm.fit(X_train, y_train)

# Make predictions on the test data
svm_preds = svm.predict(X_test)

# Evaluate the model
svm_report = classification_report(y_test, svm_preds, zero_division=1)

# Accuracy
svm_accuracy = accuracy_score(y_test, svm_preds)

print("SVM Classification Report:\n", svm_report)
print(f"SVM Accuracy: {svm_accuracy * 100:.2f}%")

SVM Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.00      0.00         6
           1       1.00      0.00      0.00         3
           2       1.00      0.00      0.00         5
           3       0.43      0.91      0.59        11
           4       0.88      1.00      0.94        15

    accuracy                           0.62        40
   macro avg       0.86      0.38      0.31        40
weighted avg       0.80      0.62      0.51        40

SVM Accuracy: 62.50%


#### (iv) Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# Scale the training and test data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the Logistic Regression model
logistic_regression = LogisticRegression(max_iter=500, random_state=42)

# Train the model on the scaled training data
logistic_regression.fit(X_train_scaled, y_train)

# Make predictions on the scaled test data
lr_preds = logistic_regression.predict(X_test_scaled)

# Evaluate the model with zero_division parameter
lr_report = classification_report(y_test, lr_preds, zero_division=1)

# Accuracy
lr_accuracy = accuracy_score(y_test, lr_preds)

print("Logistic Regression Classification Report:\n", lr_report)
print(f"Logistic Regression Accuracy: {lr_accuracy * 100:.2f}%")

Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.86      1.00      0.92         6
           1       1.00      1.00      1.00         3
           2       1.00      0.80      0.89         5
           3       1.00      0.91      0.95        11
           4       0.88      0.93      0.90        15

    accuracy                           0.93        40
   macro avg       0.95      0.93      0.93        40
weighted avg       0.93      0.93      0.93        40

Logistic Regression Accuracy: 92.50%


## The best suitable model for the given dataset is Logistic Regression with the highest accuracy of 92.50%