Question 1: A) The given dataset poses a multiclass classification problem. Multiclass classification is a type of supervised learning in which each instance must be assigned to exactly one of three or more classes. In this dataset, there are 42 different diseases to predict, based on 132 symptom features.

For example, if we have a dataset of images of fruits and we want to classify them into different types of fruits such as apples, oranges, and bananas, then it would be a multiclass classification problem. Similarly, if we have a dataset of emails and we want to classify them into different categories such as spam, promotional, and personal, then it would also be a multiclass classification problem.

B)

In [1]:
import pandas as pd
import numpy as np

# Read the training table and discard the unneeded 'Unnamed: 133' column
train_df = pd.read_csv('./../Disease_data/Training.csv').drop(columns=['Unnamed: 133'])

# Pull the "prognosis" label column out of the frame; train_df keeps only
# the remaining (feature) columns afterwards
prognosis = train_df.pop('prognosis')

# Column-wise sum: for columns holding 0/1 flags this is the number of rows
# in which the flag is 1
count_ones = train_df.sum(axis=0)

count_ones

itching                 678
skin_rash               786
nodal_skin_eruptions    108
continuous_sneezing     222
shivering               108
                       ... 
small_dents_in_nails    114
inflammatory_nails      114
blister                 114
red_sore_around_nose    114
yellow_crust_ooze       114
Length: 132, dtype: int64

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Fit a decision tree (seed 42) on the training features; fit() returns
# the estimator itself, so the fitted model is bound to dtc
dtc = DecisionTreeClassifier(random_state=42).fit(train_df, prognosis)

# Load the test table and split it: pop() removes 'prognosis' from test_df
# and returns it, leaving only the feature columns behind
test_df = pd.read_csv('./../Disease_data/Testing.csv')
test_prognosis = test_df.pop('prognosis')

# Predict on the test features and compare against the held-out labels
dtc_predictions = dtc.predict(test_df)
dtc_accuracy = accuracy_score(test_prognosis, dtc_predictions)
print("Decision Tree Classifier accuracy:", dtc_accuracy)

Decision Tree Classifier accuracy: 0.9761904761904762


In [3]:
from sklearn.ensemble import RandomForestClassifier

# Random forest seeded with 42; fit() returns the estimator itself
rfc = RandomForestClassifier(random_state=42).fit(train_df, prognosis)

# Predict on the test features and score against the held-out labels
rfc_predictions = rfc.predict(test_df)
rfc_accuracy = accuracy_score(test_prognosis, rfc_predictions)
print("Random Forest Classifier accuracy:", rfc_accuracy)


Random Forest Classifier accuracy: 0.9761904761904762


In [4]:
from sklearn.svm import SVC

# Support vector classifier with default settings apart from the seed
svm = SVC(random_state=42).fit(train_df, prognosis)

# Predict on the test features and score against the held-out labels
svm_predictions = svm.predict(test_df)
svm_accuracy = accuracy_score(test_prognosis, svm_predictions)
print("Support Vector Machine Classifier accuracy:", svm_accuracy)


Support Vector Machine Classifier accuracy: 1.0


C)

In [28]:
import pandas as pd

# Name of the target column in both CSV files
LABEL_COLUMN = 'prognosis'

# Read both tables; the training file's unneeded 'Unnamed: 133' column is
# dropped without mutating a frame in place, so re-running the cell is safe
df_train = pd.read_csv('./../Disease_data/Training.csv').drop(columns=['Unnamed: 133'])
df_test = pd.read_csv('./../Disease_data/Testing.csv')

# Symptom (feature) columns are every column except the label. Selecting by
# name avoids relying on the label happening to be the last column.
symptoms_train = df_train.columns.drop(LABEL_COLUMN)
symptoms_test = df_test.columns.drop(LABEL_COLUMN)

# Normalise every symptom cell to a strict 0/1 integer flag: a cell equal to
# 1 stays 1, anything else (including missing values) becomes 0. This is one
# vectorised comparison per frame instead of a Python lambda per cell.
df_train[symptoms_train] = df_train[symptoms_train].eq(1).astype(int)
df_test[symptoms_test] = df_test[symptoms_test].eq(1).astype(int)

# Store the disease label as a pandas categorical
df_train[LABEL_COLUMN] = pd.Categorical(df_train[LABEL_COLUMN])
df_test[LABEL_COLUMN] = pd.Categorical(df_test[LABEL_COLUMN])

# Split each frame into features (X) and label (y)
X_train = df_train[symptoms_train]
y_train = df_train[LABEL_COLUMN]
X_test = df_test[symptoms_test]
y_test = df_test[LABEL_COLUMN]

In [37]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the predictions (and every score derived from them)
# are reproducible across kernel restarts; 42 matches the seed used for the
# other seeded models in this notebook.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the test features (every column of df_test except the label)
y_pred_rf = model.predict(df_test.drop(columns='prognosis'))


In [36]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
# Default-configured logistic regression; fit() returns the estimator itself
model_lr = LogisticRegression().fit(X_train, y_train)

# Predict on the test features (every column of df_test except the label)
test_features_lr = df_test.drop(columns='prognosis')
y_pred_lr = model_lr.predict(test_features_lr)

In [35]:
# Support Vector Machines
from sklearn.svm import SVC
# Default-configured support vector classifier; fit() returns the estimator
model_svm = SVC().fit(X_train, y_train)

# Predict on the test features (every column of df_test except the label)
test_features_svm = df_test.drop(columns='prognosis')
y_pred_svm = model_svm.predict(test_features_svm)

In [39]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

# Report accuracy, precision, recall and F1 for each fitted model.
# Each printed line is labelled with the metric it actually reports, and the
# metric/model combinations are driven by two small tables rather than
# repeated copies of the same three statements.
y_true = df_test['prognosis']

# Display name -> predictions for the test set
predictions_by_model = {
    'Random Forest': y_pred_rf,
    'Logistic Regression': y_pred_lr,
    'Support Vector Machine': y_pred_svm,
}

# (display name, scoring function, extra keyword arguments).
# Precision, recall and F1 need an averaging strategy for multiclass
# targets; 'weighted' averages per-class scores weighted by class support.
metrics = [
    ('Accuracy', accuracy_score, {}),
    ('Precision', precision_score, {'average': 'weighted'}),
    ('Recall', recall_score, {'average': 'weighted'}),
    ('F1', f1_score, {'average': 'weighted'}),
]

for metric_name, metric_fn, metric_kwargs in metrics:
    print('{} Score'.format(metric_name))
    for model_name, model_pred in predictions_by_model.items():
        score = metric_fn(y_true, model_pred, **metric_kwargs)
        print('{} ({}): {:.2f}%'.format(metric_name, model_name, score * 100))

Accuracy (Random Forest): 97.62%
Accuracy (Logistic Regression): 100.00%
Accuracy (Support Vector Machine): 100.00%
Precision Score
Accuracy (Random Forest): 98.81%
Accuracy (Logistic Regression): 100.00%
Accuracy (Support Vector Machine): 100.00%
Recall Score
Accuracy (Random Forest): 97.62%
Accuracy (Logistic Regression): 100.00%
Accuracy (Support Vector Machine): 100.00%
F1 Score
Accuracy (Random Forest): 97.62%
Accuracy (Logistic Regression): 100.00%
Accuracy (Support Vector Machine): 100.00%


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# Micro-averaged one-vs-rest ROC curve for the random forest stored in
# `model`. roc_curve only accepts binary targets, so the multiclass labels
# are one-hot encoded (one column per class, in the same order as the
# columns of predict_proba) and then flattened together with the predicted
# class probabilities into a single binary problem.
classes = model.classes_
n_classes = len(classes)
y_test_binary = label_binarize(np.asarray(y_test), classes=classes)
y_pred_proba = model.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test_binary.ravel(), y_pred_proba.ravel())
roc_auc = auc(fpr, tpr)

# Plot the ROC curve together with the chance-level diagonal
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=2,
        label='Random Forest, micro-average (AUC = {:.2f})'.format(roc_auc))
ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Chance')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()