### Part A

In [None]:
# HERE YOU WILL WRITE CODE TO TEST A NUMBER OF PREDICTORS
# AND FINALLY CHOOSE AND TRAIN THE PREDICTOR THAT YOU WILL BE USING FOR PART B

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import numpy as np



In [2]:
# Colab-only helper used to expose Google Drive inside the runtime
from google.colab import drive

# Location of the training CSV once the drive is mounted
path = '/content/drive/MyDrive/machine_learning/CE802_P2_Data.csv'

# Attach Google Drive to the Colab workspace
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the training CSV (located at `path` on the mounted drive) into a DataFrame
df = pd.read_csv(path) 

In [15]:
# Separate the target column from the features, then hold out 20% of the rows
# as a test split (the seed is fixed so the split is repeatable)
y = df['Class']
X = df.drop(columns=['Class'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Replace missing data with the KNNImputer class.
# The imputer is fitted on the training rows only: fitting it on all of X would
# let the held-out rows influence the values filled into the training set.
# The rows are taken from X (never modified here), so re-running this cell
# always fits on the same raw training data.
imputer = KNNImputer(n_neighbors=5)
imputer.fit(X.loc[X_train.index])

# Impute the full feature matrix with the training-fitted imputer, keeping X's
# index so the rows stay aligned with df, y and the existing split
X_imputed = imputer.transform(X)
X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)

# Check if there are any missing values remaining in the dataset
print(X_imputed_df.isnull().sum())

# Rebuild the train/test features from the imputed frame so the model cells
# below are trained and evaluated on imputed data (previously the imputed values
# only reached df, while X_train / X_test kept their missing entries)
X_train = X_imputed_df.loc[X_train.index]
X_test = X_imputed_df.loc[X_test.index]

# Replace the F20 column in the original DataFrame with the imputed values
df['F20'] = X_imputed_df['F20']

F1     0
F2     0
F3     0
F4     0
F5     0
F6     0
F7     0
F8     0
F9     0
F10    0
F11    0
F12    0
F13    0
F14    0
F15    0
F16    0
F17    0
F18    0
F19    0
F20    0
dtype: int64


In [None]:
# Count the missing entries in every column of df and show the tally
missing_values = df.isna().sum()
print(missing_values)

In [None]:
# Print a summary of df: column names, non-null counts, dtypes and memory usage
df.info()

In [None]:
# Show the first ten rows to confirm the missing values were filled in
print(df.iloc[:10])

In [None]:
# Report the dimensions of each piece of the train/test split
for caption, split in (
    ("Shape of X_train:", X_train),
    ("Shape of y_train:", y_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_test:", y_test),
):
    print(caption, split.shape)

In [None]:
# Create a decision tree classifier with default hyperparameters.
# random_state is pinned because scikit-learn's tree builder permutes the
# features at each split, so an unseeded tree can differ from run to run; this
# fitted tree is also the one Part B uses to produce the final predictions.
dt_classifier = DecisionTreeClassifier(random_state=42)

# Fit the decision tree classifier to the training data
dt_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = dt_classifier.predict(X_test)

# Evaluate the performance of the decision tree classifier on the held-out split
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1)

In [None]:
# Support-vector classifier with scikit-learn's default settings
svc = SVC()
svc.fit(X_train, y_train)

# Predict the held-out split
y_pred = svc.predict(X_test)

# Score the predictions with each metric, in the order they are reported
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1)

In [None]:
# Logistic regression, allowing up to 1000 solver iterations
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict the held-out split
y_pred = model.predict(X_test)

# Score the predictions with each metric, in the order they are reported
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1)

In [None]:
# k-nearest-neighbours classifier that votes over the 10 closest training rows
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train, y_train)

# Predict the held-out split
y_pred = model.predict(X_test)

# Score the predictions with each metric, in the order they are reported
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1)

In [None]:
# Gaussian Naive Bayes with default settings
model = GaussianNB()
model.fit(X_train, y_train)

# Predict the held-out split
y_pred = model.predict(X_test)

# Score the predictions with each metric, in the order they are reported
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1)

In [None]:
# Plot a grouped bar chart to visualize the results

# Define the data: hard-coded scores, one entry per model in `models`
# (presumably copied from a run of the model cells above - re-check after re-running them)
models = ['Decision Tree', 'SVC', 'Logistic Regression', 'KNN', 'Gaussian Naive Bayes']
accuracy_scores = [0.835, 0.645, 0.72, 0.665, 0.55]
precision_scores = [0.8478260869565217, 0.5902777777777778, 0.7157894736842105, 0.6704545454545454, 0.52]
recall_scores = [0.8041237113402062, 0.8762886597938144, 0.7010309278350515, 0.6082474226804123, 0.9381443298969072]
# Deliberately NOT named `f1_score`: that would overwrite the sklearn.metrics
# function imported at the top and make every model cell fail when re-run
f1_scores = [0.8253968253968254, 0.7053941908713695, 0.7083333333333334, 0.6378378378378377, 0.6691176470588236]

# Set the width of the bars
barWidth = 0.2

# One (legend label, values, colour) entry per metric, drawn left to right within each group
metric_series = [
    ('Accuracy', accuracy_scores, 'blue'),
    ('Precision', precision_scores, 'red'),
    ('Recall', recall_scores, 'green'),
    ('F1 Score', f1_scores, 'orange'),
]

# x position of the first bar in each model's group
group_positions = np.arange(len(models))

# Create the bars, shifting each metric one bar width to the right of the previous one
for offset, (label, values, colour) in enumerate(metric_series):
    plt.bar(group_positions + offset * barWidth, values, color=colour, width=barWidth, label=label)

# Set the x-axis labels, centring each tick under its group of bars
group_centres = group_positions + barWidth * (len(metric_series) - 1) / 2
plt.xticks(group_centres, models, rotation=45, ha='right')

# Set the y-axis label and title
plt.ylabel('Scores')
plt.title('Evaluation Metrics of Machine Learning Models')

# Add a legend
plt.legend()

# Display the bar chart
plt.show()


### Part B

In [27]:
# HERE YOU WILL USE THIS TEMPLATE TO SAVE THE PREDICTIONS ON THE TEST SET


# Load the test data
test_file_path = '/content/drive/MyDrive/machine_learning/CE802_P2_Test.csv'
# Predictions are written to a separate file: saving over the input would destroy
# the raw test data and make the final check compare a file with itself
predictions_file_path = '/content/drive/MyDrive/machine_learning/CE802_P2_Test_Predictions.csv'
test_df = pd.read_csv(test_file_path)

# Make sure you work on a copy
test_data = test_df.iloc[:,:-1].copy()

# Fill missing feature values on the copy only, so the feature columns that get
# saved stay exactly as they were loaded.
# The imputer is fitted on the training features (the data the classifier was
# trained on) and uses every feature column to find neighbours; fitting it on
# the test file's F20 column alone leaves nothing to measure distances with.
# NOTE(review): assumes the test file has the same feature columns, in the same
# order, as X_train - confirm against the CSV.
imputer = KNNImputer(n_neighbors=5).fit(X_train)
test_data = pd.DataFrame(
    imputer.transform(test_data),
    columns=test_data.columns,
    index=test_data.index,
)

predicted = dt_classifier.predict(test_data)

# Replace the last (empty) column with your prediction.
# Assigning the whole column (rather than .iloc[:, -1] = ...) lets it take the
# dtype of the predictions instead of being cast into the empty column's dtype.
test_df[test_df.columns[-1]] = predicted

# Save to the destination file
test_df.to_csv(predictions_file_path, index=False, float_format='%.8g')

# IMPORTANT!! Make sure only the last column has changed
assert pd.read_csv(test_file_path).iloc[:,:-1].equals(pd.read_csv(predictions_file_path).iloc[:,:-1])