In [None]:
# November 25, 2023
# CSC461 – Assignment3 – Machine Learning
# Zohaib Murtaza
# CIIT/FA21-BSE-138/LHR
# Using the provided gender prediction dateset, using required learning models to find the accuracy of learning models with variation in the test and train split and excluding some significant attributes from the training data.

In [None]:
#install scikitplot
!pip install scikit-plot

Collecting scikit-plot
  Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Installing collected packages: scikit-plot
Successfully installed scikit-plot-0.3.7


In [None]:
#import libraries
from sklearn import preprocessing
import pandas as pd

#import different ML classifiers
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

#import ML evaluation metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics, model_selection

In [None]:
#import library for accesssing Google Drive from Google Colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
gender_data = pd.read_csv('/content/drive/My Drive/DS-Datasets/gender-prediction.csv')

#With added rows for question 4
#gender_data = pd.read_csv('/content/drive/My Drive/DS-Datasets/gender-prediction_10-added.csv')

In [None]:
#Extracting input and outuput columns from csv

#For question 1, only dropping gender
input_attr = gender_data.drop('gender', axis=1)

#For question 2, dropping hair_length, shoe size and gender.
#input_attr = gender_data.drop(['hair_length', 'shoe_size', 'gender'], axis=1)

output_attr = gender_data['gender']

In [None]:
#Encoding Data
label_encoder = preprocessing.LabelEncoder()
input_encoded = input_attr.apply(label_encoder.fit_transform)
output_encoded = label_encoder.fit_transform(output_attr)

In [None]:
#Making train/test split
input_train, input_test, output_train, output_test = train_test_split(input_encoded, output_encoded, test_size = 0.33, random_state = 2)

#For question 2, changing split to 80/20
#input_train, input_test, output_train, output_test = train_test_split(input_encoded, output_encoded, test_size = 0.20, random_state = 2)

In [None]:
#Using Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(input_train,output_train)
prediction = lr_model.predict(input_test)
model_accuracy = accuracy_score(output_test, prediction)*100
print("LR Accuracy: ", model_accuracy)
incorrect_lr = (output_test != prediction).sum()
print("Incorrectly Classified Instances (LR): ", incorrect_lr)

LR Accuracy:  92.5
Incorrectly Classified Instances (LR):  3


In [None]:
# Using Support Vector Machines (SVM)
svm_model = SVC()
svm_model.fit(input_train, output_train)
svm_prediction = svm_model.predict(input_test)
svm_accuracy = accuracy_score(output_test, svm_prediction) * 100
print("SVM Accuracy: ", svm_accuracy)
incorrect_svm = (output_test != svm_prediction).sum()
print("Incorrectly Classified Instances (SVM): ", incorrect_svm)

SVM Accuracy:  87.5
Incorrectly Classified Instances (SVM):  5


In [None]:
# Using Multilayer Perceptron (MLP)
mlp_model = MLPClassifier(max_iter=1000, learning_rate_init=0.001)
mlp_model.fit(input_train, output_train)
mlp_prediction = mlp_model.predict(input_test)
mlp_accuracy = accuracy_score(output_test, mlp_prediction) * 100
print("MLP Accuracy: ", mlp_accuracy)
incorrect_mlp = (output_test != mlp_prediction).sum()
print("Incorrectly Classified Instances (MLP): ", incorrect_mlp)

MLP Accuracy:  92.5
Incorrectly Classified Instances (MLP):  3


In [None]:
# Using Gaussian Naive Bayes for Question 4
gaussian_nb_model = GaussianNB()
gaussian_nb_model.fit(input_train, output_train)
gaussian_nb_prediction = gaussian_nb_model.predict(input_test)
gaussian_nb_accuracy = accuracy_score(output_test, gaussian_nb_prediction) * 100

#Calculate accuracy
print("Gaussian Naive Bayes Accuracy: ", gaussian_nb_accuracy)

# Calculate precision
gaussian_nb_precision = precision_score(output_test, gaussian_nb_prediction, average='weighted')
print("Gaussian Naive Bayes Precision: ", gaussian_nb_precision)

# Calculate recall
gaussian_nb_recall = recall_score(output_test, gaussian_nb_prediction, average='weighted')
print("Gaussian Naive Bayes Recall: ", gaussian_nb_recall)

Gaussian Naive Bayes Accuracy:  97.5
Gaussian Naive Bayes Precision:  0.9761363636363637
Gaussian Naive Bayes Recall:  0.975


In [None]:
# Using Random Forest with Monte Carlo cross-validation
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
monte_carlo_f1_scores = cross_val_score(rf_model, input_encoded, output_encoded, cv=5, scoring='f1_weighted')
print("Random Forest (Monte Carlo) F1 Scores:", monte_carlo_f1_scores)

Random Forest (Monte Carlo) F1 Scores: [1.         0.95405031 0.95463956 0.95425837 1.        ]


In [None]:
# Using Random Forest with Leave P-Out cross-validation
p_leave_out = 5
leave_p_out_cv = model_selection.LeavePOut(p=p_leave_out)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
leave_p_out_f1_scores = cross_val_score(rf_model, input_encoded, output_encoded, cv=leave_p_out_cv, scoring='f1_weighted')

In [None]:
# Calculate mean and standard deviation of F1 scores
mean_f1_score = leave_p_out_f1_scores.mean()
std_f1_score = leave_p_out_f1_scores.std()
# Print the results
print(f"{p_leave_out}-Leave-Out Cross-Validation Mean F1 Score: {mean_f1_score:.4f}")
print(f"{p_leave_out}-Leave-Out Cross-Validation Standard Deviation F1 Score: {std_f1_score:.4f}")