In [None]:
# Date: 26/11/2023
# CSC461 – Assignment3 – Machine Learning
# FA21-BSE-114
# Umer Amir

**Question 1**

Q1: Provide responses to the following questions about the dataset.
1.	How many instances does the dataset contain?
Ans: 110 instances
2.	How many input attributes does the dataset contain?
Ans: 7 input instances. Height, weight, beard, hair length, shoe size, scarf, and eye color.
3.	How many possible values does the output attribute have?
Ans: There is only one output attribute gender that has two output values male and female.
4.	How many input attributes are categorical?
Ans: Beard, hair length, scarf, and eye color are the input attributes that are categorical.
5.	What is the dataset's class ratio (male vs female)?
Ans: 62 were males and 48 were females.

**Question 2**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('gender-prediction.csv')

# Preprocess the data
le = LabelEncoder()
df['beard'] = le.fit_transform(df['beard'])
df['hair_length'] = le.fit_transform(df['hair_length'])
df['scarf'] = le.fit_transform(df['scarf'])
df['eye_color'] = le.fit_transform(df['eye_color'])
df['gender'] = le.fit_transform(df['gender'])

# Define features (X) and target variable (y)
X = df.drop('gender', axis=1)
y = df['gender']

# Question 1: Train/test split ratio of 2/3
X_train_2_3, X_test_2_3, y_train_2_3, y_test_2_3 = train_test_split(X, y, test_size=1/3, random_state=42)

# Models
models = {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machine': SVC(),
    'Multilayer Perceptron': MLPClassifier()
}

# Results dictionary
results = {}

# Train and evaluate models
for model_name, model in models.items():
    # Cross-validation prediction for getting incorrect classifications
    y_pred_cv = cross_val_predict(model, X_train_2_3, y_train_2_3, cv=3)
    incorrect_instances = np.sum(y_pred_cv != y_train_2_3)
    results[model_name] = {'Incorrectly Classified Instances': incorrect_instances}

# Print results for Question 1
for model_name, result in results.items():
    print(f"{model_name}: {result['Incorrectly Classified Instances']} instances incorrectly classified.")

# Question 2: Rerun with 80/20 train/test split
X_train_4_5, X_test_4_5, y_train_4_5, y_test_4_5 = train_test_split(X, y, test_size=1/5, random_state=42)
results_80_20 = {}

for model_name, model in models.items():
    model.fit(X_train_4_5, y_train_4_5)
    y_pred_test = model.predict(X_test_4_5)
    incorrect_instances = np.sum(y_pred_test != y_test_4_5)
    results_80_20[model_name] = {'Incorrectly Classified Instances': incorrect_instances}

# Print results for Question 2
print("\nResults with 80/20 train/test split:")
for model_name, result in results_80_20.items():
    print(f"{model_name}: {result['Incorrectly Classified Instances']} instances incorrectly classified.")

# Question 3: Identify 2 most "powerful" attributes
# You can use feature importance, coefficients, or any other method depending on the model
# In this example, Logistic Regression is used to get feature importance
lr_model = LogisticRegression()
lr_model.fit(X_train_4_5, y_train_4_5)
feature_importance = lr_model.coef_[0]
top_attributes_indices = np.argsort(np.abs(feature_importance))[::-1][:2]
top_attributes = X.columns[top_attributes_indices]

print(f"\nTop 2 most 'powerful' attributes: {top_attributes[0]} and {top_attributes[1]}")

# Question 4: Exclude top 2 attributes and rerun with 80/20 split
X_excluded = X.drop(top_attributes, axis=1)
X_train_excluded, X_test_excluded, y_train_excluded, y_test_excluded = train_test_split(X_excluded, y, test_size=1/5, random_state=42)

results_excluded = {}

for model_name, model in models.items():
    model.fit(X_train_excluded, y_train_excluded)
    y_pred_test_excluded = model.predict(X_test_excluded)
    incorrect_instances = np.sum(y_pred_test_excluded != y_test_excluded)
    results_excluded[model_name] = {'Incorrectly Classified Instances': incorrect_instances}

# Print results for Question 4
print("\nResults after excluding top 2 attributes:")
for model_name, result in results_excluded.items():
    print(f"{model_name}: {result['Incorrectly Classified Instances']} instances incorrectly classified.")


**Question 3**

In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

# Load your dataset
# Replace 'your_dataset.csv' with the actual path to your dataset
# Assuming the dataset has columns like 'height', 'weight', 'beard', 'hair_length', 'shoe_size', 'scarf', 'eye_color', 'gender'
df = pd.read_csv('gender-prediction.csv')

# Preprocess the data
le = LabelEncoder()
df['beard'] = le.fit_transform(df['beard'])
df['hair_length'] = le.fit_transform(df['hair_length'])
df['scarf'] = le.fit_transform(df['scarf'])
df['eye_color'] = le.fit_transform(df['eye_color'])
df['gender'] = le.fit_transform(df['gender'])
# Load your gender prediction dataset
# Replace 'your_dataset.csv' with the actual path to your dataset
# Assuming the dataset has columns like 'height', 'weight', 'beard', 'hair_length', 'shoe_size', 'scarf', 'eye_color', 'gender'

# Preprocess the data if needed
# Example: Encode categorical variables, handle missing values

# Define features (X) and target variable (y)
X = df.drop('gender', axis=1)
y = df['gender']

# Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Monte Carlo Cross-Validation
monte_carlo_f1_scores = []
iterations = 100

for _ in range(iterations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    rf_classifier.fit(X_train, y_train)
    y_pred = rf_classifier.predict(X_test)
    monte_carlo_f1_scores.append(f1_score(y_test, y_pred, average='binary'))

monte_carlo_avg_f1 = np.mean(monte_carlo_f1_scores)

# Leave P-Out Cross-Validation
leave_p_out_f1_scores = []
p_out = 5  # You can adjust the number of folds as needed

fold = StratifiedKFold(n_splits=p_out, shuffle=True, random_state=42)

for train, test in fold.split(X, y):
    X_train, X_test = X.iloc[train], X.iloc[test]
    y_train, y_test = y.iloc[train], y.iloc[test]
    rf_classifier.fit(X_train, y_train)
    y_pred = rf_classifier.predict(X_test)
    leave_p_out_f1_scores.append(f1_score(y_test, y_pred, average='binary'))

leave_p_out_avg_f1 = np.mean(leave_p_out_f1_scores)

# Print F1 scores
print(f'Monte Carlo Cross-Validation F1 Score: {monte_carlo_avg_f1:.4f}')
print(f'Leave P-Out Cross-Validation F1 Score: {leave_p_out_avg_f1:.4f}')


Monte Carlo Cross-Validation F1 Score: 0.9524
Leave P-Out Cross-Validation F1 Score: 0.9759


**Question 4**

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder

# Load your dataset
# Replace 'your_dataset.csv' with the actual path to your dataset
# Assuming the dataset has columns like 'height', 'weight', 'beard', 'hair_length', 'shoe_size', 'scarf', 'eye_color', 'gender'
df = pd.read_csv('gender-prediction.csv')

# Preprocess the data
le = LabelEncoder()
df['beard'] = le.fit_transform(df['beard'])
df['hair_length'] = le.fit_transform(df['hair_length'])
df['scarf'] = le.fit_transform(df['scarf'])
df['eye_color'] = le.fit_transform(df['eye_color'])
df['gender'] = le.fit_transform(df['gender'])
# Load the gender prediction dataset
# Replace 'your_dataset.csv' with the actual path to your dataset
# Assuming the dataset has columns like 'height', 'weight', 'beard', 'hair_length', 'shoe_size', 'scarf', 'eye_color', 'gender'

# Preprocess the data if needed
# Example: Encode categorical variables, handle missing values

# Define features (X) and target variable (y)
X = df.drop('gender', axis=1)
y = df['gender']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Gaussian Naïve Bayes classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

# Load the test dataset
# Replace 'test_dataset.csv' with the actual path to your test dataset
test_df = pd.read_csv('test.csv')

# Preprocess the test data if needed
# Example: Encode categorical variables, handle missing values
test_df['beard'] = le.fit_transform(test_df['beard'])
test_df['hair_length'] = le.fit_transform(test_df['hair_length'])
test_df['scarf'] = le.fit_transform(test_df['scarf'])
test_df['eye_color'] = le.fit_transform(test_df['eye_color'])
test_df['gender'] = le.fit_transform(test_df['gender'])
# Define features (X_test) and target variable (y_test)
X_test = test_df.drop('gender', axis=1)
y_test = test_df['gender']

# Make predictions on the test set
y_pred = nb_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')

# Report the results
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')


Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
