<a href="https://colab.research.google.com/github/zurii-07/CM2604-ML-Coursework/blob/Develop/ML_CW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Loading & Exploring the dataset.

In [30]:
from google.colab import drive

# Attach Google Drive to the Colab runtime; the dataset paths used later in this
# notebook all live under this mount point.
mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import pandas as pd
import numpy as np

# Load the bank-additional-full.csv dataset from the mounted Drive.
# The file is semicolon-delimited, hence sep=';'.
file_path = '/content/drive/MyDrive/ML CW /bank+marketing/bank-additional/bank-additional/bank-additional-full.csv'
data = pd.read_csv(file_path, sep=';')

# Quick exploration: first rows, column dtypes / non-null counts, summary statistics.
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it must
# not be wrapped in print() -- that would append a stray "None" line to the output.
data.info()
print(data.describe())

   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         1    999         0  nonexistent          1.1   

   cons.price.idx  cons.conf.idx  euribor3m  nr.employed

2. Data Preprocessing

In [9]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#Data Preprocessing

# 01. Checking for missing values, which this dataset labels as 'unknown',
#     in the categorical features.
categoricalFeatures = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']

# Per-column count of rows whose value is the literal string 'unknown'.
unknownCount = {col: (data[col] == 'unknown').sum() for col in categoricalFeatures}
print("01.Number of 'Unknown' values in categorical features: ")
print(unknownCount)
print("\n")


# 1.1 Replacing 'unknown' with the most frequent *known* category of each column.
# The mode is taken over the known values only: if 'unknown' itself were the most
# frequent value, a mode over the whole column would make the replacement a no-op.
for col in categoricalFeatures:
    knownModes = data.loc[data[col] != 'unknown', col].mode()
    if knownModes.empty:
        # No known value to impute from; leave the column untouched.
        continue
    mostFrequentCategory = knownModes[0]
    # NOTE: this overwrites `data` in place, so re-running the cell without
    # re-loading the CSV will report zero 'unknown' values above.
    data[col] = data[col].replace('unknown', mostFrequentCategory)

print("1.1.After replacing 'Unknown' values with the most frequent category: ")
print("\n")
print(data.head(10))
print("\n")

# 1.2 Getting counts for each category in categorical features (after imputation).
print("1.2.Counts for each category in categorical features: ")
print("\n")
for col in categoricalFeatures:
    print(f"Counts for the feature, '{col}':")
    print(data[col].value_counts())
    print("\n")

# 02. Feature Engineering / Encoding Categorical Features.

# Label-encode the target variable 'y' into integer codes. LabelEncoder assigns
# codes in sorted class order. This overwrites data['y'] in place.
labelEncoder = LabelEncoder()
data['y'] = labelEncoder.fit_transform(data['y'])

# One-hot encode the categorical features. drop_first=True drops the first level
# of every feature, so each feature with k categories yields k-1 indicator columns.
encodedData = pd.get_dummies(data, columns=categoricalFeatures, drop_first=True)

# Display the processed data. The message now matches the head(10) call below
# (it previously claimed 5 rows).
print("\n 02.Processed Data with Encoded Categorical Features(with First 10 rows):")
print("\n")
print(encodedData.head(10))
print("\n")


# 3. Data Normalization / Scaling

# Numerical columns to rescale; the one-hot indicator columns and 'y' are left as-is.
# NOTE(review): the dataset documentation reportedly advises excluding 'duration'
# from realistic predictive models (it is only known once the call has ended) --
# confirm whether it should stay in this list.
numericalFeatures = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

# MinMaxScaler rescales each column to the [0, 1] range (its default feature_range).
scaler = MinMaxScaler()

# Scale the numerical features.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/validation/test split performed later in the notebook -- consider fitting
# it on the training partition only.
# The original index is passed explicitly: the assignment below aligns on index
# labels, so a frame with a fresh 0..n-1 index would inject NaN if encodedData
# ever carried a non-default index (e.g. after filtering rows).
scaledNumericalData = pd.DataFrame(
    scaler.fit_transform(encodedData[numericalFeatures]),
    columns=numericalFeatures,
    index=encodedData.index,
)

# Replace the original numerical columns with their scaled versions.
encodedData[numericalFeatures] = scaledNumericalData

# Display the scaled data. The message now matches the head(10) call below
# (it previously claimed 5 rows).
print("\n 03.Scaled Data (with First 10 rows):")
print("\n")
print(encodedData.head(10))
print("\n")


# 04. Handling Class Imbalances using SMOTE.

from imblearn.over_sampling import SMOTE

# Split the encoded frame into the target column and the feature matrix.
y = encodedData['y']
X = encodedData.drop(columns='y')

# Oversample with SMOTE; the fixed random_state keeps the synthetic rows reproducible.
# NOTE(review): resampling is applied here to the full dataset, before the
# train/validation/test split in section 4. If that split is taken from
# balancedData, synthetic samples can end up in the validation/test sets --
# consider resampling the training partition only.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Reassemble the resampled features and target into one balanced DataFrame,
# with 'y' appended as the last column.
balancedData = pd.DataFrame(X_resampled, columns=X.columns).assign(y=y_resampled)

# Report the class distribution after resampling.
print("\n 04.Balanced Data after applying SMOTE (Class Distriution):")
print("\n")
classCounts = balancedData['y'].value_counts()
print(classCounts)
print("\n")


01.Number of 'Unknown' values in categorical features: 
{'job': 330, 'marital': 80, 'education': 1731, 'default': 8597, 'housing': 990, 'loan': 990, 'contact': 0, 'month': 0, 'day_of_week': 0, 'poutcome': 0}


1.1.After replacing 'Unknown' values with the most frequent category: 


   age          job  marital            education default housing loan  \
0   56    housemaid  married             basic.4y      no      no   no   
1   57     services  married          high.school      no      no   no   
2   37     services  married          high.school      no     yes   no   
3   40       admin.  married             basic.6y      no      no   no   
4   56     services  married          high.school      no      no  yes   
5   45     services  married             basic.9y      no      no   no   
6   59       admin.  married  professional.course      no      no   no   
7   41  blue-collar  married    university.degree      no      no   no   
8   24   technician   single  professional.course  




 04.Balanced Data after applying SMOTE (Class Distriution):


y
0    36548
1    36548
Name: count, dtype: int64




3. Saving the processed dataset as a new dataset.

In [10]:
# Persist the preprocessed, SMOTE-balanced frame to Drive as a CSV so later
# stages can reload it; index=False keeps the row index out of the file.

newFilePath = '/content/drive/MyDrive/ML CW /bank-additional-full-cleaned.csv'
balancedData.to_csv(path_or_buf=newFilePath, index=False)
print(f"Processed and balanced dataset saved to {newFilePath}")

Processed and balanced dataset saved to /content/drive/MyDrive/ML CW /bank-additional-full-cleaned.csv


4. Splitting the Dataset

In [None]:
# 04. Splitting the dataset into training(60%), validation(20%) and testing(20%) sets.

from sklearn.model_selection import train_test_split
