<a href="https://colab.research.google.com/github/vimesh630/ML_CW/blob/main/bank_full_preprocessed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount Google Drive and Load Dataset

In [None]:
# Mount Google Drive so files stored there are reachable from the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Import necessary libraries
import pandas as pd

# Load the dataset; the file is semicolon-delimited, hence the explicit `sep`
file_path = "/content/drive/MyDrive/ML Coursework/bank+marketing/bank/bank-full.csv"
data = pd.read_csv(file_path, sep=';')

# Display basic information
print("Dataset Overview:")
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset Overview:
   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198       

# Handle Missing Values

In [None]:
# Check for missing values stored as NaN
print("Missing Values per Column:")
print(data.isnull().sum())

# A NaN count alone can be misleading here: the dataset preview shows the
# literal string 'unknown' in columns such as job, education, contact and
# poutcome. Those placeholders are not NaN, so they are counted separately.
print("'unknown' Placeholders per Column:")
print(data.isin(['unknown']).sum())

Missing Values per Column:
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


# Encode Categorical Features

In [None]:
from sklearn.preprocessing import LabelEncoder

# Identify categorical columns (those stored with object dtype)
categorical_columns = data.select_dtypes(include=['object']).columns
print("Categorical Columns:", categorical_columns)

# Convert target variable `y` to binary.
# Series.map() turns every value that is absent from the mapping into NaN, so
# applying it to an already-converted column (e.g. when this cell is re-run)
# would silently wipe the target. Map only while `y` is still non-numeric and
# fail loudly if a label other than 'yes'/'no' is encountered.
if not pd.api.types.is_numeric_dtype(data['y']):
    y_binary = data['y'].map({'yes': 1, 'no': 0})
    if y_binary.isnull().any():
        unexpected = sorted(data.loc[y_binary.isnull(), 'y'].astype(str).unique())
        raise ValueError(f"Unexpected values in target column 'y': {unexpected}")
    data['y'] = y_binary

# Encode other categorical features as integer codes, keeping each fitted
# encoder so the codes can be mapped back to the original labels later.
# NOTE: this dictionary is rebuilt on every run; re-running the cell after the
# columns have been encoded leaves it empty.
label_encoders = {}
for col in categorical_columns:
    if col != 'y':  # Skip the target column
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le

print("Dataset After Encoding:")
print(data.head())

Categorical Columns: Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome', 'y'],
      dtype='object')
Dataset After Encoding:
   age  job  marital  education  default  balance  housing  loan  contact  \
0   58    4        1          2        0     2143        1     0        2   
1   44    9        2          1        0       29        1     0        2   
2   33    2        1          1        0        2        1     1        2   
3   47    1        1          3        0     1506        1     0        2   
4   33   11        2          3        0        1        0     0        2   

   day  month  duration  campaign  pdays  previous  poutcome  y  
0    5      8       261         1     -1         0         3  0  
1    5      8       151         1     -1         0         3  0  
2    5      8        76         1     -1         0         3  0  
3    5      8        92         1     -1         0         3  0  
4    5      8       198    

# Feature Engineering

In [None]:
# No columns are dropped at this stage.

# Create a new feature: bucket `age` into labelled groups.
# pd.cut() intervals are right-closed by default -- (18, 30], (30, 50], ... --
# so without include_lowest=True an age of exactly 18 would match no bin and
# become NaN. include_lowest=True makes the first interval cover 18 as well.
data['age_group'] = pd.cut(
    data['age'],
    bins=[18, 30, 50, 70, 100],
    labels=['18-30', '30-50', '50-70', '70+'],
    include_lowest=True,
)
print("Dataset with New Features:")
print(data.head())

Dataset with New Features:
   age  job  marital  education  default  balance  housing  loan  contact  \
0   58    4        1          2        0     2143        1     0        2   
1   44    9        2          1        0       29        1     0        2   
2   33    2        1          1        0        2        1     1        2   
3   47    1        1          3        0     1506        1     0        2   
4   33   11        2          3        0        1        0     0        2   

   day  month  duration  campaign  pdays  previous  poutcome  y age_group  
0    5      8       261         1     -1         0         3  0     50-70  
1    5      8       151         1     -1         0         3  0     30-50  
2    5      8        76         1     -1         0         3  0     30-50  
3    5      8        92         1     -1         0         3  0     30-50  
4    5      8       198         1     -1         0         3  0     30-50  


# Split Dataset into Training and Testing Sets

In [None]:
from sklearn.model_selection import train_test_split

# Separate features and target variable
X = data.drop('y', axis=1)
y = data['y']

# Perform an 80/20 train-test split with a fixed seed for reproducibility.
# stratify=y keeps the proportion of each target class the same in the
# training and test sets, so evaluation is not skewed by a lucky/unlucky draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Shapes of Training and Test Sets:")
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Shapes of Training and Test Sets:
(36168, 17) (9043, 17) (36168,) (9043,)


# Scale Features

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns eligible for standardisation: everything stored as float64 or int64
numeric_columns = X_train.select_dtypes(include=['float64', 'int64']).columns
print("Numeric Columns for Scaling:", numeric_columns)

# Remaining columns are carried through unchanged, in their original order
passthrough_columns = [col for col in X_train.columns if col not in numeric_columns]

# Fit the scaler on the training split only, then reuse the fitted
# statistics for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numeric_columns])
X_test_scaled = scaler.transform(X_test[numeric_columns])

# Wrap the scaled arrays in DataFrames that keep the original row labels
X_train_scaled_df = pd.DataFrame(
    X_train_scaled, index=X_train.index, columns=numeric_columns
)
X_test_scaled_df = pd.DataFrame(
    X_test_scaled, index=X_test.index, columns=numeric_columns
)

# Scaled numeric columns first, followed by the untouched non-numeric ones
X_train_processed = pd.concat(
    [X_train_scaled_df, X_train[passthrough_columns]], axis=1
)
X_test_processed = pd.concat(
    [X_test_scaled_df, X_test[passthrough_columns]], axis=1
)

print("Scaled and Processed Training Set:")
print(X_train_processed.head())


Numeric Columns for Scaling: Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome'],
      dtype='object')
Scaled and Processed Training Set:
            age       job   marital  education   default   balance   housing  \
3344   0.006515 -1.020575 -0.276680  -1.636573 -0.138113 -0.169381  0.892343   
17965  0.759937  1.423250 -0.276680  -1.636573 -0.138113  0.017848  0.892343   
18299  0.100693 -1.326053 -0.276680  -0.297306 -0.138113  0.820681 -1.120646   
10221 -0.370196 -0.104140  1.369745   1.041961 -0.138113 -0.489588  0.892343   
32192  1.419181 -1.020575 -0.276680  -1.636573 -0.138113  0.706889 -1.120646   

           loan   contact       day     month  duration  campaign     pdays  \
3344  -0.438594  1.517492 -0.098218  0.824500 -0.719756 -0.565886 -0.411364   
17965 -0.438594 -0.711462  1.703422 -0.174289  0.047138 -0.245389 -0.411364   
18299 -0.43

# Save Preprocessed Data

In [None]:
import os

# Save the preprocessed datasets to Google Drive
output_dir = '/content/drive/MyDrive/ML Coursework/Preprocessed Dataset'

# to_csv() does not create missing parent folders, so make sure the target
# directory exists first (a no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)

# File name -> object to write; every file is written without the index column
datasets = {
    'X_train.csv': X_train_processed,
    'y_train.csv': y_train,
    'X_test.csv': X_test_processed,
    'y_test.csv': y_test,
}
for file_name, dataset in datasets.items():
    dataset.to_csv(os.path.join(output_dir, file_name), index=False)

print("Preprocessed datasets have been saved to Google Drive.")

Preprocessed datasets have been saved to Google Drive.
