<a href="https://colab.research.google.com/github/vimesh630/ML_CW/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Libraries and Mount Google Drive

In [7]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Define the paths
drive_path = '/content/drive/MyDrive/ML Coursework/'
file1_path = os.path.join(drive_path, 'bank+marketing/bank-additional/bank-additional/bank-additional-full.csv')
file2_path = os.path.join(drive_path, 'bank+marketing/bank-additional/bank-additional/bank-additional.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Load and Merge Datasets

In [8]:
# Load the datasets
df1 = pd.read_csv(file1_path, sep=';')
df2 = pd.read_csv(file2_path, sep=';')

# Merge the datasets
data = pd.concat([df1, df2], axis=0, ignore_index=True)

# Verify the merged dataset
print("Columns after loading:")
print(data.columns)

Columns after loading:
Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')


Handle Missing Values

In [9]:
# Handle missing values for numerical and categorical columns
numerical_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Identify numerical and categorical columns
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = data.select_dtypes(include=['object']).columns

# Impute missing values for numerical columns
for col in numerical_cols:
    data[col] = numerical_imputer.fit_transform(data[[col]])

# Impute missing values for categorical columns
for col in categorical_cols:
    data[col] = categorical_imputer.fit_transform(data[[col]].values.reshape(-1, 1)).ravel()

# Verify no missing values remain
print("Missing values after imputation:")
print(data.isnull().sum())

Missing values after imputation:
age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64


Ensure the 'y' Column Exists

In [10]:
# Verify that the 'y' column exists before processing
print("Columns before target conversion:")
print(data.columns)
print("First few rows of 'y':")
print(data['y'].head())

Columns before target conversion:
Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')
First few rows of 'y':
0    no
1    no
2    no
3    no
4    no
Name: y, dtype: object


Convert Target Variable to Binary

In [11]:
# Convert 'y' column to binary
data['y'] = data['y'].apply(lambda x: 1 if x == 'yes' else 0)

# Confirm the conversion
print("Target value counts after conversion:")
print(data['y'].value_counts())

Target value counts after conversion:
y
0    40216
1     5091
Name: count, dtype: int64


Encode Categorical Variables

In [13]:
# Encode categorical variables
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
categorical_encoded = pd.DataFrame(
    encoder.fit_transform(data[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=data.index
)

# Drop original categorical columns and concatenate encoded columns
data = pd.concat([data.drop(columns=categorical_cols), categorical_encoded], axis=1)

# Verify encoding
print("Data after encoding:")
print(data.head())

Data after encoding:
    age  duration  campaign  pdays  previous  emp.var.rate  cons.price.idx  \
0  56.0     261.0       1.0  999.0       0.0           1.1          93.994   
1  57.0     149.0       1.0  999.0       0.0           1.1          93.994   
2  37.0     226.0       1.0  999.0       0.0           1.1          93.994   
3  40.0     151.0       1.0  999.0       0.0           1.1          93.994   
4  56.0     307.0       1.0  999.0       0.0           1.1          93.994   

   cons.conf.idx  euribor3m  nr.employed  ...  day_of_week_fri  \
0          -36.4      4.857       5191.0  ...              0.0   
1          -36.4      4.857       5191.0  ...              0.0   
2          -36.4      4.857       5191.0  ...              0.0   
3          -36.4      4.857       5191.0  ...              0.0   
4          -36.4      4.857       5191.0  ...              0.0   

   day_of_week_mon  day_of_week_thu  day_of_week_tue  day_of_week_wed  \
0              1.0              0.0     

Handle Class Imbalance with SMOTE

In [15]:
# Verify the presence of 'y' before separating features and target
print("Columns in DataFrame:")
print(data.columns)

# Check the first few rows of 'y'
if 'y' in data.columns:
    print("First few rows of 'y':")
    print(data['y'].head())
else:
    print("The 'y' column is missing! Reloading and processing the dataset.")

    # Reload and process the dataset
    data = pd.concat([df1, df2], axis=0, ignore_index=True)

    # Convert 'y' column to binary
    data['y'] = data['y'].apply(lambda x: 1 if x == 'yes' else 0)

    # Verify after conversion
    print("Target value counts after conversion:")
    print(data['y'].value_counts())

# Separate features and target
X = data.drop(columns=['y'])
y = data['y']

# Verify separation
print("Features shape:", X.shape)
print("Target shape:", y.shape)

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Verify SMOTE results
print("Shapes after SMOTE:")
print("Features:", X_resampled.shape)
print("Target:", y_resampled.shape)


Columns in DataFrame:
Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_cellular', 'contact_telephone',
       'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 



ValueError: could not convert string to float: 'housemaid'

Split Dataset into Training and Testing Sets

In [None]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)

# Verify the split
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

NameError: name 'X_resampled' is not defined

Scale Features for Neural Networks

In [None]:
# Scale features for Neural Networks
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Verify scaling
print("Scaled training set shape:", X_train_scaled.shape)
print("Scaled testing set shape:", X_test_scaled.shape)

Save Files to Google Drive

In [None]:
# Create an output directory
output_dir = os.path.join(drive_path, '/Preprocessed Dataset')
os.makedirs(output_dir, exist_ok=True)

# Save files
pd.DataFrame(X_train).to_csv(os.path.join(output_dir, 'X_train.csv'), index=False)
pd.DataFrame(X_test).to_csv(os.path.join(output_dir, 'X_test.csv'), index=False)
pd.DataFrame(X_train_scaled).to_csv(os.path.join(output_dir, 'X_train_scaled.csv'), index=False)
pd.DataFrame(X_test_scaled).to_csv(os.path.join(output_dir, 'X_test_scaled.csv'), index=False)
pd.DataFrame(y_train).to_csv(os.path.join(output_dir, 'y_train.csv'), index=False)
pd.DataFrame(y_test).to_csv(os.path.join(output_dir, 'y_test.csv'), index=False)

print("Preprocessing complete. Files saved to Google Drive.")