<a href="https://colab.research.google.com/github/thisaraniNJ/MachineLearning_CW/blob/main/MachineLearningCW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [92]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


**Dataset Cleaning Process**


In [104]:

# Load the CSV file using semicolon as the delimiter
df = pd.read_csv('/content/drive/MyDrive/ML CW/bank-full.csv', delimiter=';')

# Display the first few rows to check the data
print(df.head())

# Manually set the correct column names
columns = [
    'age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan',
    'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y'
]

# Ensure the DataFrame has the correct number of columns before setting
if len(df.columns) == len(columns):
    df.columns = columns
else:
    print(f"Warning: Column count mismatch. Found {len(df.columns)} columns, expected {len(columns)}.")

# Replace 'unknown' with NaN for easier handling
df.replace('unknown', pd.NA, inplace=True)

# List of categorical columns to process
categorical_cols = ["job", "marital", "education", "contact", "poutcome"]

for col in categorical_cols:
    # Check if the column exists before imputing
    if col in df.columns:
        mode_value = df[col].mode()[0]  # Get the mode value explicitly
        df.loc[:, col] = df[col].fillna(mode_value)  # Use df.loc for assignment
    else:
        print(f"Warning: Column '{col}' not found in DataFrame")

# Handle 'pdays' column (replace -1 with 0 for meaningful representation)
if 'pdays' in df.columns:
    df.loc[:, 'pdays'] = df['pdays'].apply(lambda x: 0 if x == -1 else x)
else:
    print("Warning: Column 'pdays' not found in DataFrame")

# Encode target variable 'y' (convert 'yes'/'no' to 1/0)
if 'y' in df.columns:
    # Replace 'unknown' in 'y' column with mode or specific value before mapping:
    mode_value_y = df['y'].mode()[0]  # Get the mode of 'y' column
    df.loc[:, 'y'] = df['y'].replace('unknown', mode_value_y)  # Replace 'unknown' with mode

    # Now map to numerical values
    df.loc[:, 'y'] = df['y'].map({'yes': 1, 'no': 0}).astype(int)
else:
    print("Warning: Column 'y' (target) not found in DataFrame")

# Display cleaned dataset info
print("\nCleaned Dataset Info:")
print(df.info())
print("\nCleaned Dataset:")
print(df.head())


   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  

Cleaned Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns

Feature Engineering

In [105]:

# Step 1: Create new features (optional, based on the dataset)
# Create 'age_group' based on age ranges
df['age_group'] = pd.cut(df['age'], bins=[0, 25, 50, 75, 100], labels=['0-25', '26-50', '51-75', '76+'])

# Create 'balance_category' based on account balance
df['balance_category'] = pd.cut(df['balance'], bins=[-float('inf'), 0, 1000, 10000, float('inf')],
                                labels=['low', 'medium', 'high', 'very_high'])

# Create 'duration_group' based on the duration of the last contact
df['duration_group'] = pd.cut(df['duration'], bins=[0, 300, 600, 1200, float('inf')],
                              labels=['short', 'medium', 'long', 'very_long'])

# Step 2: Select features to use in the model
# We will drop the original columns that were transformed (e.g., 'age', 'balance', 'duration') and use the new features
features_to_use = df.drop(columns=['y', 'age', 'balance', 'duration'])

# Step 3: Split the data into features (X) and target variable (y)
X = features_to_use
y = df['y']

# Step 4: Preprocessing pipeline for numerical and categorical features
# Define the numerical features (those that are not categorical)
numerical_cols = ['campaign', 'pdays', 'previous', 'day']

# Preprocessing for numerical data: Scaling using StandardScaler
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numerical transformation pipeline
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with the mean
    ('scaler', StandardScaler())  # Scale features to have mean=0 and variance=1
])

# Preprocessing for categorical data: One-hot encoding
from sklearn.preprocessing import OneHotEncoder

categorical_cols = ['job', 'marital', 'education', 'contact', 'poutcome', 'default', 'housing', 'loan', 'month',
                    'age_group', 'balance_category', 'duration_group']  # Updated list

# Categorical transformation pipeline
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with the most frequent category
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical variables
])

# Combine both transformations using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),  # Using updated numerical_cols
        ('cat', categorical_transformer, categorical_cols)
    ])

# Step 5: Fit the preprocessor on the data and transform it
X_processed = preprocessor.fit_transform(X)

# Step 6: View the transformed data (optional)
print(f"\nTransformed Data Shape: {X_processed.shape}")
print("\nTransformed Data (First 5 Rows):")
print(X_processed[:5])

# You now have your features preprocessed and ready to be used in a machine learning model



Transformed Data Shape: (45211, 56)

Transformed Data (First 5 Rows):
  (0, 0)	-0.5693506376457914
  (0, 1)	-0.41100886034711376
  (0, 2)	-0.25194037067217256
  (0, 3)	-1.2984763315738133
  (0, 8)	1.0
  (0, 16)	1.0
  (0, 20)	1.0
  (0, 21)	1.0
  (0, 23)	1.0
  (0, 26)	1.0
  (0, 29)	1.0
  (0, 30)	1.0
  (0, 40)	1.0
  (0, 46)	1.0
  (0, 48)	1.0
  (0, 54)	1.0
  (1, 0)	-0.5693506376457914
  (1, 1)	-0.41100886034711376
  (1, 2)	-0.25194037067217256
  (1, 3)	-1.2984763315738133
  (1, 13)	1.0
  (1, 17)	1.0
  (1, 19)	1.0
  (1, 21)	1.0
  (1, 23)	1.0
  :	:
  (3, 21)	1.0
  (3, 23)	1.0
  (3, 26)	1.0
  (3, 29)	1.0
  (3, 30)	1.0
  (3, 40)	1.0
  (3, 45)	1.0
  (3, 48)	1.0
  (3, 54)	1.0
  (4, 0)	-0.5693506376457914
  (4, 1)	-0.41100886034711376
  (4, 2)	-0.25194037067217256
  (4, 3)	-1.2984763315738133
  (4, 5)	1.0
  (4, 17)	1.0
  (4, 19)	1.0
  (4, 21)	1.0
  (4, 23)	1.0
  (4, 26)	1.0
  (4, 28)	1.0
  (4, 30)	1.0
  (4, 40)	1.0
  (4, 45)	1.0
  (4, 50)	1.0
  (4, 54)	1.0


In [106]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.3, random_state=42)

# Check the shapes of the train and test sets
print(f"\nTraining Data Shape: {X_train.shape}")
print(f"Testing Data Shape: {X_test.shape}")



Training Data Shape: (31647, 56)
Testing Data Shape: (13564, 56)


In [114]:
print("Unique values in y_train:", y_train.unique())
print("Data type of y_train:", y_train.dtype)


Unique values in y_train: [0 1]
Data type of y_train: object


In [115]:
from sklearn.preprocessing import LabelEncoder

# Encode 'yes' as 1 and 'no' as 0
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)


In [116]:
# Train the model with encoded labels
rf_model.fit(X_train, y_train_encoded)

# Make predictions
y_pred_rf = rf_model.predict(X_test)
y_pred_rf_prob = rf_model.predict_proba(X_test)[:, 1]

# Evaluate the model
print("Random Forest Model Results:")
print("Accuracy:", accuracy_score(y_test_encoded, y_pred_rf))
print("Classification Report:\n", classification_report(y_test_encoded, y_pred_rf))
print("ROC-AUC Score:", roc_auc_score(y_test_encoded, y_pred_rf_prob))


Random Forest Model Results:
Accuracy: 0.894426422884105
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.96      0.94     11966
           1       0.58      0.39      0.46      1598

    accuracy                           0.89     13564
   macro avg       0.75      0.67      0.70     13564
weighted avg       0.88      0.89      0.89     13564

ROC-AUC Score: 0.8931862011201116


In [121]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder # Import LabelEncoder


# Define the Neural Network structure
model_nn = Sequential([
    Dense(32, activation='relu', input_shape=(X_train.shape[1],)),  # Input layer
    Dense(16, activation='relu'),  # Hidden layer
    Dense(1, activation='sigmoid')  # Output layer (binary classification)
])

# Compile the model
model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Add early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
# Convert y_train to float32
y_train_float = y_train.astype(np.float32)
history = model_nn.fit(
    X_train, y_train_float, # Use the converted y_train
    epochs=50,  # Maximum number of epochs
    batch_size=32,
    validation_split=0.2,  # Use 20% of training data for validation
    callbacks=[early_stopping],
    verbose=1
)

# Evaluate the model on the test set
# Convert y_test to float32 as well
y_test_float = y_test.astype(np.float32)
loss, accuracy = model_nn.evaluate(X_test, y_test_float) # Use the converted y_test
y_pred_nn_prob = model_nn.predict(X_test).flatten()
y_pred_nn = (y_pred_nn_prob > 0.5).astype(int)  # Convert probabilities to binary predictions


# Encode y_test using the same LabelEncoder used for y_train
# Assuming you have a label_encoder object from previous code
label_encoder = LabelEncoder() # Initialize or retrieve if it already exists
y_test_encoded = label_encoder.fit_transform(y_test) # Fit and transform or only transform if already fit


# Neural Network Results
print("\nNeural Network Model Results:")
print("Test Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test_encoded, y_pred_nn)) # Use encoded y_test
print("ROC-AUC Score:", roc_auc_score(y_test_encoded, y_pred_nn_prob)) # Use encoded y_test


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m792/792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.8474 - loss: 0.3523 - val_accuracy: 0.8994 - val_loss: 0.2347
Epoch 2/50
[1m792/792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8962 - loss: 0.2404 - val_accuracy: 0.8987 - val_loss: 0.2313
Epoch 3/50
[1m792/792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8982 - loss: 0.2400 - val_accuracy: 0.8997 - val_loss: 0.2283
Epoch 4/50
[1m792/792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9009 - loss: 0.2308 - val_accuracy: 0.9006 - val_loss: 0.2251
Epoch 5/50
[1m792/792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8998 - loss: 0.2311 - val_accuracy: 0.9030 - val_loss: 0.2237
Epoch 6/50
[1m792/792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.9059 - loss: 0.2211 - val_accuracy: 0.9017 - val_loss: 0.2241
Epoch 7/50
[1m792/792[0m 