# **Load the dataset and get an overview**

In [None]:
import pandas as pd


# Read the semicolon-delimited CSV straight from GitHub into the working DataFrame `data`.
data = pd.read_csv("https://raw.githubusercontent.com/vihanga-induwara/CM2604-CW-Bank-Marketing/refs/heads/main/bank%2Bmarketing/bank-additional/bank-additional/bank-additional-full.csv", sep=";")


In [None]:
# Preview the first five rows
data.head()

In [None]:
# Column names, dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
data.info()

In [None]:
# Count NaN entries per column.
# NOTE: later cells treat the string 'unknown' as the missing-value marker in
# several categorical columns; isnull() does not count those entries.
print(data.isnull().sum())

In [None]:
# Summary statistics as returned by DataFrame.describe() with default arguments
print(data.describe())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of 'age' (20 bins) with a KDE curve overlaid, configured through
# the Axes object that seaborn returns.
age_ax = sns.histplot(data['age'], bins=20, kde=True, color='blue')
age_ax.set_title('Age Distribution')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many rows carry each value of the target column 'y'
outcome_ax = sns.countplot(x='y', data=data, palette='pastel')
outcome_ax.set_title('Distribution of Campaign Outcome (Balanced vs. Imbalanced)')
outcome_ax.set_xlabel('Outcome')
outcome_ax.set_ylabel('Count')
plt.show()


In [None]:
# Share of each campaign outcome, in percent
outcome_counts = data['y'].value_counts(normalize=True) * 100
print(outcome_counts)

# Pie chart of the same shares.
# Wedge labels are derived from the Series index instead of being hard-coded,
# so every label is guaranteed to match its slice whatever order
# value_counts() returns the classes in.
outcome_counts.plot.pie(
    autopct='%1.1f%%',
    labels=[str(label).capitalize() for label in outcome_counts.index],
    colors=['lightblue', 'orange'],
)
plt.title('Percentage Distribution of Campaign Outcome')
plt.ylabel('')  # Remove y-axis label
plt.show()


# **Preprocess Data**

## **Overview of the data**

In [None]:

# Per-column summary: dtype, cardinality and, for string columns, the distinct values
separator = "-" * 40
for column, column_dtype in data.dtypes.items():
    print(f"Column: {column}")
    print(f"Type: {column_dtype}")
    print(f"Number of unique values: {data[column].nunique()}")

    # Only object-dtype (categorical) columns get their categories listed
    if column_dtype == 'object':
        print(f"Categories: {data[column].unique()}")
    print(separator)


## **age**

In [None]:

# Frequency of each distinct age value
data["age"].value_counts()

In [None]:
import matplotlib.pyplot as plt

# 30-bin histogram of the raw 'age' column, drawn via the explicit Axes API
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(data["age"], bins=30, color='skyblue', edgecolor='black')
ax.set_title("Distribution of Age", fontsize=16)
ax.set_xlabel("Age", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
import pandas as pd

# Remove 'age' outliers with the 1.5 * IQR rule.
# Q1 / Q3 are the 25th / 75th percentiles; IQR is their difference.
Q1 = data['age'].quantile(0.25)
Q3 = data['age'].quantile(0.75)
IQR = Q3 - Q1

# Rows whose age falls outside [lower_bound, upper_bound] are dropped
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")

# Capture the shape BEFORE filtering; reading data.shape after the reassignment
# below would report the filtered shape for both messages.
original_shape = data.shape

# Keep only in-range rows. .copy() makes the result an independent frame, so the
# column assignments in later cells do not operate on a slice of the original
# (which triggers SettingWithCopyWarning).
data = data[(data['age'] >= lower_bound) & (data['age'] <= upper_bound)].copy()

# Report how many rows the filter removed
print(f"Original data shape: {original_shape}")
print(f"Data shape after removing outliers: {data.shape}")


In [None]:
import matplotlib.pyplot as plt

# Same histogram as before, now drawn after the IQR outlier filter was applied to 'age'
plt.figure(figsize=(8, 6))
plt.hist(data["age"], bins=30, color='skyblue', edgecolor='black')
plt.title("Distribution of Age", fontsize=16)
plt.xlabel("Age", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Create a scaler instance
scaler = MinMaxScaler()

# Min-max scale the 'age' column in place (the double brackets pass the 2-D
# input the scaler expects).
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed later in the notebook, so test rows influence the scaling.
data['age'] = scaler.fit_transform(data[['age']])

# Display the first few rows of the normalized column
print(data[['age']].head())


In [None]:
import matplotlib.pyplot as plt

# Same histogram again, now on the min-max-scaled 'age' values
# (title and axis labels are unchanged from the earlier plots)
plt.figure(figsize=(8, 6))
plt.hist(data["age"], bins=30, color='skyblue', edgecolor='black')
plt.title("Distribution of Age", fontsize=16)
plt.xlabel("Age", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


## **job**

In [None]:

# Frequency of each job category
data["job"].value_counts()

In [None]:
import pandas as pd

# One-hot encode 'job' into columns prefixed 'job_' (all categories kept: drop_first=False)
data_encoded = pd.get_dummies(data['job'], prefix='job', drop_first=False)

# Append the dummy columns to the working frame; the original 'job' column is
# still present here and is dropped in the next cell
data = pd.concat([data, data_encoded], axis=1)

# Display the first few rows of the updated dataframe
print(data.head())


In [None]:
# Remove the original 'job' column by rebinding `data` to the reduced frame
data = data.drop(columns=['job'])

# Confirm the column is gone
print(data.head())


## **marital**

In [None]:
# Frequency of each marital status (before imputing 'unknown')
data["marital"].value_counts()

In [None]:
# Impute 'unknown' marital status with the most frequent category.
most_frequent_marital = data['marital'].mode()[0]

# Assign the result back to the column instead of calling
# replace(..., inplace=True) on the selection data['marital']: that is chained
# assignment through an in-place method, which raises a FutureWarning in
# pandas 2.x and leaves `data` unchanged once copy-on-write is enabled.
data['marital'] = data['marital'].replace('unknown', most_frequent_marital)


In [None]:
# Frequency of each marital status after the 'unknown' replacement
data["marital"].value_counts()

In [None]:
# One-hot encode 'marital'; get_dummies(columns=[...]) replaces the original
# column with its dummy columns, keeping every category (drop_first=False)
data = pd.get_dummies(data, columns=['marital'], drop_first=False)

# Check the result
print(data.head())


## **education**

In [None]:

# Frequency of each education level
data["education"].value_counts()

In [None]:
# Find the most frequent category in the 'education' column
most_frequent_education = data['education'].mode()[0]

# Replace 'illiterate' with the most frequent category.
# Only 'illiterate' is rewritten here; any other category (including an
# 'unknown' value, if present) is left as-is and gets its own dummy column later.
data['education'] = data['education'].replace('illiterate', most_frequent_education)




In [None]:
# Value counts of 'education' after 'illiterate' was merged into the modal category
print(data['education'].value_counts())

In [None]:
# Build the 'education_*' dummy columns from the cleaned column
education_encoded = pd.get_dummies(data['education'], prefix='education')

# Append the dummies and remove the source column in one chained expression
data = pd.concat([data, education_encoded], axis=1).drop(columns=['education'])

# Show the first rows of the updated frame
print(data.head())


## **default**

In [None]:

# Frequency of each value in 'default'
data["default"].value_counts()

In [None]:
# Drop the 'default' column unconditionally (the decision that it is
# uninformative is made by inspecting the value counts above, not in code)
data = data.drop('default', axis=1)

# Check the remaining columns
print(data.columns)


## **housing**

In [None]:

# Frequency of each value in 'housing'
data["housing"].value_counts()

In [None]:
# Impute 'unknown' with the most frequent *known* category. The replacement
# value is computed from the data (excluding 'unknown' itself) rather than
# hard-coded, matching how the 'marital' column is imputed.
most_frequent_housing = data.loc[data['housing'] != 'unknown', 'housing'].mode()[0]
data['housing'] = data['housing'].replace('unknown', most_frequent_housing)

# Check the updated value counts
print(data['housing'].value_counts())


In [None]:
# Build the 'housing_*' dummy columns
housing_encoded = pd.get_dummies(data['housing'], prefix='housing')

# Append the dummies and remove the source column in one chained expression
data = pd.concat([data, housing_encoded], axis=1).drop(columns=['housing'])

# Show the first rows of the updated frame
print(data.head())


## **loan**

In [None]:

# Frequency of each value in 'loan'
data["loan"].value_counts()

In [None]:
# Impute 'unknown' with the most frequent *known* category.
# The replacement is computed from the data instead of being hard-coded to
# 'yes': a literal is only correct if 'yes' really is the modal value, and for
# 'loan' that must be checked against the value counts rather than assumed.
most_frequent_loan = data.loc[data['loan'] != 'unknown', 'loan'].mode()[0]
data['loan'] = data['loan'].replace('unknown', most_frequent_loan)

# Check the updated value counts
print(data['loan'].value_counts())


In [None]:
# Build the 'loan_*' dummy columns
loan_encoded = pd.get_dummies(data['loan'], prefix='loan')

# Append the dummies and remove the source column in one chained expression
data = pd.concat([data, loan_encoded], axis=1).drop(columns=['loan'])

# Show the first rows of the updated frame
print(data.head())


## **contact**

In [None]:

# Frequency of each value in 'contact'
data["contact"].value_counts()

In [None]:
# Drop the 'contact' column; it is not encoded and is not used as a feature
data = data.drop(columns=['contact'])

# Check the updated dataframe
print(data.head())


## **month**

In [None]:

# Frequency of each value in 'month'
data["month"].value_counts()

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the row count per month, styled through the returned Axes
month_ax = data['month'].value_counts().plot(kind='bar', color='skyblue')
month_ax.set_title('Distribution of Month Values')
month_ax.set_xlabel('Month')
month_ax.set_ylabel('Frequency')

# Tilt the tick labels, then render
plt.xticks(rotation=45)
plt.show()


In [None]:
import pandas as pd

# One-hot encode 'month'; the original column is replaced by one 'month_*'
# column per category (drop_first=False keeps all of them)
data = pd.get_dummies(data, columns=['month'], drop_first=False)

# Display the transformed data to check the result
print(data.head())


## **day_of_week**

In [None]:

# Frequency of each value in 'day_of_week'
data["day_of_week"].value_counts()

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the row count per weekday, styled through the returned Axes
day_ax = data['day_of_week'].value_counts().plot(kind='bar', color='skyblue')
day_ax.set_title('Distribution of Days')
day_ax.set_xlabel('Day')
day_ax.set_ylabel('Frequency')

# Tilt the tick labels, then render
plt.xticks(rotation=45)
plt.show()


In [None]:
# Remove 'day_of_week' by rebinding `data` to the reduced frame
data = data.drop(columns=['day_of_week'])

# Confirm the column is gone
print(data.head())


## **duration**

In [None]:

# Frequency of each distinct 'duration' value
data["duration"].value_counts()

In [None]:
import matplotlib.pyplot as plt

# Plot the distribution of the (still unscaled) 'duration' column as a 30-bin histogram
plt.figure(figsize=(10, 6))
plt.hist(data['duration'], bins=30, color='skyblue', edgecolor='black')
plt.title('Distribution of Duration')
plt.xlabel('Duration')
plt.ylabel('Frequency')
plt.show()


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale 'duration' in place. No explicit reshape is needed: selecting
# with double brackets (data[['duration']]) already yields the 2-D input the
# scaler expects.
scaler = MinMaxScaler()
data['duration'] = scaler.fit_transform(data[['duration']])

# Check the result
print(data[['duration']].head())


## **campaign**

In [None]:

# Frequency of each distinct 'campaign' value (before scaling)
data["campaign"].value_counts()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale 'campaign' in place. No explicit reshape is needed: the
# double-bracket selection already yields the 2-D input the scaler expects.
scaler = MinMaxScaler()
data['campaign'] = scaler.fit_transform(data[['campaign']])

# Check the result
print(data[['campaign']].head())


In [None]:

# Frequency of each distinct 'campaign' value after min-max scaling
data["campaign"].value_counts()

## **pdays**

In [None]:

# Frequency of each distinct 'pdays' value
data["pdays"].value_counts()

In [None]:
import matplotlib.pyplot as plt

# Tally each distinct 'pdays' value
pdays_counts = data["pdays"].value_counts()

# Bar chart of those tallies, labelled through the returned Axes
pdays_ax = pdays_counts.plot(kind='bar', figsize=(10,6), color='skyblue')
pdays_ax.set_xlabel('pdays Values')
pdays_ax.set_ylabel('Frequency')
pdays_ax.set_title('Value Counts of pdays')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Drop the 'pdays' column; it is not used as a feature
data = data.drop(columns=['pdays'])
print(data.head())

## **previous**

In [None]:

# Frequency of each distinct 'previous' value
data["previous"].value_counts()

In [None]:
import matplotlib.pyplot as plt

# Plot the value counts for the 'previous' column; sort_index() orders the
# bars by the 'previous' value itself rather than by frequency
data['previous'].value_counts().sort_index().plot(kind='bar')

# Adding labels and title
plt.xlabel('Previous')
plt.ylabel('Count')
plt.title('Distribution of Previous Column')

# Show the plot
plt.show()


In [None]:
# Drop the 'previous' column
data = data.drop(columns=['previous'])
print(data.head())

## **poutcome**

In [None]:

# Frequency of each value in 'poutcome'
data["poutcome"].value_counts()

In [None]:
# Drop the 'poutcome' column
data = data.drop(columns=['poutcome'])
print(data.head())

## **emp.var.rate**

In [None]:

# Frequency of each distinct 'emp.var.rate' value (before standardization)
data["emp.var.rate"].value_counts()

In [None]:
# Histogram of the raw 'emp.var.rate' values.
# Relies on `plt` and `sns` having been imported by an earlier cell.
plt.figure(figsize=(8, 6))
sns.histplot(data['emp.var.rate'], kde=True, bins=30)  # kde=True adds a Kernel Density Estimate
plt.title('Histogram of Employment Variation Rate with Outliers')
plt.xlabel('Employment Variation Rate')
plt.ylabel('Frequency')
plt.show()


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize 'emp.var.rate' in place with StandardScaler. Unlike the other
# numeric columns in this notebook, this one is not min-max scaled.
scaler = StandardScaler()
data['emp.var.rate'] = scaler.fit_transform(data[['emp.var.rate']])


In [None]:

# Frequency of each distinct 'emp.var.rate' value after standardization
data["emp.var.rate"].value_counts()

In [None]:
# Same histogram as above, now drawn on the standardized 'emp.var.rate' values
# (the title text is unchanged from the pre-scaling plot; no rows were removed
# in between)
plt.figure(figsize=(8, 6))
sns.histplot(data['emp.var.rate'], kde=True, bins=30)  # kde=True adds a Kernel Density Estimate
plt.title('Histogram of Employment Variation Rate with Outliers')
plt.xlabel('Employment Variation Rate')
plt.ylabel('Frequency')
plt.show()


## **cons.price.idx**

In [None]:

# Frequency of each distinct 'cons.price.idx' value
data["cons.price.idx"].value_counts()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 30-bin histogram with KDE of 'cons.price.idx', drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10,6))
sns.histplot(data['cons.price.idx'], kde=True, bins=30, ax=ax)
ax.set_title('Distribution of cons.price.idx')
ax.set_xlabel('cons.price.idx')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Quartiles and IQR of 'cons.price.idx'. This rebinds the Q1 / Q3 / IQR names
# previously used for the 'age' column.
Q1 = data['cons.price.idx'].quantile(0.25)
Q3 = data['cons.price.idx'].quantile(0.75)
IQR = Q3 - Q1

# Identify outliers: rows strictly outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
outliers = data[(data['cons.price.idx'] < (Q1 - 1.5 * IQR)) | (data['cons.price.idx'] > (Q3 + 1.5 * IQR))]

# Print the outliers
print(outliers)


In [None]:
# Build a copy of the data without 'cons.price.idx' outliers.
# NOTE: the result is stored in `data_no_outliers` only; `data` itself is not
# modified, and the following cells keep working on `data`.
data_no_outliers = data[(data['cons.price.idx'] >= (Q1 - 1.5 * IQR)) & (data['cons.price.idx'] <= (Q3 + 1.5 * IQR))]

# Verify the data without outliers
print(data_no_outliers.head())


In [None]:
# Value counts for cons.price.idx, taken from `data` (not from `data_no_outliers`)
print(data['cons.price.idx'].value_counts())


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Min-max scale 'cons.price.idx' in place; the double-bracket selection
# supplies the 2-D input the scaler expects, so no reshape is performed
data['cons.price.idx'] = scaler.fit_transform(data[['cons.price.idx']])

# Check the normalized values
print(data[['cons.price.idx']].head())


## **cons.conf.idx**

In [None]:

# Frequency of each distinct 'cons.conf.idx' value
data["cons.conf.idx"].value_counts()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Min-max scale 'cons.conf.idx' into a NEW column, 'cons.conf.idx_normalized';
# the original column is kept here for comparison and dropped in the next cell
data['cons.conf.idx_normalized'] = scaler.fit_transform(data[['cons.conf.idx']])

# Check the normalized values
print(data[['cons.conf.idx', 'cons.conf.idx_normalized']].head())


In [None]:
# Drop the unscaled 'cons.conf.idx' column; 'cons.conf.idx_normalized' remains
data = data.drop(columns=['cons.conf.idx'])
print(data.head())

## **euribor3m**

In [None]:

# Frequency of each distinct 'euribor3m' value
data["euribor3m"].value_counts()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Min-max scale 'euribor3m' into a NEW column, 'euribor3m_normalized';
# the original column is dropped in the next cell
data['euribor3m_normalized'] = scaler.fit_transform(data[['euribor3m']])

# Check the normalized values
print(data[['euribor3m', 'euribor3m_normalized']].head())


In [None]:
# Drop the unscaled 'euribor3m' column; 'euribor3m_normalized' remains
data = data.drop(columns=['euribor3m'])
print(data.head())

## **nr.employed**

In [None]:

# Frequency of each distinct 'nr.employed' value
data["nr.employed"].value_counts()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Min-max scale 'nr.employed' into a NEW column, 'nr.employed_normalized';
# the original column is dropped in the next cell
data['nr.employed_normalized'] = scaler.fit_transform(data[['nr.employed']])

# Check the normalized values
print(data[['nr.employed', 'nr.employed_normalized']].head())


In [None]:
# Drop the unscaled 'nr.employed' column; 'nr.employed_normalized' remains
data = data.drop(columns=['nr.employed'])
print(data.head())

## **Y**

In [None]:
# Encode the 'y' column as integers: 'yes' -> 1, 'no' -> 0.
# NOTE: Series.map() turns any value missing from the dict into NaN, so running
# this cell a second time (when 'y' already holds 0/1) would wipe the target.
data['y'] = data['y'].map({'yes': 1, 'no': 0})

# Verify the encoding
print(data['y'].value_counts())


## **After column-wise preprocessing**

In [None]:
# Display the fully preprocessed DataFrame
data

In [None]:
# Display all columns in the DataFrame.
# set_option changes the pandas display setting globally for the rest of the session.
pd.set_option('display.max_columns', None)  # Show all columns
print(data.head())  # Display the first few rows

## **Handle dataset imbalance**

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import Counter
import pandas as pd

# Split data into features and target
X = data.drop(columns=['y'])
y = data['y']

# Split into training and testing sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Check class distribution before resampling
print("Before SMOTE:", Counter(y_train))

# Apply SMOTE to training data only (the test split is left untouched)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check class distribution after SMOTE
print("After SMOTE:", Counter(y_train_resampled))

# Scale the data: the scaler is fitted on the resampled training set and then
# applied to the test set. Both names are rebound to NumPy arrays here.
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Combine the resampled data into a single DataFrame (optional)
balanced_data = pd.DataFrame(X_train_resampled, columns=X.columns)  # Use original column names
balanced_data['y'] = y_train_resampled  # Add the target label column

# NOTE(review): the "Train Model" section later re-splits `data` and fits its
# models on that new X_train / y_train; X_train_resampled, y_train_resampled and
# balanced_data are not passed to any model in this notebook.
# Now you can train your model using X_train_resampled and y_train_resampled


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter

# Tally the class labels before and after oversampling
before_counts = Counter(y_train)  # Original training data (imbalanced)
after_counts = Counter(y_train_resampled)  # Resampled training data (balanced)

# One row per class, one column per dataset
df = pd.DataFrame({
    'Class': ['No', 'Yes'],
    'Before SMOTE': [before_counts[0], before_counts[1]],
    'After SMOTE': [after_counts[0], after_counts[1]]
})

# Grouped bar chart, configured through the Axes returned by DataFrame.plot
counts_ax = df.set_index('Class').plot(kind='bar', figsize=(8, 5), color=['skyblue', 'orange'])
counts_ax.set_title('Class Distribution Before and After SMOTE')
counts_ax.set_ylabel('Count')
counts_ax.set_xlabel('Class')
plt.xticks(rotation=0)
counts_ax.grid(axis='y', linestyle='--', alpha=0.7)
counts_ax.legend(title='Dataset')
plt.tight_layout()
plt.show()


In [None]:
# Display all columns in the DataFrame (`data` itself was not changed by the SMOTE cells)
pd.set_option('display.max_columns', None)  # Show all columns
print(data.head())  # Display the first few rows

# **Feature engineering**

## **Feature Extraction**

In [None]:
from sklearn.decomposition import PCA
import pandas as pd

# Columns to be compressed into a single principal component
features = ['emp.var.rate', 'cons.price.idx','cons.conf.idx_normalized', 'euribor3m_normalized', 'nr.employed_normalized']

# Fit a one-component PCA on those columns
pca = PCA(n_components=1)
principal_components = pca.fit_transform(data[features])

# Append the component as 'PC1' and remove the source columns in one chained step
data = data.assign(PC1=principal_components[:, 0]).drop(columns=features)

# Report the share of variance captured by the retained component
explained_variance = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance)


In [None]:
# Display the DataFrame after the PCA step
data

## **Feature Selection**

### **Random forest model**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd

# Rebuild X / y from the CURRENT `data`. The X left over from the SMOTE cell was
# created before the PCA step replaced five columns with 'PC1', so reusing it
# would rank features that no longer exist in `data` and would omit 'PC1'.
X = data.drop(columns=['y'])
y = data['y']

# Get categorical and numerical columns (modified to handle no categorical columns)
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Preprocess categorical features with OneHotEncoder (only if there are categorical columns)
if len(categorical_cols) > 0:
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ]
    )

    # Apply the preprocessor and fit the transformer
    X_encoded = preprocessor.fit_transform(X)

    # Get the names of the encoded columns
    encoded_columns = preprocessor.transformers_[1][1].get_feature_names_out(categorical_cols)

    # Combine numerical and encoded feature names (same order as the transformer output)
    all_columns = list(numerical_cols) + list(encoded_columns)
else:
    # If no categorical columns, use numerical columns directly
    X_encoded = X[numerical_cols].values  # Convert to NumPy array
    all_columns = list(numerical_cols)

# Fit a random forest model. A fixed random_state makes the importances, and
# therefore the list of "weak" features, reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_encoded, y)

# Get feature importance
feature_importance = pd.Series(rf.feature_importances_, index=all_columns)

# Display features with low importance (below 0.01)
weak_features_rf = feature_importance[feature_importance < 0.01]
print("Features with low importance (Random Forest):")
print(weak_features_rf)

### **Lasso model**

In [None]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

# Rebuild X / y from the CURRENT `data` so this cell does not depend on an X
# that was created before the PCA step changed the column set.
X = data.drop(columns=['y'])
y = data['y']

# Standardize the features so the L1 penalty treats all of them on the same scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit a Lasso model
lasso = Lasso(alpha=0.01)  # Adjust alpha as necessary
lasso.fit(X_scaled, y)

# Get the coefficients of the model
coef = pd.Series(lasso.coef_, index=X.columns)

# Keep the features whose coefficient is exactly zero, i.e. the ones the L1
# penalty eliminated
weak_features_lasso = coef[coef == 0]
print("Features with near-zero coefficients (less important):")
print(weak_features_lasso)


### **correlation**

In [None]:
# Pairwise correlation between every pair of columns in `data` (target included)
correlation_matrix = data.corr()

# Correlation of each column with 'y', strongest positive first
correlation_with_y = correlation_matrix['y'].sort_values(ascending=False)

# Features whose absolute correlation with 'y' is below 0.1
weak_correlations = correlation_with_y[abs(correlation_with_y) < 0.1]
print("Features with weak correlation to y:")
print(weak_correlations)


### **ANOVA F-test**

In [None]:
from sklearn.feature_selection import f_classif

# Rebuild X / y from the current `data`; this rebinds the notebook-level X and y
# that later cells (e.g. "Combine and drop") read
X = data.drop(columns=['y'])
y = data['y']

# Perform ANOVA F-test of each feature against the class label
F_values, p_values = f_classif(X, y)

# Create a DataFrame to show the F-values and p-values for each feature
feature_scores = pd.DataFrame({'Feature': X.columns, 'F-value': F_values, 'p-value': p_values})

# Features with high p-value (>0.05) indicate less significance
weak_features = feature_scores[feature_scores['p-value'] > 0.05]
print("Features with weak effect on y (p-value > 0.05):")
print(weak_features)


### **Output**

In [None]:
# Display the DataFrame before weak features are dropped
data

### **Combine and drop**

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import f_classif

# Re-run the four selection methods on the current X / y and vote on which
# features are weak.

# 1. Random Forest Feature Importance (importance < 0.01 counts as weak).
# random_state is fixed so the set of weak features, and therefore the columns
# dropped below, is the same on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)
feature_importance = pd.Series(rf.feature_importances_, index=X.columns)
weak_features_rf = feature_importance[feature_importance < 0.01].index

# 2. Lasso Model (coefficient shrunk to exactly zero counts as weak)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
lasso = Lasso(alpha=0.01)
lasso.fit(X_scaled, y)
lasso_coef = pd.Series(lasso.coef_, index=X.columns)
weak_features_lasso = lasso_coef[lasso_coef == 0].index

# 3. Correlation with target variable y (|r| < 0.1 counts as weak)
correlation_matrix = X.corrwith(y)
weak_correlations = correlation_matrix[abs(correlation_matrix) < 0.1].index

# 4. ANOVA F-test (p-value > 0.05 counts as weak)
F_values, p_values = f_classif(X, y)
feature_scores = pd.DataFrame({'Feature': X.columns, 'F-value': F_values, 'p-value': p_values})
weak_features_anova = feature_scores[feature_scores['p-value'] > 0.05]['Feature']

# Combine the outputs into a single DataFrame indexed by feature name: a cell is
# non-null exactly when that method flagged the feature as weak
summary_df = pd.DataFrame({
    'Random_Forest_Importance': pd.Series(list(weak_features_rf), index=weak_features_rf),
    'Lasso_Coefficients': pd.Series(list(weak_features_lasso), index=weak_features_lasso),
    'Correlation_with_y': pd.Series(list(weak_correlations), index=weak_correlations),
    'ANOVA_p_value': pd.Series(list(weak_features_anova), index=weak_features_anova)
})

# Set a threshold for features being weak (if they appear weak in multiple methods)
threshold = 2  # Number of methods in which a feature is considered weak

# Count how many methods flagged each feature
weak_features_count = (summary_df.notna()).sum(axis=1)

# Features to remove: those that are weak in at least `threshold` methods
features_to_remove = weak_features_count[weak_features_count >= threshold].index

# Print features that have weak effects on y
print("Features with weak effect across multiple methods:")
print(features_to_remove)

# Now, remove the least important features
X_reduced = X.drop(columns=features_to_remove)

# Optional: remove exactly the top N weak features instead. This OVERRIDES the
# X_reduced computed just above, and it is `sorted_weak_features` (not
# `features_to_remove`) that the next cell drops from `data`.
# kind="stable" keeps features with equal vote counts in their existing index
# order, so the selection among ties is deterministic.
top_n_to_remove = 5
sorted_weak_features = (
    weak_features_count.sort_values(ascending=False, kind="stable")
    .head(top_n_to_remove)
    .index
)
X_reduced = X.drop(columns=sorted_weak_features)

# Print the final set of features to be removed
print(f"Top {top_n_to_remove} features to remove based on combined methods:")
print(sorted_weak_features)


In [None]:
# Drop the columns listed in sorted_weak_features (the top-N list from the
# previous cell) from the 'data' DataFrame
data_reduced = data.drop(columns=sorted_weak_features)

# Print the resulting DataFrame after removal
print("Data after dropping the weak features:")
print(data_reduced.head())  # Display the first few rows of the reduced dataset

# Rebind `data` to the reduced frame; every later cell works on the reduced
# feature set
data = data_reduced

In [None]:
# Display the DataFrame after weak features were dropped
data

# **Train Model**

## **Import Libraries**

In [None]:
# Importing required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


## **Load and Split Data**

In [None]:
# Separate the predictors from the target in the feature-selected `data`
# ('y' is the 0/1-encoded target column)
X = data.drop('y', axis=1)  # Features
y = data['y']  # Target variable

# Splitting data into training and testing sets (80% train, 20% test).
# stratify=y keeps the class proportions identical in both partitions, matching
# the stratified split used in the SMOTE section; without it the minority-class
# share of the test set varies with the random draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# Training-split features
X_train

In [None]:
# Test-split features
X_test

In [None]:
# Training-split target labels
y_train

In [None]:
# Test-split target labels
y_test

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

# Class counts of the CURRENT training split (created in "Load and Split Data")
original_counts = Counter(y_train)

# NOTE(review): y_train_resampled was produced by the earlier SMOTE cell from a
# different train/test split. SMOTE is not re-applied to the current split, and
# the models below are fitted on the un-resampled X_train / y_train, so this
# chart does not describe the data the models are actually trained on.
resampled_counts = Counter(y_train_resampled)

# Create a DataFrame for visualization
df = pd.DataFrame({
    'Class': ['No', 'Yes'],
    'Original': [original_counts[0], original_counts[1]],
    'After SMOTE': [resampled_counts[0], resampled_counts[1]]
})

# Plot the bar chart
df.set_index('Class').plot(kind='bar', figsize=(8, 5), color=['skyblue', 'orange'])
plt.title('Class Distribution Before and After SMOTE')
plt.ylabel('Count')
plt.xlabel('Class')
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.legend(title='Dataset')
plt.show()


## **Feature Scaling**

In [None]:
# Initialize StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training split only, then apply the same transform to
# the test split. Both names are rebound to NumPy arrays, so running this cell
# twice would re-scale already-scaled data.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


## **Model 1 - Random Forest Classifier**

In [None]:
# Initialize Random Forest Classifier (100 trees, seeded for reproducibility)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the scaled training split
rf_model.fit(X_train, y_train)

# Predictions on the held-out test split
rf_predictions = rf_model.predict(X_test)

# Evaluate the model: overall accuracy plus the per-class classification report
print("Random Forest Model Accuracy:", accuracy_score(y_test, rf_predictions))
print("Random Forest Classification Report:\n", classification_report(y_test, rf_predictions))


## **Model 2 - Neural Network (MLPClassifier)**

In [None]:
# Importing necessary libraries
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the MLPClassifier (Neural Network): one hidden layer of 50 units,
# at most 500 training iterations, seeded for reproducibility
nn_model = MLPClassifier(hidden_layer_sizes=(50,), max_iter=500, random_state=42)

# Train the model on the scaled training data
nn_model.fit(X_train, y_train)

# Make predictions on the test set
nn_predictions = nn_model.predict(X_test)

# Evaluate the model by calculating accuracy and classification report
print("Neural Network Model Accuracy:", accuracy_score(y_test, nn_predictions))
print("Neural Network Classification Report:\n", classification_report(y_test, nn_predictions))


# **Optimize the Model**

## **RandomForestClassifier GridSearchCV for hyperparameter optimization**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Initialize the RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)

# Define the parameter grid for hyperparameter optimization.
# 3 * 4 * 3 * 3 * 2 = 216 combinations; with cv=5 that is 1,080 model fits,
# so this cell is expensive to run.
param_grid = {
    'n_estimators': [50, 100, 200],         # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],       # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],       # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],         # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]             # Whether bootstrap samples are used when building trees
}

# Initialize GridSearchCV (5-fold CV, selecting on accuracy, all CPU cores)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid,
                           cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

# Perform the grid search on the training data
grid_search.fit(X_train, y_train)

# Get the best model and parameters. best_rf_model is kept under its own name
# because `grid_search` and `best_params` are rebound by the MLP search below.
best_rf_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# Predictions using the best model
rf_predictions = best_rf_model.predict(X_test)

# Evaluate the model
print("Optimized Random Forest Model Accuracy:", accuracy_score(y_test, rf_predictions))
print("Optimized Random Forest Classification Report:\n", classification_report(y_test, rf_predictions))


## **MLPClassifier GridSearchCV for hyperparameter optimization**

In [None]:
# Importing necessary libraries
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Initialize the MLPClassifier
nn_model = MLPClassifier(max_iter=500, random_state=42)

# Define the parameter grid for hyperparameter optimization.
# 4 * 3 * 3 * 3 * 2 = 216 combinations; with cv=5 that is 1,080 model fits,
# each allowed up to 500 iterations, so this cell is expensive to run.
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],  # Various layer configurations
    'activation': ['relu', 'tanh', 'logistic'],                # Activation functions
    'solver': ['adam', 'sgd', 'lbfgs'],                        # Optimization solvers
    'alpha': [0.0001, 0.001, 0.01],                            # L2 regularization parameter
    'learning_rate': ['constant', 'adaptive'],                 # Learning rate schedule
}

# Initialize GridSearchCV. This rebinds `grid_search`, which previously held
# the Random Forest search.
grid_search = GridSearchCV(estimator=nn_model, param_grid=param_grid,
                           cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

# Perform the grid search on the training data
grid_search.fit(X_train, y_train)

# Get the best model and parameters (`best_params` now refers to the MLP search)
best_nn_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# Make predictions using the best model
nn_predictions = best_nn_model.predict(X_test)

# Evaluate the optimized model
print("Optimized Neural Network Model Accuracy:", accuracy_score(y_test, nn_predictions))
print("Optimized Neural Network Classification Report:\n", classification_report(y_test, nn_predictions))


# **Evaluate the Models after Optimization**

## **Evaluation for MLPClassifier**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# For MLPClassifier: use the tuned model saved by the MLP grid-search cell.
# Reading grid_search.best_estimator_ here would silently depend on which
# search ran last, because both searches reuse the name `grid_search`.
mlp_best_model = best_nn_model

# Predict on test data
y_pred_mlp = mlp_best_model.predict(X_test)

# Evaluate the MLP model performance
print("MLP Model Evaluation:")
print("Classification Report:")
print(classification_report(y_test, y_pred_mlp))
print("Confusion Matrix:")
conf_matrix_mlp = confusion_matrix(y_test, y_pred_mlp)
# Print the matrix itself so the "Confusion Matrix:" header is followed by data
print(conf_matrix_mlp)

# Plot confusion matrix for MLP (rows = true labels, columns = predicted labels)
plt.figure(figsize=(6, 5))
sns.heatmap(conf_matrix_mlp, annot=True, fmt="d", cmap="Blues", xticklabels=["Class 0", "Class 1"], yticklabels=["Class 0", "Class 1"])
plt.title("Confusion Matrix - MLPClassifier")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()


## **Evaluation for RandomForestClassifier**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# For RandomForestClassifier: use the tuned model saved by the Random Forest
# grid-search cell. The previous `random_search` name is never defined in this
# notebook (NameError), and `grid_search` now refers to the MLP search, so
# `best_rf_model` is the only handle on the tuned forest.
rf_best_model = best_rf_model

# Predict on test data
y_pred_rf = rf_best_model.predict(X_test)

# Evaluate the RandomForest model performance
print("\nRandomForest Model Evaluation:")
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:")
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
# Print the matrix itself so the "Confusion Matrix:" header is followed by data
print(conf_matrix_rf)

# Plot confusion matrix for RandomForest (rows = true labels, columns = predicted labels)
plt.figure(figsize=(6, 5))
sns.heatmap(conf_matrix_rf, annot=True, fmt="d", cmap="Blues", xticklabels=["Class 0", "Class 1"], yticklabels=["Class 0", "Class 1"])
plt.title("Confusion Matrix - RandomForestClassifier")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()
