In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Load the dataset
# The file is read as semicolon-separated with double-quoted fields.
try:
  df = pd.read_csv('/content/drive/MyDrive/bank-additional-full.csv', sep=';', quotechar='"', encoding='utf-8', engine='python')
except FileNotFoundError:
  # Every later cell needs `df`. Re-raise so the run stops here with the real
  # cause instead of failing further down with a NameError.
  print("CSV not found!")
  raise

print("Dataset loaded successfully. Here's the head of the dataframe:")
print(df.head())

In [None]:
# List the column names of the loaded frame.
df.columns

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Per-column dtype and non-null count.
df.info()

In [None]:
# Summary statistics (numeric columns by default).
df.describe()

# Handle Duplicates

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Keep the first occurrence of each duplicated row and discard the repeats.
df = df.drop_duplicates()

In [None]:
# Sanity check after dropping duplicates: this should now be 0.
df.duplicated().sum()

# Analyze the target column

In [None]:
# Peek at the raw target column.
df['y']

In [None]:
# Class balance of the target: absolute counts first
target_column = df['y']
print(target_column.value_counts())

# ...then the same distribution expressed in percent
print('\nTarget variable distribution (%): ')
print(target_column.value_counts(normalize=True) * 100)

# EDA - Categorical Features

In [None]:
# Names of the text-valued (object dtype) columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Print the frequency table of every categorical column, one after another
for col in categorical_cols:
  frequencies = df[col].value_counts()
  print(f"\n--Analysis of Column: {col}---")
  print(frequencies)

# Get a quick summary of all columns that contain "unknown" and see what percentage of each column is affected.

In [None]:
# Per column: how many cells hold the literal string 'unknown',
# and what share of all rows that represents
unknown_counts = df.eq('unknown').sum()
unknown_percentages = 100 * unknown_counts / len(df)

# Put both measures side by side and list the most affected columns first
unknown_summary = pd.DataFrame({'Counts': unknown_counts, 'Percentages': unknown_percentages})
print(unknown_summary.sort_values(by='Percentages', ascending=False))

-> Since 'unknown' makes up only a negligible share of 'job' and 'marital', we can simply drop those rows.

-> 'housing' and 'loan', on the other hand, have exactly the same number of unknowns, so the unknowns may sit in the same rows. If they do, we can treat 'unknown' as a separate category.

# Verify 'housing' and 'loan' suspicious columns

In [None]:
# Count the rows where 'housing' AND 'loan' are both 'unknown'. If this equals
# each column's own 'unknown' count, the unknowns come from the same rows.
both_unknown = (df['housing'] == 'unknown') & (df['loan'] == 'unknown')
matching_unknowns = int(both_unknown.sum())
# The original message omitted the word "unknown", which misdescribed the count.
print(f"Number of rows with both 'housing' and 'loan' unknown: {matching_unknowns}")

Since this prints 990, the same number of unknowns each column has on its own, it confirms they are the same people. This reinforces the idea that "unknown" is a specific group (e.g., "declined to answer personal finance questions") and should be kept as its own category.

Drop 'default' because it carries almost no information: only 3 people have a 'yes' value and a large share are 'unknown'.

In [None]:
# Remove the 'default' column from the frame.
df = df.drop(columns=['default'])

In [None]:
# Confirm 'default' is no longer among the columns.
df.columns

Drop the rows where 'job' or 'marital' is 'unknown'

In [None]:
# Keep only the rows whose 'job' and 'marital' are both known.
df = df[df['job'].ne('unknown') & df['marital'].ne('unknown')]

Impute the 'unknown' education values using each person's job. Group: the next cell first groups all the people by their job (e.g., all "blue-collar" together, all "management" together).

Find Mode: It then finds the most common education (the mode) within each job group.

For the "management" group, the mode might be university.degree.

For the "blue-collar" group, the mode might be basic.9y.

Fill: It fills the "unknown" education values only with the mode from their corresponding job group.

In [None]:
# Treat the 'unknown' education label as missing so it can be imputed.
df['education'] = df['education'].replace('unknown', np.nan)

# Most common education overall. Used only for a job group that has no known
# education at all (its own mode is empty). If the whole column were missing,
# fall back to the original 'unknown' label.
_overall_education_modes = df['education'].mode()
_education_fallback = _overall_education_modes.iloc[0] if not _overall_education_modes.empty else 'unknown'


def _fill_with_group_mode(values):
    """Fill the missing entries of one job group's education with that group's mode.

    `values` is the 'education' Series of a single job group. `Series.mode()`
    ignores NaN and returns an empty Series when every entry is missing; the
    original `x.mode()[0]` raised in that case, so the overall fallback is used.
    """
    group_modes = values.mode()
    fill_value = group_modes.iloc[0] if not group_modes.empty else _education_fallback
    return values.fillna(fill_value)


df['education'] = df.groupby('job')['education'].transform(_fill_with_group_mode)

# EDA - Numerical Features

In [None]:
# Get the names of all numerical columns
# (only int64/float64 dtypes are selected; the result is a pandas Index of column names)
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns

In [None]:
print("--- Displaying Histograms for Numerical Columns ---")

# One standalone 30-bin histogram per numerical column
for col in numerical_columns:
  fig, ax = plt.subplots(figsize=(8, 5))
  df[col].hist(bins=30, ax=ax)
  ax.set_title(f"Histogram of {col}")
  ax.set_xlabel(col)
  ax.set_ylabel("Frequency")
  fig.tight_layout()
  plt.show()

In [None]:
print("\n--- Displaying Box Plots for Outlier Analysis ---")
# One wide, short horizontal box plot per numerical column to make outliers visible
for col in numerical_columns:
    fig, ax = plt.subplots(figsize=(10, 3))
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f"Box Plot of {col}")
    plt.show()

In [None]:
# Binary flag: 0 where pdays holds the 999 "never contacted" sentinel, 1 otherwise
df['previously_contacted'] = df['pdays'].ne(999).astype('int64')
# With the flag captured, swap the 999 sentinel for -1
df['pdays'] = df['pdays'].replace(999, -1)
df['pdays']

In [None]:
# Remove 'duration' to avoid target leakage:
# the length of a call is only known after the call has been made
df = df.drop(columns=['duration'])

In [None]:
# Confirm 'duration' is gone and 'previously_contacted' has been added.
df.columns

# Cap 'campaign' column Outliers (Winsorization)

In [None]:
# Winsorize 'campaign': values above the 99th percentile are capped at that percentile.
upper_limit = df['campaign'].quantile(0.99)
print(f"Capping 'campaign' at: {upper_limit} calls")

# Apply the cap with the vectorized Series.clip instead of a per-row Python lambda
df['campaign'] = df['campaign'].clip(upper=upper_limit)


# Encode Categoricals (One-Hot Encoding)

In [None]:
# This converts all text columns into numeric 1s and 0s
# drop_first=True helps prevent "multicollinearity" (a technical issue where columns duplicate info)
# The target 'y' is a text column too, so it is encoded here as well; the
# split cell further down reads it back as 'y_yes'.

df_encoded = pd.get_dummies(df, drop_first=True)

In [None]:
# Display the encoded frame.
df_encoded

In [None]:
# Report the final shape of the encoded frame and show its first rows.
print("Data Cleaning Complete!")
print(f"New Data Shape: {df_encoded.shape}")
df_encoded.head()

# Split your Data

In [None]:
# 1. Separate the target ('y_yes', produced by the one-hot encoding) from the features
y = df_encoded['y_yes']
X = df_encoded.drop(columns='y_yes')

# 2. Hold out 20% of the rows for testing.
#    random_state=42 makes the split reproducible; stratify=y keeps the class
#    proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Shapes of the four pieces produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Scale the data (Standardization) i.e., Z-score scaling

In [None]:
# Learn the mean/std from the training features only, then apply that same
# transform to both splits so no test-set statistics enter the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data Split and Scaled!")
train_shape, test_shape = X_train_scaled.shape, X_test_scaled.shape
print(f"Training Shape: {train_shape}")
print(f"Testing Shape: {test_shape}")

In [None]:
# Display the scaled train and test arrays.
X_train_scaled, X_test_scaled

# Train Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# DecisionTreeClassifier is not used in this cell; the decision-tree cell below relies on this import.
from sklearn.tree import DecisionTreeClassifier

# Baseline model: logistic regression fit on the standardized training features.
# class_weight='balanced' weights the classes to counter the imbalance in y;
# random_state=42 fixes the seed.
log_model = LogisticRegression(class_weight='balanced', random_state=42)
log_model.fit(X_train_scaled, y_train)

In [None]:
# Predict on the held-out test features, scaled with the train-fitted scaler.
y_pred_log = log_model.predict(X_test_scaled)

**Baseline Model:** Logistic Regression

**Strategy:** We started with Logistic Regression to establish a performance baseline. We used class_weight='balanced' to address the severe class imbalance (89% "No" vs. 11% "Yes").

**Result:** The model achieved a Recall of 63% but a low Precision of 36%.

**Analysis:** This model acts like an aggressive sales manager. It successfully identifies the majority of potential buyers (High Recall) but creates significant waste by flagging many uninterested people as leads (Low Precision). While it captures revenue, the high cost of wasted calls makes it inefficient for scaling.

In [None]:
# Per-class precision, recall and F1 of the logistic regression on the test set.
print("Logistic Regression Results:")
print(classification_report(y_test, y_pred_log))

In [None]:
# --- MODEL 2: Decision Tree ( The Non-Linear Model ) ---
# No depth limit is set here; same class weighting and seed as the baseline.
tree_model = DecisionTreeClassifier(class_weight='balanced', random_state=42)
tree_model.fit(X_train_scaled, y_train) # Note: Trees work okay with scaled data too

**Challenger Model:** Decision Tree

**Strategy:** We tested a single Decision Tree to see if a non-linear model could capture complex patterns that Logistic Regression missed.

**Result:** The model failed, dropping to 33% Recall and 30% Precision.

**Analysis:** The single tree suffered from overfitting. It likely memorized noise in the training data rather than learning true customer signals. It missed 67% of potential buyers, making it arguably worse than a random guess for business growth. We effectively "fired" this model.

In [None]:
# Predict with the decision tree on the scaled test features and report
# per-class precision, recall and F1.
y_pred_tree = tree_model.predict(X_test_scaled)
print("Decision Tree Results:")
print(classification_report(y_test, y_pred_tree))

***Why Random Forest?***

**The Problem:** A single tree is unstable and prone to errors (high variance).

**The Fix:** Random Forest builds 100+ trees, each looking at a different slice of the data. It aggregates their votes ("Wisdom of the Crowd") to make a final prediction.

**Goal:** We aim to maintain the Recall of Logistic Regression (finding the buyers) while significantly boosting Precision (reducing wasted calls). This balance is critical for maximizing ROI and team efficiency.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees, each limited to depth 10, with the same class
# weighting and seed as the earlier models; fit on the scaled training features.
rf_model = RandomForestClassifier(n_estimators=100,
                                  max_depth=10,
                                  class_weight='balanced',
                                  random_state=42)
rf_model.fit(X_train_scaled, y_train)

***Champion Model: Random Forest***

**Strategy:** We deployed a Random Forest with 100 trees and class_weight='balanced' to stabilize predictions and improve precision over the baseline.

**Result:** The model achieved a Recall of 61% and a Precision of 40%, with an overall Accuracy of 85%.

**Analysis:** This model represents the best trade-off between "Revenue Capture" and "Operational Cost." It kept recall close to that of Logistic Regression (61% vs. 63%, still finding most buyers) while improving precision by 4 percentage points (36% → 40%). This reduction in False Positives means the sales team wastes less time on uninterested leads compared to previous models.

***Performance Analysis: The "Precision Ceiling"***

**Question:** Why is the Precision capped at ~40% (and not 90%)?

**Answer:** Predicting human behavior in a cold-call scenario has an inherent "theoretical ceiling." We achieved 40% precision, which is industry-standard for this type of dataset. A 90% precision is statistically impossible with the current data features for two reasons:

**Data Limitations:** Our dataset contains only high-level demographic and campaign data (Age, Job, Previous Calls). We lack critical decision-making signals such as the client's real-time income, current debts, or immediate financial needs.

**Human Variance:** Buying decisions are often driven by emotional or external factors (e.g., "I'm in a bad mood" or "I just bought a car") that are invisible to the model.

**Conclusion:** The model has successfully optimized the controllable factors. To break the 50% barrier, we would need external third-party data enrichment (e.g., credit scores or spending habits).

In [None]:
# Predict with the random forest on the scaled test features and report
# per-class precision, recall and F1.
y_pred_rf = rf_model.predict(X_test_scaled)
print("Random Forest Results:")
print(classification_report(y_test, y_pred_rf))

***Business Intelligence: Drivers of Sales***

**Objective:** Identify the primary factors that influence a customer's decision to subscribe to a term deposit.

**Key Insight: Macroeconomics > Demographics.** Our feature-importance analysis revealed that market conditions are significantly stronger predictors of sales than customer identity.

**Top Predictor:** Interest Rates (euribor3m): The single biggest driver of sales is the national interest rate. Customers are highly sensitive to financial returns; when rates are favorable, conversion skyrockets regardless of the customer's age or job.

**Secondary Predictor:** Economic Health (nr.employed): The number of employed citizens is a strong proxy for consumer confidence. Customers buy when they feel the economy is safe.

**Top Behavioral Predictor:** Persistence (pdays): Among controllable factors, "Previous Contact" is the strongest driver. Retargeting warm leads yields significantly higher ROI than cold-calling new leads.

**Strategic Recommendation: Shift marketing budget allocation from "Demographic Targeting" (calling specific ages) to "Dynamic Scheduling" (calling aggressively when interest rates are favorable).**

In [None]:
# Get feature importances from the fitted random forest
importances = rf_model.feature_importances_

# Create a Dataframe for visualization.
# The forest was fit on the scaled array built from X_train, so the importances
# follow X_train's column order.
feature_importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
})
# Sort by importance and keep the ten largest
feature_importances = feature_importances.sort_values(by='Importance', ascending=False).head(10)

# Plot
plt.figure(figsize=(10, 6))
# seaborn >= 0.13 deprecates `palette` without `hue` (FutureWarning). Mapping hue
# to the y variable and hiding the legend keeps the same per-bar colouring.
# NOTE(review): `legend=False` needs seaborn >= 0.13 — confirm the environment's version.
sns.barplot(x='Importance', y='Feature', hue='Feature', data=feature_importances, palette='viridis', legend=False)
plt.title('Top 10 Factors Driving Customer Decisions')
plt.xlabel('Importance Score')
plt.show()