# Project 1 — Bank Marketing Prediction (Classification)

**Goal:** Predict whether a client will subscribe to a term deposit using the UCI Bank Marketing dataset.

**Dataset Link:** https://archive.ics.uci.edu/dataset/222/bank+marketing

### Instructions:
- Download `bank-additional-full.csv` and place it in the working directory before running the notebook.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
import joblib

# Apply seaborn's 'whitegrid' style to the plots drawn in the cells below.
sns.set(style='whitegrid')

In [None]:
# Load the raw dataset; fields in this file are separated by ';' rather than ','.
df = pd.read_csv(filepath_or_buffer='bank-additional-full.csv', delimiter=';')

# Report (rows, columns) and preview the first five records.
print('Shape:', df.shape)
df.head(5)

In [None]:
# Column dtypes and non-null counts; info() prints its report directly.
df.info()

# describe() only *returns* a frame. A notebook cell displays just its final
# expression, so this summary has to be printed explicitly or it is lost.
print(df.describe(include='all').T)

# Relative frequency of each target class.
print('\nTarget distribution:\n', df['y'].value_counts(normalize=True))

In [None]:
# Bar chart of how many rows fall into each target class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='y', ax=ax)
ax.set_title('Term deposit subscription (y)')
plt.show()

In [None]:
# Work on a copy so the raw frame `df` stays untouched.
df2 = df.copy()

# Every object-typed column except the target is treated as a categorical feature.
cat_cols = [c for c in df2.select_dtypes(include=['object']).columns if c != 'y']

# Fit a separate encoder per column and keep each one. Refitting one shared
# encoder would retain only the last column's mapping, which makes it
# impossible to encode new data consistently or to invert the codes.
# NOTE(review): integer codes impose an arbitrary order on nominal categories;
# this matters for the linear model trained later (one-hot encoding avoids it).
label_encoders = {}
for c in cat_cols:
    le = LabelEncoder()
    df2[c] = le.fit_transform(df2[c].astype(str))
    label_encoders[c] = le

# Binary target: 'yes' -> 1, 'no' -> 0. `.map` yields NaN for any other label,
# so fail loudly here instead of passing NaNs on to the split/model cells.
df2['y'] = df2['y'].map({'yes': 1, 'no': 0})
if df2['y'].isna().any():
    raise ValueError("Target column 'y' contains values other than 'yes'/'no'")

# NOTE(review): the UCI dataset notes state that `duration` is only known after
# the call and should be dropped for a realistic predictive model — confirm
# whether it should remain among the features.
X = df2.drop('y', axis=1)
y = df2['y']

In [None]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# ratio the same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

In [None]:
# Columns with a numeric dtype are the ones that get standardised.
num_cols = list(X.select_dtypes(include=[np.number]).columns)

# Learn mean/variance from the training rows only, then apply the same
# transformation to both partitions so no test statistics leak into training.
scaler = StandardScaler().fit(X_train[num_cols])
X_train_scaled = scaler.transform(X_train[num_cols])
X_test_scaled = scaler.transform(X_test[num_cols])

# Wrap the scaled arrays in frames that carry the original row labels and
# column names, then write them over the numeric columns of a copy of each
# partition (leaving X_train / X_test themselves unscaled).
X_train_scaled_df = pd.DataFrame(data=X_train_scaled, index=X_train.index, columns=num_cols)
X_train_scaled_full = X_train.copy()
X_train_scaled_full[num_cols] = X_train_scaled_df

X_test_scaled_df = pd.DataFrame(data=X_test_scaled, index=X_test.index, columns=num_cols)
X_test_scaled_full = X_test.copy()
X_test_scaled_full[num_cols] = X_test_scaled_df

In [None]:
# Baseline linear model, trained on the standardised features.
log = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X_train_scaled_full, y_train)

# Hard class predictions plus the predicted probability of class 1,
# which is what ROC-AUC is computed from.
y_pred = log.predict(X_test_scaled_full)
y_proba = log.predict_proba(X_test_scaled_full)[:, 1]

print('Logistic Regression Accuracy:', accuracy_score(y_test, y_pred))
print('ROC-AUC:', roc_auc_score(y_test, y_proba))
print('\nClassification Report:\n', classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the logistic-regression predictions, drawn as an
# annotated heatmap with integer cell labels.
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Logistic')
plt.show()

In [None]:
# Tree ensemble trained on the unscaled features; n_jobs=-1 uses all
# available cores and the fixed seed keeps the forest reproducible.
rf = RandomForestClassifier(
    n_estimators=200,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Hard class predictions plus the predicted probability of class 1.
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

print('Random Forest Accuracy:', accuracy_score(y_test, y_pred_rf))
print('Random Forest ROC-AUC:', roc_auc_score(y_test, y_proba_rf))
print('\nClassification Report:\n', classification_report(y_test, y_pred_rf))

In [None]:
# Pair each importance score with its feature name, order from most to least
# important, and keep the leading 15 for plotting.
fi = pd.Series(data=rf.feature_importances_, index=X.columns)
fi = fi.sort_values(ascending=False).iloc[:15]

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=fi.values, y=fi.index, ax=ax)
ax.set_title('Top Feature Importances - Random Forest')
plt.show()

In [None]:
# Persist the trained artifacts to the working directory.
# The random forest was fitted on the unscaled features, so it is used
# without the scaler at inference time.
joblib.dump(rf, 'rf_bank_model.joblib')

# The scaler belongs to the logistic-regression pipeline, so save that model
# alongside it; a scaler with no matching model cannot be used for prediction.
joblib.dump(log, 'logreg_bank_model.joblib')
joblib.dump(scaler, 'scaler_bank.joblib')

# NOTE: the categorical encoding applied during preprocessing is not persisted
# here; inputs at inference time must be encoded the same way as the training data.
print('Models saved successfully!')