In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv('data/train.csv')
df.dtypes

id            int64
age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y             int64
dtype: object

In [None]:
df.isnull().sum()
df.dtypes
df['job'].value_counts(normalize=True)
# df.corr(method='spearman', numeric_only=True)

In [None]:
corr_matrix = df.corr(numeric_only=True)
# sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
# plt.show()

# print(corr_matrix)

sns.pairplot(df, diag_kind="kde")
plt.show()

##### Note
Calculates both pearson and spearman correlation between all the columns -> 'duration' have the most correlation with the target value(y) (the value should be close to -1 or 1)
Now find plot a graph between duration and y to find outlier

In [None]:
sns.countplot(x='duration', data=df)
plt.show()

In [None]:
Q1 = df['duration'].quantile(0.25)  # 25th percentile
Q3 = df['duration'].quantile(0.75)  # 75th percentile
IQR = Q3 - Q1

print("Q1:", Q1, "Q3:", Q3, "IQR:", IQR)

In [None]:
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print("Lower Bound:", lower_bound, "Upper Bound:", upper_bound)

In [None]:
df = df[(df['duration'] >= lower_bound) & (df['duration'] <= upper_bound)]

##### Finding more outlier if they exist

In [None]:
sns.scatterplot(x='age', y='balance', data=df)
plt.show()

In [None]:
# Histogram with outliers
sns.histplot(data=df, x='balance', kde=True)
plt.title('Distribution with Outliers')
plt.show()

# Violin plot (combines box plot and density)
# sns.violinplot(data=df, y='balance')
# plt.show()

In [None]:
sns.pairplot(df, diag_kind='hist')
plt.suptitle('Pair Plot for Outlier Detection', y=1.02)
plt.show()

In [None]:
sns.boxplot(x=df.balance, data=df)
# sns.boxplot(x=df.balance)
plt.title('Box Plot to Detect Outliers')
plt.show()


In [None]:
def find_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return outliers, lower_bound, upper_bound

# Find outliers
outliers, lower, upper = find_outliers_iqr(df, 'balance')

# Visualize with highlighted outliers
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x=df.index, y='balance', alpha=0.6)
sns.scatterplot(data=outliers, x=outliers.index, y='balance', color='red', s=100)
plt.axhline(y=lower, color='r', linestyle='--', alpha=0.7, label=f'Lower bound: {lower:.2f}')
plt.axhline(y=upper, color='r', linestyle='--', alpha=0.7, label=f'Upper bound: {upper:.2f}')
plt.legend()
plt.title('Outliers Detection using IQR Method')
plt.show()

In [None]:
from scipy import stats

def find_outliers_zscore(df, column, threshold=3):
    z_scores = np.abs(stats.zscore(df[column]))
    outliers = df[z_scores > threshold]
    return outliers

# Find and visualize outliers
outliers = find_outliers_zscore(df, 'balance')

plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x=df.index, y='balance', alpha=0.6)
sns.scatterplot(data=outliers, x=outliers.index, y='balance', color='red', s=100)
plt.title('Outliers Detection using Z-Score Method')
plt.show()

##### Preprocessing

In [3]:
x = df.iloc[:,:-1]
y = df.iloc[:,-1].values

In [4]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

categorical_cols = x.select_dtypes(include=['object']).columns.tolist()

ct= ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), categorical_cols)
    ],
    remainder='passthrough'
)

scaler = StandardScaler()

In [5]:
x_encoded = ct.fit_transform(x)
x_scaled = scaler.fit_transform(x_encoded)

In [6]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.15, random_state=42)

### Logistic Regression model

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

from sklearn.metrics import roc_auc_score
roc_auc = roc_auc_score(y_test, y_pred)
print("Logistic regression model score = ", roc_auc)

### Random Forest classifier model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
rf_model = RandomForestClassifier(n_estimators=300,
    max_depth=None,  # or try specific values like 10, 20
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=True,
    random_state=42)
rf_model.fit(x_train, y_train)
rf_y_pred = rf_model.predict(x_test)
rf_y_pred_prob = rf_model.predict_proba(x_test)[:, 1]

acc = accuracy_score(y_test, rf_y_pred)
print(f"Random Forest accuracy score: {acc}")

roc_auc = roc_auc_score(y_test, rf_y_pred_prob)
print(f"Random Forest ROC AUC: {roc_auc}")



### Random forest with gradient boost

In [None]:
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
estimators = [
    ('rf', RandomForestClassifier(n_estimators=200)),
    ('gb', GradientBoostingClassifier(n_estimators=200))
]
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression()
)
stack.fit(x_train, y_train)
stack_y_pred = stack.predict(x_test)
stack_y_pred_prob = stack.predict_proba(x_test)[:, 1]
stack_acc = accuracy_score(y_test, stack_y_pred)
stack_roc_auc = roc_auc_score(y_test, stack_y_pred_prob)
print(f"Stacking Classifier accuracy score: {stack_acc}")
print(f"Stacking Classifier ROC AUC: {stack_roc_auc}")

### Gradient boosting classifier

In [7]:
from sklearn.ensemble import GradientBoostingClassifier
gb_model = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    random_state=42
)
gb_model.fit(x_train, y_train)

gb_y_pred_prob = gb_model.predict_proba(x_test)[:, 1]
gb_auc = roc_auc_score(y_test, gb_y_pred_prob)
print(f"Gradient Boosting ROC AUC: {gb_auc:.4f}")

NameError: name 'roc_auc_score' is not defined

### RF + LigthGBM ensemble

In [None]:
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

# --- Base models ---
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)

lgb_model = lgb.LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

# --- Stacking ensemble ---
stack_model = StackingClassifier(
    estimators=[
        ('rf', rf_model),
        ('lgb', lgb_model)
    ],
    final_estimator=LogisticRegression(max_iter=200),
    stack_method='predict_proba',
    n_jobs=-1
)

# --- Train ---
stack_model.fit(x_train, y_train)

# --- Predict probabilities for ROC AUC ---
stack_pred_prob = stack_model.predict_proba(x_test)[:, 1]
auc = roc_auc_score(y_test, stack_pred_prob)

print(f"Hybrid RF + LightGBM ROC AUC: {auc:.4f}")


### Naive bayes classifier

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
nb_model = GaussianNB()
nb_model.fit(x_train, y_train)
nb_y_pred = nb_model.predict(x_test)
nb_acc = roc_auc_score(y_test, nb_y_pred)
print(f"Naive Bayes ROC AUC: {nb_acc}")

nb_pred_prob = nb_model.predict_proba(x_test)[:, 1]
nb_acc = roc_auc_score(y_test, nb_pred_prob)
print(f"Naive Bayes ROC AUC: {nb_acc:.4f}")

### SVM

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_auc_score

svm_model = LinearSVC(max_iter=1000, tol=1e-3)

svm_model.fit(x_train, y_train)
svm_y_pred = svm_model.predict(x_test)
svm_acc = roc_auc_score(y_test, svm_y_pred)
print(f"SVM ROC AUC: {svm_acc}")

svm_pred_prob = svm_model.decision_function(x_test)
svm_auc = roc_auc_score(y_test, svm_pred_prob)
print(f"SVM ROC AUC: {svm_auc:.4f}")

#### SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier
sgdc_model = SGDClassifier(loss='hinge', max_iter=1000)
sgdc_model.fit(x_train, y_train)

sgdc_y_pred = sgdc_model.predict(x_test)

sgdc_acc = roc_auc_score(y_test, sgdc_y_pred)
print(f"SGD Classifier ROC AUC: {sgdc_acc}")

### LightGBM classifier

In [None]:
import lightgbm as lgb

x_train_lgb = lgb.Dataset(x_train, label=y_train)
x_test_lgb = lgb.Dataset(x_test, label=y_test)

params = {
    'objective': 'binary',
    'metric': 'auc',  # or 'binary_logloss', 'auc', etc.
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

lgb_model = lgb.train(params, x_train_lgb, num_boost_round=1000, valid_sets=[x_train_lgb, x_test_lgb])

y_pred_lgb = lgb_model.predict(x_test, num_iteration=lgb_model.best_iteration)
# y_pred_class = [1 if p >= 0.5 else 0 for p in y_pred_lgb]

lgb_auc = roc_auc_score(y_test, y_pred_lgb)
print(f"LightGBM ROC AUC: {lgb_auc}")

In [None]:
import lightgbm as lgb
lgb_model = lgb.LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    random_state=42
)
lgb_model.fit(x_train, y_train)
lgb_pred_prob = lgb_model.predict_proba(x_test)[:, 1]
print("LightGBM ROC AUC:", roc_auc_score(y_test, lgb_pred_prob))