
### 4. Model Selection
- **Train multiple models:**  
- **Compare performance:** 

### 5. Hyperparameter Tuning
- **Grid Search or Random Search:** 
- **Cross-validation:** 

### 6. Evaluation
- **Final evaluation:** 
- **Confusion matrix:** 

### 7. Deployment
- **Streamlit:** 
- **User interface:** 


In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib
import streamlit as st




import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Section 1: Data Loading and Missing Value Checks

In [2]:
def load_data(file_path):
    """Read the semicolon-delimited CSV at ``file_path`` into a DataFrame."""
    frame = pd.read_csv(file_path, delimiter=';')
    return frame

def check_missing_values(data):
    """Count the 'unknown' placeholders in each column and print them.

    Returns a tuple ``(data, missing_values)``: the input frame unchanged and
    a Series of counts restricted to columns with at least one 'unknown'.
    """
    unknown_counts = data.isin(['unknown']).sum()
    missing_values = unknown_counts.loc[unknown_counts > 0]
    print("Missing Values:", missing_values, sep="\n")
    return data, missing_values

def calculate_missing_percentage(data_missing):
    """Express the per-column missing counts as a percentage of the row count.

    ``data_missing`` is the ``(data, missing_values)`` tuple returned by
    ``check_missing_values``. The percentages are printed, and the tuple
    ``(data, missing_values, missing_percentage)`` is returned.
    """
    data, missing_values = data_missing
    fraction = missing_values / len(data)
    missing_percentage = fraction * 100
    print("\nPercentage of Missing Values:", missing_percentage, sep="\n")
    return data, missing_values, missing_percentage

def visualize_missing_values(data_missing_percentage):
    """Draw a bar chart of the missing-value count per column.

    ``data_missing_percentage`` is the ``(data, missing_values,
    missing_percentage)`` tuple from ``calculate_missing_percentage``; only
    the counts are plotted. Returns ``data`` unchanged.
    """
    data, missing_values, _ = data_missing_percentage
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=missing_values.index, y=missing_values.values, ax=ax)
    ax.set_title('Count of Missing Values by Column')
    ax.set_ylabel('Count of Missing Values')
    ax.set_xlabel('Columns')
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()
    return data

# Create pipeline
# Each step is a plain function wrapped in a FunctionTransformer. The steps
# pass their results forward as tuples: (data, missing_values), then
# (data, missing_values, missing_percentage), and finally just `data`.
pipeline = Pipeline([
    # The lambda ignores its argument (fit_transform is called with None below) and loads the CSV.
    ('load_data', FunctionTransformer(lambda _: load_data('dataset/bank-additional-full.csv'), validate=False)),
    ('check_missing_values', FunctionTransformer(check_missing_values, validate=False)),
    ('calculate_missing_percentage', FunctionTransformer(calculate_missing_percentage, validate=False)),
    ('visualize_missing_values', FunctionTransformer(visualize_missing_values, validate=False))
])

# `data` ends up bound to the DataFrame returned by the last step.
data = pipeline.fit_transform(None)

Missing Values:
job           330
marital        80
education    1731
default      8597
housing       990
loan          990
dtype: int64

Percentage of Missing Values:
job           0.801204
marital       0.194231
education     4.202680
default      20.872584
housing       2.403613
loan          2.403613
dtype: float64


  plt.show()


# Section 1.1: Missing Value Analysis

In [3]:
def drop_duration(data):
    """Return ``data`` without the 'duration' column.

    'duration' is removed because it is not known before a call is performed.
    """
    return data.drop(columns=['duration'])

def convert_target(data):
    """Return a copy of ``data`` whose 'y' column is mapped 'yes' -> 1, 'no' -> 0.

    The input frame is left untouched, so the caller's DataFrame is not
    silently modified. Values other than 'yes'/'no' become NaN, which is how
    ``Series.map`` treats keys missing from the mapping.
    """
    converted = data.copy()
    converted['y'] = converted['y'].map({'yes': 1, 'no': 0})
    return converted

def analyze_missing_values(data, missing_values):
    """Compare the 'yes' rate of rows with and without 'unknown', per column.

    For every column named in ``missing_values.index`` this prints the share
    of ``y`` among rows where the column is 'unknown' and among the other
    rows, along with their difference in percentage points. It then draws one
    bar chart per column.

    Parameters
    ----------
    data : DataFrame holding the listed columns and a 'y' column. 'y' is
        averaged, so it is assumed to be numeric already (0/1, as produced by
        ``convert_target``).
    missing_values : Series whose index lists the columns to analyze.

    Returns
    -------
    ``data`` unchanged, so the function can be used as a pipeline step.
    """
    columns = list(missing_values.index)
    if not columns:
        # Nothing to plot; a zero-row subplot grid cannot be created.
        print("No columns with missing values to analyze.")
        return data

    # squeeze=False keeps `axes` a 2-D array even when there is a single
    # column; otherwise a bare Axes is returned and cannot be iterated.
    fig, axes = plt.subplots(nrows=len(columns), ncols=1,
                             figsize=(12, 8 * len(columns)), squeeze=False)

    for ax, col in zip(axes[:, 0], columns):
        missing_mask = data[col] == 'unknown'
        total_missing = missing_mask.sum()
        total_non_missing = len(data) - total_missing

        # Mean of the target within each group; NaN if a group is empty,
        # instead of a KeyError when a column has no 'unknown' rows.
        yes_missing = data.loc[missing_mask, 'y'].mean()
        yes_non_missing = data.loc[~missing_mask, 'y'].mean()

        print(f"{col} (missing: {total_missing})")
        print("Proportion of 'yes' for missing vs non-missing:")
        print(f"Missing: {yes_missing:.2%}")
        print(f"Non-missing: {yes_non_missing:.2%}")

        # Difference expressed in percentage points
        diff_percentage = (yes_missing - yes_non_missing) * 100
        print(f"Difference in proportion: {diff_percentage:.2f}%")
        print()

        sns.barplot(x=[f'Missing ({total_missing})', f'Non-missing ({total_non_missing})'],
                    y=[yes_missing, yes_non_missing], ax=ax)
        ax.set_title(f'Proportion of "yes" for {col}')
        ax.set_ylabel('Proportion of "yes"')
        ax.set_xlabel(col)
        ax.set_ylim(0, 1)

    plt.tight_layout()
    plt.show()
    return data

# Extract missing values
# Recomputes (and re-prints) the per-column 'unknown' counts so they can be
# passed to analyze_missing_values below; the returned frame is discarded.
_, missing_values = check_missing_values(data)

# Create a pipeline
# Steps run in order: drop 'duration', map the target to 0/1, then print and
# plot the 'yes' rate for unknown vs. known rows of each affected column.
pipeline = Pipeline([
    ('drop_duration', FunctionTransformer(drop_duration)),
    ('convert_target', FunctionTransformer(convert_target)),
    ('analyze_missing_values', FunctionTransformer(analyze_missing_values, kw_args={'missing_values': missing_values}))
])

# NOTE(review): this rebinds `data`, so re-running the cell on its own starts
# from the already-processed frame (where 'duration' has been dropped) —
# re-run the loading cell first.
data = pipeline.fit_transform(data)

Missing Values:
job           330
marital        80
education    1731
default      8597
housing       990
loan          990
dtype: int64
job (missing: 330)
Proportion of 'yes' for missing vs non-missing:
Missing: 11.21%
Non-missing: 11.27%
Difference in proportion: -0.05%

marital (missing: 80)
Proportion of 'yes' for missing vs non-missing:
Missing: 15.00%
Non-missing: 11.26%
Difference in proportion: 3.74%

education (missing: 1731)
Proportion of 'yes' for missing vs non-missing:
Missing: 14.50%
Non-missing: 11.12%
Difference in proportion: 3.38%

default (missing: 8597)
Proportion of 'yes' for missing vs non-missing:
Missing: 5.15%
Non-missing: 12.88%
Difference in proportion: -7.72%

housing (missing: 990)
Proportion of 'yes' for missing vs non-missing:
Missing: 10.81%
Non-missing: 11.28%
Difference in proportion: -0.47%

loan (missing: 990)
Proportion of 'yes' for missing vs non-missing:
Missing: 10.81%
Non-missing: 11.28%
Difference in proportion: -0.47%



  plt.show()



| **Column**   | **Missing Count** | **Difference in Proportion of 'Yes'** | **Decision**                                                                                                                                          |
|---------------|-------------------|---------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------|
| **job**       | 39               | -0.70%                                | The small difference suggests minimal bias. Imputed missing values with the mode (`most frequent value`).                                                  |
| **marital**   | 11               | -1.86%                                | The difference is small but notable. Imputed missing values with the mode, as the missing rate and impact are low.                                         |
| **education** | 167              | +4.81%                                | Missing values appear to show higher success rates. "unknown" will be encoded as a separate category to capture this distinction.                      |
| **default**   | 803              | -6.02%                                | The larger difference and high missing rate suggest that encoding "unknown" as a separate category might better capture its relationship to the target.   |
| **housing**   | 105              | -2.44%                                | The moderate difference suggests some potential bias. However, after further testing with the full dataset (where the difference is -0.47%), this bias appears to be an artifact of the random 10% sample rather than an actual effect of the class. Imputed missing values with the mode. |
| **loan**      | 105              | -2.44%                                | Same situation as **housing**, with a full-dataset difference of -0.47%; imputed with the mode. *The similarity with housing has been noted for future reference.* |




# Final Encoding Decisions and Reasons

| **Variable**     | **Encoding Decision**     | **Reason**                                                                 |
|------------------|---------------------------|---------------------------------------------------------------------------|
| **job**          | One-hot encoding           | "job" is a nominal categorical variable with distinct types; one-hot encoding prevents introducing any ordinal relationships. |
| **marital**      | One-hot encoding           | "marital" is a categorical variable with distinct values and no natural order. One-hot encoding handles it appropriately. |
| **education**    | One-hot encoding           | "education" includes "unknown" as its own category; one-hot encoding treats each category (including `unknown`) as a separate binary column. |
| **default**      | One-hot encoding           | "default" is a binary categorical variable with "no" and "yes" values and additionally includes "unknown" as its own category, so one-hot encoding is appropriate. |
| **housing**      | One-hot encoding           | "housing" is a binary categorical variable with "no" and "yes" values, and one-hot encoding works well here. |
| **loan**         | One-hot encoding           | "loan" is also binary, with "no" and "yes" values, and one-hot encoding is suitable. |
| **contact**      | One-hot encoding           | "contact" is a nominal categorical variable with distinct types ("cellular", "telephone"), making one-hot encoding ideal. |
| **month**        | Cyclical encoding (sin/cos) | "month" is cyclical (e.g., January follows December), so sine and cosine transformations capture its cyclical nature. The complete lack of January and February, alongside the rarity of December, has been noted.  |
| **day_of_week**  | One-hot encoding           | "day_of_week" is a nominal categorical variable with distinct values (days of the week) and no inherent order, so one-hot encoding is suitable. The lack of inherent order is due to the nature of this variable relating to many different weeks as opposed to a one week period. |
| **poutcome**     | One-hot encoding           | "poutcome" is categorical with distinct values (e.g., "failure", "success"), and one-hot encoding captures the separate categories. |



In [4]:
def encode_month_sin_cos(X):
    """Encode the 'month' column as a (sin, cos) pair on a 12-month cycle.

    Parameters
    ----------
    X : DataFrame with a 'month' column of lowercase three-letter
        abbreviations ('jan' ... 'dec').

    Returns
    -------
    DataFrame with columns 'month_sin' and 'month_cos', indexed like ``X``.

    All twelve months are mapped. 'jan' and 'feb' were previously absent from
    the map, so they fell through to the fallback value 0, which has the same
    sin/cos encoding as December. Values that are still unrecognised keep
    that fallback (sin=0, cos=1).
    """
    month_map = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
                 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
    months = X["month"].map(month_map).fillna(0)  # Unmapped values fall back to 0
    month_sin = np.sin(2 * np.pi * months / 12)
    month_cos = np.cos(2 * np.pi * months / 12)
    return pd.DataFrame({"month_sin": month_sin, "month_cos": month_cos})

def month_feature_names_out(self, input_features):
    """Name the two columns produced by ``encode_month_sin_cos``.

    Intended for ``FunctionTransformer(feature_names_out=...)``; both
    arguments are ignored.
    """
    return [f"month_{suffix}" for suffix in ("sin", "cos")]

In [5]:
# Columns whose 'unknown' entries are replaced by the most frequent value.
categorical_impute_cols = ['job', 'marital', 'housing', 'loan']
# Columns where 'unknown' is kept as its own category.
# NOTE(review): not referenced in the visible cells — confirm whether it is still needed.
categorical_pass_cols = ['education', 'default']

# Columns that are one-hot encoded by the preprocessor.
onehot_cols = ['job', 'marital', 'default', 'housing', 'loan', 'contact', 
               'day_of_week', 'poutcome', 'education']
# Columns that are standardised by the preprocessor.
numerical_cols = ['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 
                  'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

# TODO (later): 'pdays' uses 999 as a placeholder for "never contacted".
# Treating it as a real number of days may hurt the model; if so, handle it
# explicitly. For now the model is tested without any special handling.

In [6]:
# Building blocks of the preprocessing step.
mode_imputer = SimpleImputer(strategy='most_frequent', missing_values='unknown')
month_transformer = FunctionTransformer(encode_month_sin_cos, validate=False, feature_names_out=month_feature_names_out)
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler()

# Impute 'unknown' with the column mode, directly on `data`.
# NOTE(review): this imputer is fitted and applied outside `preprocessor`, so
# it is not part of the object that is persisted later. Inputs that still
# contain 'unknown' in these columns would reach the one-hot encoder
# unimputed. The cell also overwrites `data` in place.
data[categorical_impute_cols] = mode_imputer.fit_transform(data[categorical_impute_cols])

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('month_encoding', month_transformer, ['month']), 
        ('onehot', onehot_encoder, onehot_cols),
        ('scaling', scaler, numerical_cols)
    ],
    remainder='passthrough',  # Passes the remaining columns through unchanged; this keeps the target, which appears as 'remainder__y' in the output feature names
)

# NOTE(review): the imputer, encoder and scaler are all fitted on the full
# dataset, before the train/test split done further down, so test rows
# influence the fitted statistics.
transformed_data = preprocessor.fit_transform(data)
transformed_df = pd.DataFrame(transformed_data, columns=preprocessor.get_feature_names_out())


In [7]:
# Preview the first rows of the transformed feature matrix.
print(transformed_df.head())
#print(len(transformed_df.columns))

   month_encoding__month_sin  month_encoding__month_cos  onehot__job_admin.  \
0                        0.5                  -0.866025                 0.0   
1                        0.5                  -0.866025                 0.0   
2                        0.5                  -0.866025                 0.0   
3                        0.5                  -0.866025                 1.0   
4                        0.5                  -0.866025                 0.0   

   onehot__job_blue-collar  onehot__job_entrepreneur  onehot__job_housemaid  \
0                      0.0                       0.0                    1.0   
1                      0.0                       0.0                    0.0   
2                      0.0                       0.0                    0.0   
3                      0.0                       0.0                    0.0   
4                      0.0                       0.0                    0.0   

   onehot__job_management  onehot__job_retired  on

In [8]:

# The ColumnTransformer prefixes output columns with the transformer name,
# so the scaled numerical columns are called 'scaling__<original name>'.
numerical_cols_prefixed = [f'scaling__{col}' for col in numerical_cols]

# Pairwise correlation between the scaled numerical features, shown as a heatmap.
numerical_corr = transformed_df[numerical_cols_prefixed].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(numerical_corr, annot=True, cmap="coolwarm")
plt.title("Correlation Matrix - Numerical Features")
plt.show()


# Dropping the highly correlated features was tried, but it lowered the F1 score, so it is left disabled.
#transformed_df = transformed_df.drop(columns=['scaling__euribor3m', 'scaling__emp.var.rate'])

  plt.show()


There are definitely some redundant features: look at the red (strongly correlated) cells off the diagonal, and possibly (almost definitely) remove some of them.

In [9]:
# Correlation of each scaled numerical feature with the target, highest first.
# The target's correlation with itself is dropped from the result.
target_corr = transformed_df[numerical_cols_prefixed + ['remainder__y']].corr()['remainder__y'].drop('remainder__y')
print(target_corr.sort_values(ascending=False))


scaling__previous          0.230181
scaling__cons.conf.idx     0.054878
scaling__age               0.030399
scaling__campaign         -0.066357
scaling__cons.price.idx   -0.136211
scaling__emp.var.rate     -0.298334
scaling__euribor3m        -0.307771
scaling__pdays            -0.324914
scaling__nr.employed      -0.354678
Name: remainder__y, dtype: float64


not much to do here, no value is too low

In [10]:
# Collect the one-hot output columns by their 'onehot__' prefix.
onehot_cols_prefixed = [col for col in transformed_df.columns if col.startswith('onehot__')]

# TODO: experiment with thresholds other than 0.01
# Flag one-hot columns whose variance is below 0.01, i.e. nearly constant indicators.
low_variance_cols = [col for col in onehot_cols_prefixed if transformed_df[col].var() < 0.01]
print("Low variance columns:", low_variance_cols)


Low variance columns: ['onehot__default_yes', 'onehot__education_illiterate']


Possibly drop these low-variance columns. After consideration, though, dropping them is (or at least appears to be) a trap, especially for `default_yes`.

In [11]:
from scipy.stats import pointbiserialr

# Point-biserial correlation between each one-hot indicator and the target;
# the p-value returned alongside the coefficient is discarded.
for col in onehot_cols_prefixed:
    corr, _ = pointbiserialr(transformed_df[col], transformed_df['remainder__y'])
    print(f"Correlation between {col} and y: {corr}")

# TODO: print this more readably, and check for very low correlations to possibly remove some columns

Correlation between onehot__job_admin. and y: 0.031076522385791658
Correlation between onehot__job_blue-collar and y: -0.07442328716829617
Correlation between onehot__job_entrepreneur and y: -0.016643882021650742
Correlation between onehot__job_housemaid and y: -0.006504932342859749
Correlation between onehot__job_management and y: -0.00041886207376955913
Correlation between onehot__job_retired and y: 0.09222084296125614
Correlation between onehot__job_self-employed and y: -0.004662544896685274
Correlation between onehot__job_services and y: -0.0323008675034146
Correlation between onehot__job_student and y: 0.09395498918093821
Correlation between onehot__job_technician and y: -0.006148639356560813
Correlation between onehot__job_unemployed and y: 0.014751895572852657
Correlation between onehot__marital_divorced and y: -0.010608045255555046
Correlation between onehot__marital_married and y: -0.042965522416055044
Correlation between onehot__marital_single and y: 0.054133498708308685
Corr

Possible additions: recursive feature elimination and, if all else fails, a Hail Mary with principal component analysis (this IS a last resort).

In [12]:
# Separate the features from the passthrough target column.
X = transformed_df.drop(columns=['remainder__y'])
y = transformed_df['remainder__y']

# 80/20 split, stratified on the target so both parts keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [13]:

# Logistic Regression (default settings apart from the seed)
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)

# Random Forest
rf = RandomForestClassifier(random_state=42, max_depth=20, min_samples_split=10, n_estimators=300, n_jobs=-1, class_weight='balanced') 
# n_jobs=-1 trains the trees in parallel; class_weight='balanced' compensates for the imbalanced classes, which noticeably helps the F1 score
rf.fit(X_train, y_train)

# Decision Tree (default settings apart from the seed)
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)


In [14]:

def evaluate_model(model, X_test, y_test):
    """Print accuracy, precision, recall and F1 of ``model`` on the test data."""
    y_pred = model.predict(X_test)
    scorers = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for label, scorer in scorers:
        print(label, scorer(y_test, y_pred))

# Report the same metrics for every fitted model, in a fixed order.
for heading, fitted_model in (
    ("Decision Tree:", dt),
    ("\nLogistic Regression:", log_reg),
    ("\nRandom Forest:", rf),
):
    print(heading)
    evaluate_model(fitted_model, X_test, y_test)


Decision Tree:
Accuracy: 0.8464433114833697
Precision: 0.32610939112487103
Recall: 0.34051724137931033
F1 Score: 0.3331576172904586

Logistic Regression:
Accuracy: 0.8994901675163874
Precision: 0.676056338028169
Recall: 0.20689655172413793
F1 Score: 0.31683168316831684

Random Forest:
Accuracy: 0.8866229667394999
Precision: 0.4971209213051823
Recall: 0.5581896551724138
F1 Score: 0.5258883248730964


Decision Tree:
Accuracy: 0.8464433114833697
Precision: 0.32610939112487103
Recall: 0.34051724137931033
F1 Score: 0.3331576172904586

Logistic Regression:
Accuracy: 0.8994901675163874
Precision: 0.676056338028169
Recall: 0.20689655172413793
F1 Score: 0.31683168316831684

Random Forest:
Accuracy: 0.8866229667394999
Precision: 0.4971209213051823
Recall: 0.5581896551724138
F1 Score: 0.5258883248730964

The random forest has had some hyperparameter tuning; no features have been removed other than `duration`.

In [15]:
# Class balance of the training target, as proportions.
print(y_train.value_counts(normalize=True))


remainder__y
0.0    0.887344
1.0    0.112656
Name: proportion, dtype: float64


In [16]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# ROC curve and AUC of the random forest on the test split.
y_prob = rf.predict_proba(X_test)[:, 1]  # Probabilities for the positive class
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc_score = roc_auc_score(y_test, y_prob)

plt.figure()
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc_score:.2f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()


  plt.show()


In [17]:
from sklearn.model_selection import GridSearchCV
# The grid search below is disabled by wrapping it in a string literal, so
# nothing is executed; the cell merely echoes the string as its output.
'''
param_grid = {'n_estimators': [100, 200, 300],
              'max_depth': [None, 10, 20],
              'min_samples_split': [2, 5, 10]}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced'),
                           param_grid, cv=5, scoring='f1') 
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
'''

# If re-enabled, this might take a while: 27 parameter combinations x 5 folds.

'\nparam_grid = {\'n_estimators\': [100, 200, 300],\n              \'max_depth\': [None, 10, 20],\n              \'min_samples_split\': [2, 5, 10]}\n\ngrid_search = GridSearchCV(RandomForestClassifier(random_state=42, n_jobs=-1, class_weight=\'balanced\'),\n                           param_grid, cv=5, scoring=\'f1\') \ngrid_search.fit(X_train, y_train)\nprint("Best parameters:", grid_search.best_params_)\n'

In [18]:
from sklearn.metrics import precision_recall_curve
# Threshold tuning is disabled by wrapping it in a string literal; nothing runs.
# NOTE(review): if this is re-enabled, check two things. `precision` and
# `recall` have one more entry than `thresholds`, and the F1 array contains
# NaN wherever precision + recall == 0, so validate the argmax index before
# indexing `thresholds`. The threshold would also be tuned on the test split.
'''
y_prob = rf.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)

# Find the threshold that maximizes F1
f1_scores = 2 * (precision * recall) / (precision + recall)
optimal_idx = f1_scores.argmax()
optimal_threshold = thresholds[optimal_idx]

print("Optimal Threshold:", optimal_threshold)

# Predict with the new threshold
y_pred_optimal = (y_prob >= optimal_threshold).astype(int)
'''

'\ny_prob = rf.predict_proba(X_test)[:, 1]\nprecision, recall, thresholds = precision_recall_curve(y_test, y_prob)\n\n# Find the threshold that maximizes F1\nf1_scores = 2 * (precision * recall) / (precision + recall)\noptimal_idx = f1_scores.argmax()\noptimal_threshold = thresholds[optimal_idx]\n\nprint("Optimal Threshold:", optimal_threshold)\n\n# Predict with the new threshold\ny_pred_optimal = (y_prob >= optimal_threshold).astype(int)\n'

In [19]:
# Persist the fitted random forest and the fitted ColumnTransformer.
# NOTE(review): `preprocessor` wraps functions defined in this notebook
# (encode_month_sin_cos, month_feature_names_out); presumably they must be
# defined/importable wherever the file is loaded — confirm.
joblib.dump(rf, 'random_forest_model.pkl')
joblib.dump(preprocessor, 'preprocessing_pipeline.pkl')



['preprocessing_pipeline.pkl']

# Frontend

In [21]:
# Load the trained model and the fitted preprocessing ColumnTransformer.
# This rebinds the notebook-level name `preprocessor` to the loaded copy.
model = joblib.load('random_forest_model.pkl')
preprocessor = joblib.load('preprocessing_pipeline.pkl')


# Define the main function
def main():
    """Render the Streamlit form and show the model's prediction.

    The function collects one customer's features and builds a single-row
    DataFrame with the column names the preprocessor was fitted on. It runs
    the loaded ``preprocessor`` and ``model`` and displays "Yes" or "No".

    Streamlit apps have to be started with ``streamlit run``. Executing this
    inside a notebook kernel only produces a warning and no UI.
    """
    st.title("Bank Marketing Prediction")
    st.write("Enter the customer's data to predict if they will subscribe to a term deposit.")

    # Collect user input for each feature
    # NOTE(review): 'unknown' in job/marital/housing/loan was mode-imputed
    # before the encoder was fitted, so the encoder has no such category and
    # handle_unknown='ignore' encodes it as all zeros.
    age = st.number_input("Age", min_value=18, max_value=100, value=30)
    job = st.selectbox("Job", ["admin.", "blue-collar", "entrepreneur", "housemaid", "management",
                               "retired", "self-employed", "services", "student", "technician",
                               "unemployed", "unknown"])
    marital = st.selectbox("Marital Status", ["single", "married", "divorced", "unknown"])
    education = st.selectbox("Education", ["basic.4y", "basic.6y", "basic.9y", "high.school",
                                           "illiterate", "professional.course", "university.degree",
                                           "unknown"])
    default = st.selectbox("Credit in Default", ["no", "yes", "unknown"])
    housing = st.selectbox("Housing Loan", ["no", "yes", "unknown"])
    loan = st.selectbox("Personal Loan", ["no", "yes", "unknown"])
    contact = st.selectbox("Contact Communication Type", ["cellular", "telephone"])
    # The training data contains no January or February, so only the months
    # seen during training are offered.
    month = st.selectbox("Last Contact Month", ["mar", "apr", "may", "jun", "jul", "aug",
                                                "sep", "oct", "nov", "dec"])
    day_of_week = st.selectbox("Last Contact Day", ["mon", "tue", "wed", "thu", "fri"])
    poutcome = st.selectbox("Outcome of Previous Campaign", ["failure", "nonexistent", "success"])
    campaign = st.number_input("Number of Contacts During Campaign", min_value=1, value=1)
    # 999 is the dataset's placeholder for "never contacted"; negative day
    # counts are meaningless, so the lower bound is 0.
    pdays = st.number_input("Days Since Previous Campaign Contact", min_value=0, value=999,
                            help="Use 999 if the customer was never contacted before.")
    previous = st.number_input("Number of Previous Contacts", min_value=0, value=0)
    emp_var_rate = st.number_input("Employment Variation Rate", value=0.0)
    cons_price_idx = st.number_input("Consumer Price Index", value=93.0)
    cons_conf_idx = st.number_input("Consumer Confidence Index", value=-40.0)
    euribor3m = st.number_input("Euribor 3-Month Rate", value=1.0)
    nr_employed = st.number_input("Number of Employees", value=5000.0)

    # When the user clicks "Predict"
    if st.button("Predict"):
        # One row holding every column the preprocessor's transformers select:
        # 'month' for the cyclical encoding, all one-hot columns (including
        # 'poutcome') and all numerical columns.
        input_df = pd.DataFrame([{
            'age': age, 'job': job, 'marital': marital, 'education': education, 'default': default,
            'housing': housing, 'loan': loan, 'contact': contact, 'month': month,
            'day_of_week': day_of_week, 'campaign': campaign, 'pdays': pdays, 'previous': previous,
            'poutcome': poutcome, 'emp.var.rate': emp_var_rate, 'cons.price.idx': cons_price_idx,
            'cons.conf.idx': cons_conf_idx, 'euribor3m': euribor3m, 'nr.employed': nr_employed
        }])

        # The preprocessor was fitted on a frame that still contained the
        # target 'y' (passed through as 'remainder__y'), so it expects that
        # column on transform. Supply a placeholder and line the columns up
        # with the fitted order.
        fitted_columns = list(getattr(preprocessor, 'feature_names_in_', []))
        if 'y' in fitted_columns:
            input_df['y'] = 0
        if fitted_columns:
            input_df = input_df[fitted_columns]

        transformed = preprocessor.transform(input_df)
        features = pd.DataFrame(transformed, columns=preprocessor.get_feature_names_out())
        # The model was trained without the passthrough target column.
        features = features.drop(columns=['remainder__y'], errors='ignore')

        # Get prediction (an array with a single element)
        prediction = model.predict(features)

        # Show result
        result = "Yes" if prediction[0] == 1 else "No"
        st.success(f"The model predicts: {result}")

if __name__ == "__main__":
    main()

2024-12-20 18:21:14.159 
  command:

    streamlit run C:\Users\yukse\AppData\Roaming\Python\Python313\site-packages\ipykernel_launcher.py [ARGUMENTS]
2024-12-20 18:21:14.166 Session state does not function when running a script without `streamlit run`
