<a href="https://colab.research.google.com/github/wmezadev/CSE-450-TEAM-4/blob/austinsChanges/High_Consumer_Confidence_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Trained Decision Tree Classifier for Bank Term Deposit Subscription Prediction Using Low and High Consumer Confidence

## Import Libraries and Read Data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import math

# Load the campaign training data straight from the course-hosted CSV.
campaign = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank.csv'
)

## Prepare and Preprocess the Data


In [None]:
# We need to separate the low from the high consumer confidence values for
# these two models. This cell extracts the HIGH-confidence half.

# Order the rows from the highest to the lowest consumer confidence index.
campaignSorted = campaign.sort_values('cons.conf.idx', ascending=False)

# Row position of the midpoint (integer division rounds down for odd lengths).
campaignSortedMiddle = len(campaignSorted) // 2

# The first half of the sorted frame holds the highest-confidence records.
# NOTE(review): the cut is positional, so rows that share the 'cons.conf.idx'
# value found at the midpoint may end up on either side of the split.
HighestConfidenceData = campaignSorted[:campaignSortedMiddle]

# Preview the subset (last expression, so the notebook renders it).
HighestConfidenceData.head()




Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
34618,29,admin.,single,university.degree,no,yes,no,cellular,oct,mon,2,999,0,nonexistent,-3.4,92.431,-26.9,0.731,5017.5,no
34547,36,management,married,basic.6y,no,yes,no,cellular,oct,tue,1,999,1,failure,-3.4,92.431,-26.9,0.737,5017.5,yes
34541,21,student,single,high.school,no,no,no,cellular,oct,mon,1,999,0,nonexistent,-3.4,92.431,-26.9,0.739,5017.5,yes
34542,51,entrepreneur,married,basic.4y,no,no,no,cellular,oct,tue,1,999,1,failure,-3.4,92.431,-26.9,0.737,5017.5,no
34543,59,housemaid,married,basic.4y,no,no,no,telephone,oct,tue,1,999,0,nonexistent,-3.4,92.431,-26.9,0.737,5017.5,no


In [None]:
# Split data 80/20 for train and test (fixed seed so the split is repeatable)
train_data_high, test_data_high = train_test_split(
    HighestConfidenceData, test_size=0.2, random_state=42
)

# Prepare the training data: every column except 'y' is a feature, and the
# 'yes'/'no' target is mapped to 1/0.
high_X_train = train_data_high.drop('y', axis=1)
high_y_train = train_data_high['y'].map({'yes': 1, 'no': 0})

# Prepare the test data the same way
high_X_test = test_data_high.drop('y', axis=1)
high_y_test = test_data_high['y'].map({'yes': 1, 'no': 0})

# Preprocessing: Define column transformer
numeric_features = ['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
preprocessor = ColumnTransformer(
    transformers=[
        # Standardize the numeric columns.
        ('num', StandardScaler(), numeric_features),
        # One-hot encode the categorical columns. `drop='first'` is deliberately
        # not used: combined with handle_unknown='ignore', a category unseen
        # during fit is encoded as all zeros, which is exactly the encoding of
        # the dropped first category, so the two would be indistinguishable.
        # Keeping every level gives unseen categories their own all-zero
        # pattern. The model is fit on only half of the data, so unseen levels
        # at prediction time are plausible (TODO confirm against the holdout).
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Preview the training split (last expression, so the notebook renders it).
train_data_high.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
9840,41,services,married,basic.9y,no,yes,no,telephone,jun,wed,1,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1,no
5330,38,technician,married,professional.course,no,yes,no,telephone,may,tue,3,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
35173,62,retired,married,university.degree,no,yes,no,cellular,dec,wed,1,6,1,success,-3.0,92.713,-33.0,0.706,5023.5,yes
7533,32,blue-collar,married,basic.6y,no,unknown,unknown,telephone,jun,tue,5,999,0,nonexistent,1.4,94.465,-41.8,4.864,5228.1,no
36389,73,retired,divorced,basic.4y,unknown,yes,no,telephone,aug,mon,1,999,0,nonexistent,-1.7,94.027,-38.3,0.904,4991.6,no


## Create and train the model

In [None]:
# Decision tree estimator: depth capped at 5, seed fixed for repeatable fits.
high_model = DecisionTreeClassifier(random_state=42, max_depth=5)

# Bundle preprocessing and the classifier so that both are fit and applied
# together through a single object.
model_high = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', high_model),
])

# Fit the whole pipeline on the training split. `fit` is the last expression,
# so the notebook also displays the fitted pipeline.
model_high.fit(high_X_train, high_y_train)

## Evaluate the Model's Performance

In [None]:
# Score both splits first; everything below is reporting only.
high_y_train_pred = model_high.predict(high_X_train)
high_y_test_pred = model_high.predict(high_X_test)

# Training-split metrics
print("Training accuracy:", accuracy_score(y_true=high_y_train, y_pred=high_y_train_pred))
print(
    "Training classification report:\n",
    classification_report(y_true=high_y_train, y_pred=high_y_train_pred),
)
print(
    "Training confusion matrix:\n",
    confusion_matrix(y_true=high_y_train, y_pred=high_y_train_pred),
)

# Test-split metrics, on rows the model did not see while fitting
print("\nTest accuracy:", accuracy_score(y_true=high_y_test, y_pred=high_y_test_pred))
print(
    "Test classification report:\n",
    classification_report(y_true=high_y_test, y_pred=high_y_test_pred),
)
print(
    "Test confusion matrix:\n",
    confusion_matrix(y_true=high_y_test, y_pred=high_y_test_pred),
)

# Quick sanity check of the raw predicted labels
print("First 10 test predictions:", high_y_test_pred[:10])

Training accuracy: 0.8947865380724355
Training classification report:
               precision    recall  f1-score   support

           0       0.91      0.97      0.94     12816
           1       0.69      0.40      0.51      2011

    accuracy                           0.89     14827
   macro avg       0.80      0.69      0.73     14827
weighted avg       0.88      0.89      0.88     14827

Training confusion matrix:
 [[12457   359]
 [ 1201   810]]

Test accuracy: 0.8950633935797141
Test classification report:
               precision    recall  f1-score   support

           0       0.92      0.97      0.94      3243
           1       0.63      0.40      0.49       464

    accuracy                           0.90      3707
   macro avg       0.77      0.68      0.72      3707
weighted avg       0.88      0.90      0.88      3707

Test confusion matrix:
 [[3132  111]
 [ 278  186]]
First 10 test predictions: [0 0 0 0 0 0 0 1 0 0]


## Test the Holdout Data and Export Results

In [None]:
# Load the holdout dataset
holdout_data = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank_holdout_test.csv')

# Make predictions using the trained model.
# NOTE(review): model_high was fit only on the high-consumer-confidence half of
# the training data, yet every holdout row is scored here -- confirm that this
# is intended rather than routing rows to a separate low-confidence model.
holdout_predictions = model_high.predict(holdout_data)

# Create a DataFrame with the predictions
predictions_df = pd.DataFrame(holdout_predictions, columns=['predictions'])

# Save the predictions to a CSV file
predictions_df.to_csv('predictions.csv', index=False)

# Download the CSV file. The Colab helper is imported here, after the file has
# been written, and guarded so that the cell still runs to completion (and the
# CSV is still saved) when the notebook is executed outside Google Colab.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; predictions saved to predictions.csv")
else:
    files.download('predictions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>