# Declare all required dependencies and import data that will required to build model upon.

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

campaign = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank.csv')
campaign.describe

<bound method NDFrame.describe of        age          job  marital            education  default housing loan  \
0       56    housemaid  married             basic.4y       no      no   no   
1       57     services  married          high.school  unknown      no   no   
2       37     services  married          high.school       no     yes   no   
3       40       admin.  married             basic.6y       no      no   no   
4       56     services  married          high.school       no      no  yes   
...    ...          ...      ...                  ...      ...     ...  ...   
37064   73      retired  married  professional.course       no     yes   no   
37065   46  blue-collar  married  professional.course       no      no   no   
37066   56      retired  married    university.degree       no     yes   no   
37067   44   technician  married  professional.course       no      no   no   
37068   74      retired  married  professional.course       no     yes   no   

         contact 

# Prepare data

In [38]:
training, testing = train_test_split(campaign, test_size=0.2, random_state=42)

X_train = training.drop('y', axis=1)
y_train = training['y'].map({'yes': 1, 'no': 0})

X_test = testing.drop('y', axis=1)
y_test = testing['y'].map({'yes': 1, 'no': 0})

numerics = ['age','cons.conf.idx']
time = ['month']

ctMinMax = ColumnTransformer(
    transformers=[
        ('confidentMax', MinMaxScaler(), numerics),
        ('time', OneHotEncoder(),time)
        ])

In [39]:
# Train the Decision Tree model
dtc = DecisionTreeClassifier(max_depth=5, random_state=42)

# Create a pipeline for the model
Model = Pipeline(steps=[
    ('preprocessor', ctMinMax),
    ('classifier', dtc)
])

# Fit the model on the training data
Model.fit(X_train, y_train)

In [40]:
y_train_pred = Model.predict(X_train)
y_train_pred

array([0, 0, 0, ..., 0, 0, 0])

In [41]:
# Evaluate the model on the training data
print("Training accuracy:", accuracy_score(y_train, y_train_pred))
print("Training classification report:\n", classification_report(y_train, y_train_pred))
print("Training confusion matrix:\n", confusion_matrix(y_train, y_train_pred))

# Make predictions on the test data
y_test_pred = Model.predict(X_test)

# Evaluate the model on the test data
print("\nTest accuracy:", accuracy_score(y_test, y_test_pred))
print("Test classification report:\n", classification_report(y_test, y_test_pred))
print("Test confusion matrix:\n", confusion_matrix(y_test, y_test_pred))
print("First 10 test predictions:", y_test_pred[:10])

Training accuracy: 0.8922610015174507
Training classification report:
               precision    recall  f1-score   support

           0       0.90      0.99      0.94     26333
           1       0.61      0.10      0.18      3322

    accuracy                           0.89     29655
   macro avg       0.76      0.55      0.56     29655
weighted avg       0.87      0.89      0.86     29655

Training confusion matrix:
 [[26115   218]
 [ 2977   345]]

Test accuracy: 0.8864310763420555
Test classification report:
               precision    recall  f1-score   support

           0       0.89      0.99      0.94      6528
           1       0.63      0.12      0.20       886

    accuracy                           0.89      7414
   macro avg       0.76      0.56      0.57      7414
weighted avg       0.86      0.89      0.85      7414

Test confusion matrix:
 [[6465   63]
 [ 779  107]]
First 10 test predictions: [0 0 0 0 0 0 0 0 0 0]


In [10]:
from google.colab import files
# Load the holdout dataset
holdout_data = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank_holdout_test.csv')

# Make predictions using the trained model
holdout_predictions = Model.predict(holdout_data)

# Create a DataFrame with the predictions
predictions_df = pd.DataFrame(holdout_predictions, columns=['predictions'])

# Save the predictions to a CSV file
predictions_df.to_csv('predictions.csv', index=False)

# Download the CSV file
files.download('predictions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>