Import the libraries needed to preprocess the new data and evaluate the saved model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectPercentile
import pickle

Define the function that preprocesses the input data before it is fed to the model.


The function returns the test set split into its inputs (features) and its labels.

In [None]:
def _encode_target(labels):
    """Return the labels as an integer Series, mapping 'yes' -> 1 and 'no' -> 0.

    Values that are already numeric are kept as they are. Any other value
    (including missing values) makes the final integer cast raise ValueError.
    """
    mapping = {'yes': 1, 'no': 0}
    return labels.map(lambda value: mapping.get(value, value)).astype(int)


def preprocessing(original_data, new_data):
    """Prepare unseen data for the saved model, using the training data as reference.

    The unseen data is assumed to have the same layout as the training data,
    i.e. it contains the columns 'age' ... 'poutcome' (including 'default',
    'duration' and 'balance') and the target column 'y'.

    Steps:
      1. Read both CSV files.
      2. Encode the target 'y' ('yes'/'no') as 1/0 in both sets.
      3. Keep the columns 'age'..'y' (training) / 'age'..'poutcome' (unseen),
         which drops anything before 'age' such as an 'Unnamed: 0' column.
      4. Drop the 'default' and 'duration' features.
      5. Drop training rows whose 'balance' is 28000 or more.
      6. One-hot encode the categorical features; the unseen data is aligned
         to the training columns so both have the same features in the same order.
      7. Fit SelectPercentile(percentile=30) and StandardScaler on the training
         data only, then apply both to the unseen data.

    Parameters
    ----------
    original_data : str or path-like
        CSV file that was used to build the model.
    new_data : str or path-like
        CSV file with the unseen data to evaluate.

    Returns
    -------
    X_test_scaled : numpy.ndarray
        Selected and standardized features of the unseen data, in file order.
    y_test : numpy.ndarray
        Integer labels (1 = 'yes', 0 = 'no') of the unseen data, in the same order.
    """
    train_raw = pd.read_csv(original_data)
    test_raw = pd.read_csv(new_data)

    # The labels of the unseen data come from its OWN 'y' column; building the
    # mask from the training frame would relabel the wrong rows.
    y_test = _encode_target(test_raw['y']).values

    # Keep only the modelling columns (gets rid of e.g. 'Unnamed: 0').
    train = train_raw.loc[:, 'age':'y'].copy()
    train['y'] = _encode_target(train['y'])
    test = test_raw.loc[:, 'age':'poutcome']

    # Remove the default and duration features from both sets.
    unused_features = ['default', 'duration']
    train = train.drop(columns=unused_features)
    test = test.drop(columns=unused_features)

    # Drop the training rows with a balance of 28000 or more.
    train = train[train['balance'] < 28000]

    # Split the training data into target and features, then one-hot encode.
    y_train = train['y'].values
    train_dummies = pd.get_dummies(train.drop(columns='y'))

    # Align the unseen data to the training columns: categories missing from the
    # new file become all-zero columns, categories never seen in training are dropped.
    test_dummies = pd.get_dummies(test).reindex(
        columns=train_dummies.columns, fill_value=0
    )

    # Plain float arrays, independent of the dummy dtype pandas produces.
    X_train = train_dummies.values.astype(float)
    X_test = test_dummies.values.astype(float)

    # Feature selection is fitted on the training data only.
    select = SelectPercentile(percentile=30)
    select.fit(X_train, y_train)
    X_train_selected = select.transform(X_train)
    X_test_selected = select.transform(X_test)

    # The scaler is fitted on the training data and applied to the unseen data.
    scaler = StandardScaler()
    scaler.fit(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    return X_test_scaled, y_test

Feeding the model with our new dataset and getting results

In [None]:
# Load the trained model.
# If the saved model is not in the same folder as this notebook, replace the
# file name below with its exact location.
# NOTE: unpickling can execute arbitrary code - only load model files you trust.
with open('best_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Ask for the CSV file holding the unseen data, e.g. data_20319681.csv or its
# exact location. Surrounding whitespace and quotation marks are removed so a
# quoted path still works.
print('Input your file name ')
filename = input().strip().strip('\'"')

# The data used to train the model ('data_20319681.csv') is expected in the same
# folder as this notebook. If it is elsewhere, replace it with the exact location.
input_data, target_class = preprocessing('data_20319681.csv', filename)

# Predict with the loaded model, then print a confusion matrix and a
# classification report to better understand the results.
# pred holds the predicted classes.
pred = loaded_model.predict(input_data)
print('Confusion matrix')
print(confusion_matrix(target_class, pred))
print('Classification report')
print(classification_report(target_class, pred))