In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Load the training and test sets from their semicolon-delimited text files.
def _read_with_header_row(path):
    """Read a ';'-separated file and promote its first row to column labels.

    The file is parsed without a header, so the first row of the file lands in
    the data as row 0. That row is used as the column names and then excluded,
    which leaves the remaining rows with index labels starting at 1.
    """
    raw = pd.read_csv(path, sep=";", header=None)
    header_row = raw.iloc[0]
    return raw.rename(columns=header_row).loc[1:]


train_data = _read_with_header_row('trainingdata.txt')
test_data = _read_with_header_row('testdata.txt')

In [8]:
# Cast the numeric columns of both frames to int.
# Every listed column maps to the same target type, so the mapping is built
# from the column names instead of being spelled out pair by pair.
convert_dict = dict.fromkeys(
    ('age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'),
    int,
)
train_data = train_data.astype(convert_dict)
test_data = test_data.astype(convert_dict)

In [9]:
# training data pre-processing

# Drop the columns that are not used as features.
train_data = train_data.drop(columns=['age', 'day', 'month', 'contact'])

# Encode the yes/no columns as 1/0.
# Each column is assigned back to the frame instead of calling
# `train_data[col].replace(..., inplace=True)`: that chained form mutates a
# temporary Series and is a silent no-op under pandas Copy-on-Write.
# `.astype(int)` pins the dtype and fails loudly if a value other than
# 'no'/'yes' is left in the column.
for col in ['default', 'housing', 'loan', 'y']:
    train_data[col] = train_data[col].replace({'no': 0, 'yes': 1}).astype(int)

# One-hot encode the categorical columns ('education' gets the shorter 'edu' prefix).
train_data = pd.get_dummies(
    train_data,
    columns=['job', 'marital', 'education', 'poutcome'],
    prefix=['job', 'marital', 'edu', 'poutcome'],
)

# Remove outliers for balance, duration and campaign: keep only rows whose
# absolute z-score is below 3. The filters run one after another, so each
# z-score is computed on the rows that survived the previous filter.
# `previous` is not filtered.
train_data = train_data[np.abs(stats.zscore(train_data.balance)) < 3]
train_data = train_data[np.abs(stats.zscore(train_data.duration)) < 3]
train_data = train_data[np.abs(stats.zscore(train_data.campaign)) < 3]

# Oversample with SMOTE. A fixed random_state makes the synthetic samples
# (and therefore everything trained on them) reproducible across runs.
x_train = train_data.drop(columns=['y'])
y_train = train_data['y']
sm = SMOTE(random_state=42)
x_train_oversampled, y_train_oversampled = sm.fit_resample(x_train, y_train)
# Re-attach the target as the last column of the resampled feature frame.
train_data_oversampled = pd.concat([x_train_oversampled, y_train_oversampled], axis=1)

train_data_processed = train_data_oversampled

In [10]:
# test data pre-processing

# Drop the same unused columns that are dropped from the training data.
test_data = test_data.drop(columns=['age', 'day', 'month', 'contact'])

# Encode the yes/no columns as 1/0.
# Each column is assigned back to the frame instead of calling
# `test_data[col].replace(..., inplace=True)`: that chained form mutates a
# temporary Series and is a silent no-op under pandas Copy-on-Write.
# `.astype(int)` pins the dtype and fails loudly if a value other than
# 'no'/'yes' is left in the column.
for col in ['default', 'housing', 'loan', 'y']:
    test_data[col] = test_data[col].replace({'no': 0, 'yes': 1}).astype(int)

# One-hot encode the categorical columns ('education' gets the shorter 'edu' prefix).
# NOTE(review): the dummy columns come from the categories present in the test
# file itself; confirm they match the training columns before predicting.
test_data = pd.get_dummies(
    test_data,
    columns=['job', 'marital', 'education', 'poutcome'],
    prefix=['job', 'marital', 'edu', 'poutcome'],
)

# No outlier removal or resampling is applied to the test set.
test_data_processed = test_data

In [12]:
# Display the processed training frame: the SMOTE-oversampled features with the target 'y' as the last column.
train_data_processed

Unnamed: 0,default,balance,housing,loan,duration,campaign,pdays,previous,job_admin.,job_blue-collar,...,marital_single,edu_primary,edu_secondary,edu_tertiary,edu_unknown,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,y
0,0,106,1,0,388,2,-1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,0,407,1,0,67,12,-1,0,1,0,...,0,0,1,0,0,0,0,0,1,0
2,0,952,1,0,62,1,-1,0,1,0,...,0,0,1,0,0,0,0,0,1,0
3,0,364,0,0,306,2,-1,0,0,0,...,1,0,0,1,0,0,0,0,1,1
4,0,-703,1,0,123,3,-1,0,0,0,...,1,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61035,0,589,0,0,501,1,-1,0,0,0,...,1,0,0,0,0,0,0,0,1,1
61036,0,43,0,0,230,1,-1,0,0,0,...,0,0,0,0,0,0,0,0,1,1
61037,0,1975,0,0,290,1,-1,0,0,0,...,0,0,1,0,0,0,0,0,1,1
61038,0,3,1,0,938,3,-1,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [13]:
# Display the processed test frame: binary and one-hot encoded, with no outlier filtering or resampling.
test_data_processed

Unnamed: 0,default,balance,housing,loan,duration,campaign,pdays,previous,y,job_admin.,...,marital_married,marital_single,edu_primary,edu_secondary,edu_tertiary,edu_unknown,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
1,0,468,1,0,220,1,-1,0,0,0,...,1,0,0,0,1,0,0,0,0,1
2,0,10215,0,0,139,2,-1,0,0,0,...,0,1,0,0,1,0,0,0,0,1
3,0,900,1,0,213,1,-1,0,0,0,...,1,0,1,0,0,0,0,0,0,1
4,0,1231,0,0,21,3,-1,0,0,0,...,1,0,1,0,0,0,0,0,0,1
5,0,5301,0,0,937,2,-1,0,1,0,...,0,1,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9039,0,1,1,1,162,3,-1,0,0,0,...,0,1,0,1,0,0,0,0,0,1
9040,0,175,1,0,50,2,364,2,0,0,...,1,0,0,1,0,0,1,0,0,0
9041,0,1873,1,0,192,1,-1,0,0,0,...,1,0,1,0,0,0,0,0,0,1
9042,0,61,1,0,182,8,-1,0,0,0,...,0,1,0,1,0,0,0,0,0,1
