In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
# Pretty Print
import pprint

In [2]:
# Data Preprocessing:

# Read the data
# The data is separated by semi colons.
# We need to specify the dtype=None and encoding, otherwise Numpy is going to 
# complain.
dataset = np.genfromtxt('Telemarketing Dataset/bank-additional-full.csv',
                       delimiter=";", dtype= None,
                       encoding = "UTF-8")

# The headers are in the first column
column_headers = dataset[0]

# We then remove the headers from the dataset. Which are in 
# the first row of the array
dataset = np.delete(dataset, [0], axis = 0)

# Displays the Data Set for Pandas.
pandas_dataset_display = dataset

# There are binary lables. Found in the last column
dataset_labels = dataset[:, -1]
# One liner to convert "yes" to 1 and "no" to zero
# Can't do if/ else => 1 if dataset_labels == "yes" else 0 
dataset_labels = (dataset_labels == "yes") * 1

# We remove the last column:
# We specify the last axis, -1 and apply numpy.s_ to "build up index tuples for arrays"
# dataset = np.delete(dataset, np.s_[-1:], axis = 1)
# I like this better:
dataset = np.delete(dataset, -1, axis = 1)

# Verifies that the last column has been removed
print(dataset[:, -1])
print(dataset_labels)

['5191' '5191' '5191' ... '4963.6' '4963.6' '4963.6']
[0 0 0 ... 0 0 0]


In [3]:
# Visualize the data with Pandas
pd.DataFrame(pandas_dataset_display, columns=column_headers)

Unnamed: 0,"""age""","""job""","""marital""","""education""","""default""","""housing""","""loan""","""contact""","""month""","""day_of_week""",...,"""campaign""","""pdays""","""previous""","""poutcome""","""emp.var.rate""","""cons.price.idx""","""cons.conf.idx""","""euribor3m""","""nr.employed""","""y"""
0,56,"""housemaid""","""married""","""basic.4y""","""no""","""no""","""no""","""telephone""","""may""","""mon""",...,1,999,0,"""nonexistent""",1.1,93.994,-36.4,4.857,5191,"""no"""
1,57,"""services""","""married""","""high.school""","""unknown""","""no""","""no""","""telephone""","""may""","""mon""",...,1,999,0,"""nonexistent""",1.1,93.994,-36.4,4.857,5191,"""no"""
2,37,"""services""","""married""","""high.school""","""no""","""yes""","""no""","""telephone""","""may""","""mon""",...,1,999,0,"""nonexistent""",1.1,93.994,-36.4,4.857,5191,"""no"""
3,40,"""admin.""","""married""","""basic.6y""","""no""","""no""","""no""","""telephone""","""may""","""mon""",...,1,999,0,"""nonexistent""",1.1,93.994,-36.4,4.857,5191,"""no"""
4,56,"""services""","""married""","""high.school""","""no""","""no""","""yes""","""telephone""","""may""","""mon""",...,1,999,0,"""nonexistent""",1.1,93.994,-36.4,4.857,5191,"""no"""
5,45,"""services""","""married""","""basic.9y""","""unknown""","""no""","""no""","""telephone""","""may""","""mon""",...,1,999,0,"""nonexistent""",1.1,93.994,-36.4,4.857,5191,"""no"""
6,59,"""admin.""","""married""","""professional.course""","""no""","""no""","""no""","""telephone""","""may""","""mon""",...,1,999,0,"""nonexistent""",1.1,93.994,-36.4,4.857,5191,"""no"""
7,41,"""blue-collar""","""married""","""unknown""","""unknown""","""no""","""no""","""telephone""","""may""","""mon""",...,1,999,0,"""nonexistent""",1.1,93.994,-36.4,4.857,5191,"""no"""
8,24,"""technician""","""single""","""professional.course""","""no""","""yes""","""no""","""telephone""","""may""","""mon""",...,1,999,0,"""nonexistent""",1.1,93.994,-36.4,4.857,5191,"""no"""
9,25,"""services""","""single""","""high.school""","""no""","""yes""","""no""","""telephone""","""may""","""mon""",...,1,999,0,"""nonexistent""",1.1,93.994,-36.4,4.857,5191,"""no"""


In [4]:
# Alternative Analysis from Pandas:
# https://www.datacamp.com/community/tutorials/categorical-data
pandas_csv = pd.read_csv('Telemarketing Dataset/bank-additional-full.csv',
                        delimiter=";")
print(pandas_csv.info())
pandas_csv.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
age               41188 non-null int64
job               41188 non-null object
marital           41188 non-null object
education         41188 non-null object
default           41188 non-null object
housing           41188 non-null object
loan              41188 non-null object
contact           41188 non-null object
month             41188 non-null object
day_of_week       41188 non-null object
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null object
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
nr.employed       41188 non-null float64
y                 41188 non-null object
dtypes: float64(5), int64(5), object(11)
memory usa

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [5]:
# pandas_csv.boxplot('pdays', 'emp.var.rate')
pandas_obj = pandas_csv.select_dtypes(include=['object']).copy()
pandas_obj.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome,y
0,housemaid,married,basic.4y,no,no,no,telephone,may,mon,nonexistent,no
1,services,married,high.school,unknown,no,no,telephone,may,mon,nonexistent,no
2,services,married,high.school,no,yes,no,telephone,may,mon,nonexistent,no
3,admin.,married,basic.6y,no,no,no,telephone,may,mon,nonexistent,no
4,services,married,high.school,no,no,yes,telephone,may,mon,nonexistent,no


In [6]:
# Check for null values:

# If it's filled with zeros, then no null values.
print(pandas_obj.isnull().sum())

job            0
marital        0
education      0
default        0
housing        0
loan           0
contact        0
month          0
day_of_week    0
poutcome       0
y              0
dtype: int64


In [7]:
# Preprocessing Jobs
jobs = np.unique(dataset[:,1])
print(jobs)

['"admin."' '"blue-collar"' '"entrepreneur"' '"housemaid"' '"management"'
 '"retired"' '"self-employed"' '"services"' '"student"' '"technician"'
 '"unemployed"' '"unknown"']


In [8]:
# Replacing Values
# Note: job is the name of the column
replace_dict = { 
    'job' : {
        'unknown': 0,
        'admin.': 1,
        'blue-collar': 2,
        'entrepreneur': 3,
        'housemaid': 4,
        'management': 5,
        'services': 6
    }
}

# Use Pandas Replace operation
pandas_obj.replace(replace_dict, inplace = True)
pandas_obj.head()

# This gets all the categories into a label list.
pandas_labels = pandas_obj['education'].astype('category').cat.categories.tolist()
replace_dict['education'] = { 
    # We add + 1 because range is not inclusive.
    # zip creates a tuple. It pairs pandas_labels[0] with range[0].
    # Ex: (basic.4y, 1). Then we do that to assign the values
    k : v for k, v in zip(pandas_labels, range(1, len(pandas_labels) + 1))
}

print(replace_dict['education'])

{'basic.4y': 1, 'basic.6y': 2, 'basic.9y': 3, 'high.school': 4, 'illiterate': 5, 'professional.course': 6, 'university.degree': 7, 'unknown': 8}


In [9]:
# This will make it "more automatic" than the previous approaches
for column in pandas_obj:
    # column prints: job, marital, education, default, housing, loan, contact, month, day_of_week, poutcome, and the result
    column_labels = pandas_obj[column].astype('category').cat.categories.tolist()
    replace_dict[column] = {
        k : v for k, v in zip(column_labels, range(1, len(column_labels) + 1))
    }

# Verify:
#pp = pprint.PrettyPrinter(indent=2)
# pp.pprint(replace_dict)

# Label Encoding is the shorter approach, than the one above
# print(pandas_obj['day_of_week'].astype('category').cat.codes)

{ 'contact': {'cellular': 1, 'telephone': 2},
  'day_of_week': {'fri': 1, 'mon': 2, 'thu': 3, 'tue': 4, 'wed': 5},
  'default': {'no': 1, 'unknown': 2, 'yes': 3},
  'education': { 'basic.4y': 1,
                 'basic.6y': 2,
                 'basic.9y': 3,
                 'high.school': 4,
                 'illiterate': 5,
                 'professional.course': 6,
                 'university.degree': 7,
                 'unknown': 8},
  'housing': {'no': 1, 'unknown': 2, 'yes': 3},
  'job': { 0: 1,
           1: 2,
           2: 3,
           3: 4,
           4: 5,
           5: 6,
           6: 7,
           'retired': 8,
           'self-employed': 9,
           'student': 10,
           'technician': 11,
           'unemployed': 12},
  'loan': {'no': 1, 'unknown': 2, 'yes': 3},
  'marital': {'divorced': 1, 'married': 2, 'single': 3, 'unknown': 4},
  'month': { 'apr': 1,
             'aug': 2,
             'dec': 3,
             'jul': 4,
             'jun': 5,
             'mar

In [10]:
# This is an example, that I think it was going to work wery well for the future.
# If it contains 'US', then assign it 1, otherwise 0. 
# cat_df_flights_specific['US_code'] = np.where(cat_df_flights_specific['carrier'].str.contains('US'), 1, 0)

In [19]:
# See https://stackoverflow.com/questions/32011359/convert-categorical-data-in-pandas-dataframe

# pandas_obj.info()
pandas_columns = pandas_obj.select_dtypes(['object']).columns
# OMG, here we go again, naming variables... hahah
pandas_bear = pandas_obj.copy()

# This is what I'm talking about ;). One liner with Pandas.
pandas_bear[pandas_columns] = pandas_bear[pandas_columns].apply(lambda x: x.astype('category').cat.codes)

pandas_bear.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome,y
0,4,1,0,0,0,0,1,6,1,1,0
1,6,1,3,1,0,0,1,6,1,1,0
2,6,1,3,0,2,0,1,6,1,1,0
3,1,1,1,0,0,0,1,6,1,1,0
4,6,1,3,0,0,2,1,6,1,1,0
