In [1]:
# Data Processing
import numpy as np

train_data = np.genfromtxt('trainData.csv', delimiter=',', dtype=str)
m = train_data.shape[0] - 1
d = train_data.shape[1]
print(m, d)
train_input, train_output = train_data[1:,0:d-1].tolist(), train_data[1:,d-1].tolist()

24712 20


The above step uses `numpy`'s `genfromtxt` function since there are no missing values. Below I randomly print to get the actual types of data like textual information and specific fields

In [2]:
print([train_input[0], train_output[0]])

[['28', '"admin."', '"single"', '"university.degree"', '"no"', '"yes"', '"no"', '"cellular"', '"aug"', '"mon"', '1', '999', '0', '"nonexistent"', '-2.9', '92.201', '-31.4', '0.861', '5076.2'], '1']


#### Preprocessing step 1: 
Remove quotes in textual information

#### Preprocessing step 2:
Convert numeric information to numbers as opposed to within single quotes. These fields are `0, 10, 11, 12, 14, ..., 19`.

In [3]:
for i in range(0, m-1):
    for j in range(0, d-1):
        if train_input[i][j].find("\"") != -1:
            train_input[i][j] = train_input[i][j][1:-1] # removing double quotes in pairs
        else:
            train_input[i][j] = float(train_input[i][j])
    train_output[i] = float(train_output[i])
train_output = np.array(train_output).astype(float)

print(train_input[0])
print(train_output[0])

[28.0, 'admin.', 'single', 'university.degree', 'no', 'yes', 'no', 'cellular', 'aug', 'mon', 1.0, 999.0, 0.0, 'nonexistent', -2.9, 92.201, -31.4, 0.861, 5076.2]
1.0


Now that the preprocessing has been completed, it is time to separate the input and outputs and begin training. Clearly, we might have to go with Ensemble methods since the data has both textual and numeric information in it.

In [4]:
rand_index = np.random.randint(low=0, high=m)
print(train_input[rand_index])
print(train_output[rand_index])

[35.0, 'blue-collar', 'married', 'basic.9y', 'no', 'no', 'no', 'cellular', 'may', 'fri', 1.0, 999.0, 0.0, 'nonexistent', -1.8, 92.893, -46.2, 1.313, 5099.1]
0.0


Now that I have the data through a simple level of preprocessing, I am going to convert text data into numeric values. For this I am first going to tokenize and then merge this into the actual training input.

#### Preprocessing step 3:
Convert the text data into numbers using `LabelEncoder`.

#### Preprocessing step 4:
Normalize the data.

In [5]:
train_text_data = np.hstack((np.array(train_input)[:,1:10], np.array(train_input)[:,13:14]))
print('Size of training text data : {0}'.format(train_text_data.shape))

# Converting the textual data into one vector per training input
train_text_vectors = []

from sklearn.preprocessing import LabelEncoder, normalize

for j in range(0, train_text_data.shape[1]):
    lbl_enc = LabelEncoder()
    train_text_vectors.append(lbl_enc.fit_transform(train_text_data[:,j]))
train_text_vectors = np.array(train_text_vectors).T

print('Size of training text data: {0}'.format(train_text_vectors.shape))
print('Text: {0} --> Data: {1}'.format(train_text_data[0], train_text_vectors[0]))
print('Text: {0} --> Data: {1}'.format(train_text_data[10], train_text_vectors[10]))

Size of training text data : (24712, 10)
Size of training text data: (24712, 10)
Text: ['admin.' 'single' 'university.degree' 'no' 'yes' 'no' 'cellular' 'aug'
 'mon' 'nonexistent'] --> Data: [1 3 7 1 3 1 1 2 2 2]
Text: ['blue-collar' 'single' 'basic.9y' 'no' 'yes' 'no' 'cellular' 'jul' 'tue'
 'nonexistent'] --> Data: [2 3 3 1 3 1 1 4 4 2]


In [6]:
train_input = np.hstack((np.array(train_input)[:,0:1], 
                         train_text_vectors[:,0:train_text_vectors.shape[1] - 1], 
                         np.array(train_input)[:,10:13],
                         train_text_vectors[:,train_text_vectors.shape[1] - 1].reshape(-1, 1),
                         np.array(train_input)[:,14:]
                        )).astype(float)
train_input = normalize(train_input)

print(train_input.shape)
print('Vector = {0}: \nNorm = {1}'.format(train_input[0], np.linalg.norm(train_input[0])))

(24712, 19)
Vector = [  5.41107795e-03   1.93252784e-04   5.79758352e-04   1.35276949e-03
   1.93252784e-04   5.79758352e-04   1.93252784e-04   1.93252784e-04
   3.86505568e-04   3.86505568e-04   1.93252784e-04   1.93059531e-01
   0.00000000e+00   3.86505568e-04  -5.60433073e-04   1.78180999e-02
  -6.06813741e-03   1.66390647e-04   9.80989782e-01]: 
Norm = 0.9999999999999999


Now the preprocessing steps are completed. I will try my first attempt of classification using a support vector machine with Gaussian Kernel.

### Attempt 1: SVM with Gaussian Kernel

In [7]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

g_svm = SVC(kernel='rbf', cache_size=2000)
g_svm.fit(train_input, train_output)

print("Accuracy on Gaussian Kernelized SVM: {0}%".format(100*g_svm.score(train_input, train_output)))
train_predict = g_svm.predict(train_input)
print(set(train_predict))
print(set(train_output))
print(confusion_matrix(train_output, train_predict))

Accuracy on Gaussian Kernelized SVM: 88.73421819359017%
{0.0}
{0.0, 1.0}
[[21928     0]
 [ 2784     0]]


Now it is pretty obvious from the confusion matrix that this is a class imbalance dataset. 