In [1]:
# Data Processing
import numpy as np
from sklearn.preprocessing import LabelEncoder, normalize, StandardScaler, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

train_data = np.genfromtxt('trainData.csv', delimiter=',', dtype=str)
m = train_data.shape[0] - 1
d = train_data.shape[1]
print(m, d)
train_input_raw, train_output = train_data[1:,0:d-1].tolist(), train_data[1:,d-1].tolist()

24712 20


The above step uses `numpy`'s `genfromtxt` function since there are no missing values. Below I randomly print to get the actual types of data like textual information and specific fields

In [2]:
print([train_input_raw[0], train_output[0]])

[['28', '"admin."', '"single"', '"university.degree"', '"no"', '"yes"', '"no"', '"cellular"', '"aug"', '"mon"', '1', '999', '0', '"nonexistent"', '-2.9', '92.201', '-31.4', '0.861', '5076.2'], '1']


#### Preprocessing step 1: 
Remove quotes in textual information

#### Preprocessing step 2:
Convert numeric information to numbers as opposed to within single quotes. These fields are `0, 10, 11, 12, 14, ..., 19`.

In [3]:
for i in range(0, m-1):
    for j in range(0, d-1):
        if train_input_raw[i][j].find("\"") != -1:
            train_input_raw[i][j] = train_input_raw[i][j][1:-1] # removing double quotes in pairs
        else:
            train_input_raw[i][j] = float(train_input_raw[i][j])
    train_output[i] = float(train_output[i])
train_output = np.array(train_output).astype(float)

print(train_input_raw[0])
print(train_output[0])

[28.0, 'admin.', 'single', 'university.degree', 'no', 'yes', 'no', 'cellular', 'aug', 'mon', 1.0, 999.0, 0.0, 'nonexistent', -2.9, 92.201, -31.4, 0.861, 5076.2]
1.0


Now that the preprocessing has been completed, it is time to separate the input and outputs and begin training. Clearly, we might have to go with Ensemble methods since the data has both textual and numeric information in it.

In [4]:
rand_index = np.random.randint(low=0, high=m)
print(train_input_raw[rand_index])
print(train_output[rand_index])

[30.0, 'services', 'single', 'high.school', 'no', 'no', 'no', 'telephone', 'jun', 'mon', 6.0, 999.0, 0.0, 'nonexistent', 1.4, 94.465, -41.8, 4.865, 5228.1]
0.0


Now that I have the data through a simple level of preprocessing, I am going to convert text data into numeric values. For this I am first going to tokenize and then merge this into the actual training input.

#### Preprocessing step 3:
Convert the text data into numbers using `LabelEncoder`.

In [5]:
train_text_data = np.hstack((np.array(train_input_raw)[:,1:10], np.array(train_input_raw)[:,13:1]))
print('Size of training text data : {0}'.format(train_text_data.shape))

# Converting the textual data into one vector per training input
train_text_vectors = []

for j in range(0, train_text_data.shape[1]):
    lbl_enc = LabelEncoder()
    train_text_vectors.append(lbl_enc.fit_transform(train_text_data[:,j]))
train_text_vectors = np.array(train_text_vectors).T

print('Size of training text data: {0}'.format(train_text_vectors.shape))
print('Text: {0} --> Data: {1}'.format(train_text_data[0], train_text_vectors[0]))
print('Text: {0} --> Data: {1}'.format(train_text_data[10], train_text_vectors[10]))

Size of training text data : (24712, 9)
Size of training text data: (24712, 9)
Text: ['admin.' 'single' 'university.degree' 'no' 'yes' 'no' 'cellular' 'aug'
 'mon'] --> Data: [1 3 7 1 3 1 1 2 2]
Text: ['blue-collar' 'single' 'basic.9y' 'no' 'yes' 'no' 'cellular' 'jul' 'tue'] --> Data: [2 3 3 1 3 1 1 4 4]


Now I am going to make five versions of the dataset
+ Unnormalized
+ Normalized
+ Feature Scaled
+ Feature Scaled + Normalized
+ One Hot Encoded text + Normalized

I use the `OneHotEncoder` to encode categorical data after encoding using label as shown [here](http://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features).

In [6]:
# Version 1
train_input_1 = np.hstack((np.array(train_input_raw)[:,0:1], 
                         train_text_vectors[:,0:train_text_vectors.shape[1] - 1], 
                         np.array(train_input_raw)[:,10:13],
                         train_text_vectors[:,train_text_vectors.shape[1] - 1].reshape(-1, 1),
                         np.array(train_input_raw)[:,14:]
                        )).astype(float)
print("Version 1: Shape {0}".format(train_input_1.shape))
print('Vector = {0}: \nNorm = {1}'.format(train_input_1[rand_index], np.linalg.norm(train_input_1[rand_index])))

# Version 2
train_input_2 = normalize(train_input_1)
print("Version 2: Shape {0}".format(train_input_2.shape))
print('Vector = {0}: \nNorm = {1}'.format(train_input_2[rand_index], np.linalg.norm(train_input_2[rand_index])))

# Version 3
mean_reduce = StandardScaler()
train_input_3 = mean_reduce.fit_transform(train_input_1)
print("Version 3: Shape {0}".format(train_input_3.shape))
print('Vector = {0}: \nNorm = {1}'.format(train_input_3[rand_index], np.linalg.norm(train_input_3[rand_index])))

# Version 4
train_input_4 = normalize(train_input_3)
print(train_input_4.shape)
print("Version 4: Shape {0}".format(train_input_4.shape))
print('Vector = {0}: \nNorm = {1}'.format(train_input_4[rand_index], np.linalg.norm(train_input_4[rand_index])))

# Version 5
ohe = OneHotEncoder()
ohe.fit(train_text_vectors)
train_text_ohe_vectors = ohe.transform(train_text_vectors).toarray()
print(train_text_ohe_vectors[0])

train_input_5 = np.hstack((np.array(train_input_raw)[:,0:1],
                         np.array(train_input_raw)[:,10:13],
                         np.array(train_input_raw)[:,14:],
                         train_text_ohe_vectors
                         )).astype(float)
train_input_5 = normalize(train_input_5)
print("Version 5: Shape {0}".format(train_input_5.shape))
print('Vector = {0}: \nNorm = {1}'.format(train_input_5[rand_index], np.linalg.norm(train_input_5[rand_index])))

train_input = (train_input_1, train_input_2, train_input_3, train_input_4, train_input_5)

Version 1: Shape (24712, 18)
Vector = [  3.00000000e+01   8.00000000e+00   3.00000000e+00   4.00000000e+00
   1.00000000e+00   1.00000000e+00   1.00000000e+00   2.00000000e+00
   5.00000000e+00   6.00000000e+00   9.99000000e+02   0.00000000e+00
   2.00000000e+00   1.40000000e+00   9.44650000e+01  -4.18000000e+01
   4.86500000e+00   5.22810000e+03]: 
Norm = 5323.794522185281
Version 2: Shape (24712, 18)
Vector = [  5.63507849e-03   1.50268760e-03   5.63507849e-04   7.51343799e-04
   1.87835950e-04   1.87835950e-04   1.87835950e-04   3.75671899e-04
   9.39179748e-04   1.12701570e-03   1.87648114e-01   0.00000000e+00
   3.75671899e-04   2.62970330e-04   1.77439230e-02  -7.85154270e-03
   9.13821895e-04   9.82025129e-01]: 
Norm = 1.0
Version 3: Shape (24712, 18)
Vector = [-0.95978953  0.90672182  1.36067936 -0.34715749 -0.51164653 -1.0879285
 -0.44949276  1.32012514 -0.09232275  1.23198408  0.19653049 -0.35249416
 -0.70956021  0.83872146  1.53859821 -0.27803682  0.71650221  0.84468804]: 
N

Now the preprocessing steps are completed. I will try my preprocessed dataset on multiple methods of classification.
For this, I will generate 5 folds (stratified) and use 5-fold cross validation to get the best method. Folds should be invariant, hence I will use `train_input_1` and its output to generate the `folds` list.

In [7]:
skf = StratifiedKFold(n_splits=5)
folds = []
for tr_split, va_split in skf.split(train_input_1, train_output):
    folds.append((tr_split, va_split))
for i in range(0, len(folds)):
    print("Fold {0}\tTrain number: {1} Validation number: {2}".format(i+1, len(folds[i][0]), len(folds[i][1])))

Fold 1	Train number: 19769 Validation number: 4943
Fold 2	Train number: 19769 Validation number: 4943
Fold 3	Train number: 19769 Validation number: 4943
Fold 4	Train number: 19770 Validation number: 4942
Fold 5	Train number: 19771 Validation number: 4941


### Attempt 1: SVM with Gaussian Kernel with 5 fold cross validation on all versions

In [8]:
for v in range(0, 5):
    accu_train = prec_train = rcll_train = auc_train = 0.0
    accu_valid = prec_valid = rcll_valid = auc_valid = 0.0
    g_svm = SVC(kernel='rbf', cache_size=2000)
    for i in range(0, len(folds)):
        print("Performing fold {0}".format(i+1))
        g_svm.fit(train_input[v][folds[i][0]], train_output[folds[i][0]])

        train_pred = g_svm.predict(train_input[v][folds[i][0]])
        valid_pred = g_svm.predict(train_input[v][folds[i][1]])

        accu_train += accuracy_score(train_output[folds[i][0]], train_pred)
        prec_train += precision_score(train_output[folds[i][0]], train_pred)
        rcll_train += recall_score(train_output[folds[i][0]], train_pred)
        auc_train += roc_auc_score(train_output[folds[i][0]], train_pred)
    
        accu_valid += accuracy_score(train_output[folds[i][1]], valid_pred)
        prec_valid += precision_score(train_output[folds[i][1]], valid_pred)
        rcll_valid += recall_score(train_output[folds[i][1]], valid_pred)
        auc_valid += roc_auc_score(train_output[folds[i][1]], valid_pred)
    
    print("Average training accuracy after {0} fold cross validation = {1}".format(len(folds), accu_train/len(folds)))
    print("Average training precision after {0} fold cross validation = {1}".format(len(folds), prec_train/len(folds)))
    print("Average training recall after {0} fold cross validation = {1}".format(len(folds), rcll_train/len(folds)))
    print("Average training AUC-ROC after {0} fold cross validation = {1}".format(len(folds), auc_train/len(folds)))

    print("Average validation accuracy after {0} fold cross validation = {1}".format(len(folds), accu_valid/len(folds)))
    print("Average validation precision after {0} fold cross validation = {1}".format(len(folds), prec_valid/len(folds)))
    print("Average validation recall after {0} fold cross validation = {1}".format(len(folds), rcll_valid/len(folds)))
    print("Average validation AUC-ROC after {0} fold cross validation = {1}\n".format(len(folds), auc_valid/len(folds)))

Performing fold 1
Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.9288807240348594
Average training precision after 5 fold cross validation = 0.9586359082858735
Average training recall after 5 fold cross validation = 0.3853275332362172
Average training AUC-ROC after 5 fold cross validation = 0.6916091795254868
Average validation accuracy after 5 fold cross validation = 0.8896084549050587
Average validation precision after 5 fold cross validation = 0.5451782037500705
Average validation recall after 5 fold cross validation = 0.12679436343205508
Average validation AUC-ROC after 5 fold cross validation = 0.5566250847411499

Performing fold 1


  'precision', 'predicted', average, warn_for)


Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.8873421824678063
Average training precision after 5 fold cross validation = 0.0
Average training recall after 5 fold cross validation = 0.0
Average training AUC-ROC after 5 fold cross validation = 0.5
Average validation accuracy after 5 fold cross validation = 0.8873421904486077
Average validation precision after 5 fold cross validation = 0.0
Average validation recall after 5 fold cross validation = 0.0
Average validation AUC-ROC after 5 fold cross validation = 0.5

Performing fold 1
Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.9047325117042313
Average training precision after 5 fold cross validation = 0.7336908063133393
Average training recall after 5 fold cross validation = 0.24245714622000758
Average training AUC-ROC after 5 fold cross validation = 0.615636399583753
A

### Attempt 2: Random Forests with multiple number of trees and 10 fold cross validation for all versions

In [9]:
for v in range(0, 5):
    for m in range(3, 8):
        accu_train = prec_train = rcll_train = auc_train = 0.0
        accu_valid = prec_valid = rcll_valid = auc_valid = 0.0
        rnd_frst = RandomForestClassifier(n_estimators=m)
        for i in range(0, len(folds)):
            print("Performing fold {0}".format(i+1))
            rnd_frst.fit(train_input[v][folds[i][0]], train_output[folds[i][0]])

            train_pred = rnd_frst.predict(train_input[v][folds[i][0]])
            valid_pred = rnd_frst.predict(train_input[v][folds[i][1]])

            accu_train += accuracy_score(train_output[folds[i][0]], train_pred)
            prec_train += precision_score(train_output[folds[i][0]], train_pred)
            rcll_train += recall_score(train_output[folds[i][0]], train_pred)
            auc_train += roc_auc_score(train_output[folds[i][0]], train_pred)
    
            accu_valid += accuracy_score(train_output[folds[i][1]], valid_pred)
            prec_valid += precision_score(train_output[folds[i][1]], valid_pred)
            rcll_valid += recall_score(train_output[folds[i][1]], valid_pred)
            auc_valid += roc_auc_score(train_output[folds[i][1]], valid_pred)
    
        print("Average training accuracy after {0} fold cross validation = {1} with m = {2}".format(len(folds), accu_train/len(folds), m))
        print("Average training precision after {0} fold cross validation = {1} with m = {2}".format(len(folds), prec_train/len(folds), m))
        print("Average training recall after {0} fold cross validation = {1} with m = {2}".format(len(folds), rcll_train/len(folds), m))
        print("Average training AUC-ROC after {0} fold cross validation = {1} with m = {2}".format(len(folds), auc_train/len(folds), m))

        print("Average validation accuracy after {0} fold cross validation = {1} with m = {2}".format(len(folds), accu_valid/len(folds), m))
        print("Average validation precision after {0} fold cross validation = {1} with m = {2}".format(len(folds), prec_valid/len(folds), m))
        print("Average validation recall after {0} fold cross validation = {1} with m = {2}".format(len(folds), rcll_valid/len(folds), m))
        print("Average validation AUC-ROC after {0} fold cross validation = {1} with m = {2}".format(len(folds), auc_valid/len(folds), m))

Performing fold 1
Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.9713195787387086 with m = 3
Average training precision after 5 fold cross validation = 0.9083892845582684 with m = 3
Average training recall after 5 fold cross validation = 0.8290229104373532 with m = 3
Average training AUC-ROC after 5 fold cross validation = 0.9092043060565217 with m = 3
Average validation accuracy after 5 fold cross validation = 0.8742315882104024 with m = 3
Average validation precision after 5 fold cross validation = 0.4204425649950732 with m = 3
Average validation recall after 5 fold cross validation = 0.30712126887359054 with m = 3
Average validation AUC-ROC after 5 fold cross validation = 0.6266773461051545 with m = 3
Performing fold 1
Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.9671111509567872 with m = 4
Average training precis

Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.9655835568135158 with m = 4
Average training precision after 5 fold cross validation = 0.9661390842254025 with m = 4
Average training recall after 5 fold cross validation = 0.7197382136485551 with m = 4
Average training AUC-ROC after 5 fold cross validation = 0.8582672730955695 with m = 4
Average validation accuracy after 5 fold cross validation = 0.8866139369217361 with m = 4
Average validation precision after 5 fold cross validation = 0.4934955921920491 with m = 4
Average validation recall after 5 fold cross validation = 0.21012683569481938 with m = 4
Average validation AUC-ROC after 5 fold cross validation = 0.5913140099407346 with m = 4
Performing fold 1
Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.9798883267440676 with m = 5
Average training precision after 5 fold c

Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.9775109095564597 with m = 5
Average training precision after 5 fold cross validation = 0.953428125920821 with m = 5
Average training recall after 5 fold cross validation = 0.841504338383427 with m = 5
Average training AUC-ROC after 5 fold cross validation = 0.9181413558040991 with m = 5
Average validation accuracy after 5 fold cross validation = 0.8826483487745843 with m = 5
Average validation precision after 5 fold cross validation = 0.46779346482866185 with m = 5
Average validation recall after 5 fold cross validation = 0.29921212042932976 with m = 5
Average validation AUC-ROC after 5 fold cross validation = 0.6279670834975162 with m = 5
Performing fold 1
Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.9742938506518775 with m = 6
Average training precision after 5 fold cr

### Attempt 3: AdaBoost with multiple weak classifiers and 10 fold cross validation

In [10]:
for v in range(0, 5):
    for m in [10, 25, 50, 100, 200, 250, 500, 1000]:
        accu_train = prec_train = rcll_train = auc_train = 0.0
        accu_valid = prec_valid = rcll_valid = auc_valid = 0.0
        adaboost = AdaBoostClassifier(n_estimators=m)
        for i in range(0, len(folds)):
            print("Performing fold {0}".format(i+1))
            adaboost.fit(train_input[v][folds[i][0]], train_output[folds[i][0]])

            train_pred = adaboost.predict(train_input[v][folds[i][0]])
            valid_pred = adaboost.predict(train_input[v][folds[i][1]])

            accu_train += accuracy_score(train_output[folds[i][0]], train_pred)
            prec_train += precision_score(train_output[folds[i][0]], train_pred)
            rcll_train += recall_score(train_output[folds[i][0]], train_pred)
            auc_train += roc_auc_score(train_output[folds[i][0]], train_pred)
    
            accu_valid += accuracy_score(train_output[folds[i][1]], valid_pred)
            prec_valid += precision_score(train_output[folds[i][1]], valid_pred)
            rcll_valid += recall_score(train_output[folds[i][1]], valid_pred)
            auc_valid += roc_auc_score(train_output[folds[i][1]], valid_pred)
    
        print("Average training accuracy after {0} fold cross validation = {1} with m = {2}".format(len(folds), accu_train/len(folds), m))
        print("Average training precision after {0} fold cross validation = {1} with m = {2}".format(len(folds), prec_train/len(folds), m))
        print("Average training recall after {0} fold cross validation = {1} with m = {2}".format(len(folds), rcll_train/len(folds), m))
        print("Average training AUC-ROC after {0} fold cross validation = {1} with m = {2}".format(len(folds), auc_train/len(folds), m))

        print("Average validation accuracy after {0} fold cross validation = {1} with m = {2}".format(len(folds), accu_valid/len(folds), m))
        print("Average validation precision after {0} fold cross validation = {1} with m = {2}".format(len(folds), prec_valid/len(folds), m))
        print("Average validation recall after {0} fold cross validation = {1} with m = {2}".format(len(folds), rcll_valid/len(folds), m))
        print("Average validation AUC-ROC after {0} fold cross validation = {1} with m = {2}".format(len(folds), auc_valid/len(folds), m))

Performing fold 1
Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.8982781852674263 with m = 10
Average training precision after 5 fold cross validation = 0.6629490413129233 with m = 10
Average training recall after 5 fold cross validation = 0.19845518401146692 with m = 10
Average training AUC-ROC after 5 fold cross validation = 0.5927917543945315 with m = 10
Average validation accuracy after 5 fold cross validation = 0.8978229091675729 with m = 10
Average validation precision after 5 fold cross validation = 0.6542864991940738 with m = 10
Average validation recall after 5 fold cross validation = 0.19827828939720754 with m = 10
Average validation AUC-ROC after 5 fold cross validation = 0.5924582521936343 with m = 10
Performing fold 1
Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.8992999563764099 with m = 25
Average train

Average training accuracy after 5 fold cross validation = 0.8994516958162121 with m = 50
Average training precision after 5 fold cross validation = 0.6438876216435238 with m = 50
Average training recall after 5 fold cross validation = 0.24048087007906074 with m = 50
Average training AUC-ROC after 5 fold cross validation = 0.6117980445182493 with m = 50
Average validation accuracy after 5 fold cross validation = 0.8970540552028712 with m = 50
Average validation precision after 5 fold cross validation = 0.6152674219092129 with m = 50
Average validation recall after 5 fold cross validation = 0.23240187024527598 with m = 50
Average validation AUC-ROC after 5 fold cross validation = 0.6069205774384756 with m = 50
Performing fold 1
Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.9009286811505246 with m = 100
Average training precision after 5 fold cross validation = 0.6592904479047262 with m = 100
Average tra

Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.9013636857268807 with m = 250
Average training precision after 5 fold cross validation = 0.6684641285888977 with m = 250
Average training recall after 5 fold cross validation = 0.24703633149231846 with m = 250
Average training AUC-ROC after 5 fold cross validation = 0.6157370124995213 with m = 250
Average validation accuracy after 5 fold cross validation = 0.8991178741417156 with m = 250
Average validation precision after 5 fold cross validation = 0.639031007767892 with m = 250
Average validation recall after 5 fold cross validation = 0.24137336450408792 with m = 250
Average validation AUC-ROC after 5 fold cross validation = 0.6119992498130771 with m = 250
Performing fold 1
Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.901616614130613 with m = 500
Average training precisi

Average training accuracy after 5 fold cross validation = 0.9073223525417994 with m = 1000
Average training precision after 5 fold cross validation = 0.7046425450748194 with m = 1000
Average training recall after 5 fold cross validation = 0.30531658549916607 with m = 1000
Average training AUC-ROC after 5 fold cross validation = 0.6445351168950174 with m = 1000
Average validation accuracy after 5 fold cross validation = 0.8958401355471427 with m = 1000
Average validation precision after 5 fold cross validation = 0.5895327307507119 with m = 1000
Average validation recall after 5 fold cross validation = 0.24928315875127546 with m = 1000
Average validation AUC-ROC after 5 fold cross validation = 0.6136055317461169 with m = 1000
Performing fold 1
Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.8974283560845876 with m = 10
Average training precision after 5 fold cross validation = 0.6493250026717143 with m = 

### Attempt 4: Gaussian Naive Bayes with 5 fold cross validation on all versions

In [11]:
for v in range(0, 5):
    accu_train = prec_train = rcll_train = auc_train = 0.0
    accu_valid = prec_valid = rcll_valid = auc_valid = 0.0
    gnb = GaussianNB()
    for i in range(0, len(folds)):
        print("Performing fold {0}".format(i+1))
        gnb.fit(train_input[v][folds[i][0]], train_output[folds[i][0]])

        train_pred = gnb.predict(train_input[v][folds[i][0]])
        valid_pred = gnb.predict(train_input[v][folds[i][1]])

        accu_train += accuracy_score(train_output[folds[i][0]], train_pred)
        prec_train += precision_score(train_output[folds[i][0]], train_pred)
        rcll_train += recall_score(train_output[folds[i][0]], train_pred)
        auc_train += roc_auc_score(train_output[folds[i][0]], train_pred)
    
        accu_valid += accuracy_score(train_output[folds[i][1]], valid_pred)
        prec_valid += precision_score(train_output[folds[i][1]], valid_pred)
        rcll_valid += recall_score(train_output[folds[i][1]], valid_pred)
        auc_valid += roc_auc_score(train_output[folds[i][1]], valid_pred)
    
    print("Average training accuracy after {0} fold cross validation = {1}".format(len(folds), accu_train/len(folds)))
    print("Average training precision after {0} fold cross validation = {1}".format(len(folds), prec_train/len(folds)))
    print("Average training recall after {0} fold cross validation = {1}".format(len(folds), rcll_train/len(folds)))
    print("Average training AUC-ROC after {0} fold cross validation = {1}".format(len(folds), auc_train/len(folds)))

    print("Average validation accuracy after {0} fold cross validation = {1}".format(len(folds), accu_valid/len(folds)))
    print("Average validation precision after {0} fold cross validation = {1}".format(len(folds), prec_valid/len(folds)))
    print("Average validation recall after {0} fold cross validation = {1}".format(len(folds), rcll_valid/len(folds)))
    print("Average validation AUC-ROC after {0} fold cross validation = {1}\n".format(len(folds), auc_valid/len(folds)))

Performing fold 1
Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.8243160894014796
Average training precision after 5 fold cross validation = 0.3394151279088632
Average training recall after 5 fold cross validation = 0.5911452316478278
Average training AUC-ROC after 5 fold cross validation = 0.7225324260861753
Average validation accuracy after 5 fold cross validation = 0.8242151186758637
Average validation precision after 5 fold cross validation = 0.3390336621951001
Average validation recall after 5 fold cross validation = 0.5890723686759749
Average validation AUC-ROC after 5 fold cross validation = 0.7215703778249869

Performing fold 1
Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.8757789738815414
Average training precision after 5 fold cross validation = 0.44390419980927415
Average training recall after 5 fold cross 

### Attempt 5: K-NN with 5 fold cross validation on all versions and multiple choices of k

In [12]:
for v in range(0, 5):
    for k in [10, 25, 50, 100, 200, 250, 500, 1000]:
        accu_train = prec_train = rcll_train = auc_train = 0.0
        accu_valid = prec_valid = rcll_valid = auc_valid = 0.0
        knn_clf = KNeighborsClassifier(n_neighbors=k)
        for i in range(0, len(folds)):
            print("Performing fold {0}".format(i+1))
            knn_clf.fit(train_input[v][folds[i][0]], train_output[folds[i][0]])

            train_pred = knn_clf.predict(train_input[v][folds[i][0]])
            valid_pred = knn_clf.predict(train_input[v][folds[i][1]])

            accu_train += accuracy_score(train_output[folds[i][0]], train_pred)
            prec_train += precision_score(train_output[folds[i][0]], train_pred)
            rcll_train += recall_score(train_output[folds[i][0]], train_pred)
            auc_train += roc_auc_score(train_output[folds[i][0]], train_pred)
    
            accu_valid += accuracy_score(train_output[folds[i][1]], valid_pred)
            prec_valid += precision_score(train_output[folds[i][1]], valid_pred)
            rcll_valid += recall_score(train_output[folds[i][1]], valid_pred)
            auc_valid += roc_auc_score(train_output[folds[i][1]], valid_pred)
    
        print("Average training accuracy after {0} fold cross validation = {1} with k = {2}".format(len(folds), accu_train/len(folds), k))
        print("Average training precision after {0} fold cross validation = {1} with k = {2}".format(len(folds), prec_train/len(folds), k))
        print("Average training recall after {0} fold cross validation = {1} with k = {2}".format(len(folds), rcll_train/len(folds), k))
        print("Average training AUC-ROC after {0} fold cross validation = {1} with k = {2}".format(len(folds), auc_train/len(folds), k))

        print("Average validation accuracy after {0} fold cross validation = {1} with k = {2}".format(len(folds), accu_valid/len(folds), k))
        print("Average validation precision after {0} fold cross validation = {1} with k = {2}".format(len(folds), prec_valid/len(folds), k))
        print("Average validation recall after {0} fold cross validation = {1} with k = {2}".format(len(folds), rcll_valid/len(folds), k))
        print("Average validation AUC-ROC after {0} fold cross validation = {1} with k = {2}".format(len(folds), auc_valid/len(folds), k))

Performing fold 1
Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.9034679653725795 with k = 10
Average training precision after 5 fold cross validation = 0.7145243535831045 with k = 10
Average training recall after 5 fold cross validation = 0.23832651182363662 with k = 10
Average training AUC-ROC after 5 fold cross validation = 0.6131207551205804 with k = 10
Average validation accuracy after 5 fold cross validation = 0.8949095020534763 with k = 10
Average validation precision after 5 fold cross validation = 0.6026689549331096 with k = 10
Average validation recall after 5 fold cross validation = 0.19827893520013432 with k = 10
Average validation AUC-ROC after 5 fold cross validation = 0.590816821843719 with k = 10
Performing fold 1
Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.9001699614360801 with k = 25
Average traini

Average training accuracy after 5 fold cross validation = 0.8970844149236493 with k = 50
Average training precision after 5 fold cross validation = 0.6414389645114091 with k = 50
Average training recall after 5 fold cross validation = 0.19603092131092298 with k = 50
Average training AUC-ROC after 5 fold cross validation = 0.591060883328898 with k = 50
Average validation accuracy after 5 fold cross validation = 0.8962043278315642 with k = 50
Average validation precision after 5 fold cross validation = 0.6294720080341452 with k = 50
Average validation recall after 5 fold cross validation = 0.19109373183679268 with k = 50
Average validation AUC-ROC after 5 fold cross validation = 0.5884099094678679 with k = 50
Performing fold 1
Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.8963863717450575 with k = 100
Average training precision after 5 fold cross validation = 0.6269101714043568 with k = 100
Average trai

  'precision', 'predicted', average, warn_for)


Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.8873421824678063 with k = 1000
Average training precision after 5 fold cross validation = 0.0 with k = 1000
Average training recall after 5 fold cross validation = 0.0 with k = 1000
Average training AUC-ROC after 5 fold cross validation = 0.5 with k = 1000
Average validation accuracy after 5 fold cross validation = 0.8873421904486077 with k = 1000
Average validation precision after 5 fold cross validation = 0.0 with k = 1000
Average validation recall after 5 fold cross validation = 0.0 with k = 1000
Average validation AUC-ROC after 5 fold cross validation = 0.5 with k = 1000
Performing fold 1
Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.9056834658883949 with k = 10
Average training precision after 5 fold cross validation = 0.7249215837631835 with k = 10
Average training 

Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.8995933163593367 with k = 50
Average training precision after 5 fold cross validation = 0.6677019139161213 with k = 50
Average training recall after 5 fold cross validation = 0.21650572902012916 with k = 50
Average training AUC-ROC after 5 fold cross validation = 0.6014122901701718 with k = 50
Average validation accuracy after 5 fold cross validation = 0.8979848934036363 with k = 50
Average validation precision after 5 fold cross validation = 0.6470981886622231 with k = 50
Average validation recall after 5 fold cross validation = 0.2083327951642277 with k = 50
Average validation AUC-ROC after 5 fold cross validation = 0.5969382730581932 with k = 50
Performing fold 1
Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.8981466498483724 with k = 100
Average training precision afte

Average training accuracy after 5 fold cross validation = 0.895880540010525 with k = 200
Average training precision after 5 fold cross validation = 0.614755686278101 with k = 200
Average training recall after 5 fold cross validation = 0.20294532822653916 with k = 200
Average training AUC-ROC after 5 fold cross validation = 0.59340079337213 with k = 200
Average validation accuracy after 5 fold cross validation = 0.8958804821810455 with k = 200
Average validation precision after 5 fold cross validation = 0.6141296514866328 with k = 200
Average validation recall after 5 fold cross validation = 0.20294421554318487 with k = 200
Average validation AUC-ROC after 5 fold cross validation = 0.5934002496098556 with k = 200
Performing fold 1
Performing fold 2
Performing fold 3
Performing fold 4
Performing fold 5
Average training accuracy after 5 fold cross validation = 0.895880540010525 with k = 250
Average training precision after 5 fold cross validation = 0.614755686278101 with k = 250
Average t

### Submission 1: Based on Gaussian Naive Bayes and version 4 of the dataset

In [13]:
test_input_raw = np.genfromtxt('testData.csv', delimiter=',', dtype=str)[1:,1:].tolist()
for i in range(0, len(test_input_raw)):
    for j in range(0, d-1):
        if test_input_raw[i][j].find("\"") != -1:
            test_input_raw[i][j] = test_input_raw[i][j][1:-1] # removing double quotes in pairs
        else:
            test_input_raw[i][j] = float(test_input_raw[i][j])

test_text_data = np.hstack((np.array(test_input_raw)[:,1:10], np.array(test_input_raw)[:,13:1]))
print('Size of testing text data : {0}'.format(test_text_data.shape))

# Converting the textual data into one vector per training input
test_text_vectors = []

for j in range(0, test_text_data.shape[1]):
    lbl_enc = LabelEncoder()
    lbl_enc.fit(train_text_data[:,j])
    test_text_vectors.append(lbl_enc.transform(test_text_data[:,j]))
test_text_vectors = np.array(test_text_vectors).T

print('Size of testing text data: {0}'.format(test_text_vectors.shape))
print('Text: {0} --> Data: {1}'.format(test_text_data[0], test_text_vectors[0]))
print('Text: {0} --> Data: {1}'.format(test_text_data[10], test_text_vectors[10]))

test_input_sub_1 = np.hstack((np.array(test_input_raw)[:,0:1], 
                         test_text_vectors[:,0:test_text_vectors.shape[1] - 1], 
                         np.array(test_input_raw)[:,10:13],
                         test_text_vectors[:,test_text_vectors.shape[1] - 1].reshape(-1, 1),
                         np.array(test_input_raw)[:,14:]
                        )).astype(float)

test_input_sub_1 = mean_reduce.transform(test_input_sub_1)
test_input_sub_1 = normalize(test_input_sub_1)
print("Submission 1: Shape {0}".format(test_input_sub_1.shape))
print('Vector = {0}: \nNorm = {1}'.format(test_input_sub_1[10], np.linalg.norm(test_input_sub_1[10])))

sub_file = open('sub_1.csv','w')
sub_file.write('Id,Class\n')
gnb = GaussianNB()
gnb.fit(train_input_4, train_output)
sub_1_preds = gnb.predict(test_input_sub_1)
for i in range(0, len(sub_1_preds)):
    sub_file.write('{0},{1}\n'.format(i+1, int(sub_1_preds[i])))
sub_file.close()

Size of testing text data : (16476, 9)
Size of testing text data: (16476, 9)
Text: ['services' 'single' 'basic.4y' 'unknown' 'yes' 'yes' 'cellular' 'may'
 'thu'] --> Data: [8 3 1 2 3 3 1 7 3]
Text: ['admin.' 'single' 'high.school' 'no' 'yes' 'no' 'telephone' 'may' 'thu'] --> Data: [1 3 4 1 3 1 2 7 3]
Submission 1: Shape (16476, 18)
Vector = [-0.17885665 -0.32391692  0.42279692 -0.10787047 -0.15898131  0.29250069
 -0.13966858  0.41019572  0.23902924  0.27118096  0.06106691 -0.1095287
  0.00222596  0.201173    0.22460245  0.27753038  0.22173972  0.10286934]: 
Norm = 1.0000000000000002


I think it is time to find out feature importances:
+ Contact will not affect a lot I guess
+ Group data in same months (Jan, Feb, ..., Nov, Dec). Now run classifiers on each one of them and test using appropriate classifer for the test data. This is some idea taken from time series data
+ Group ages. The behaviour in different age groups might be different.