In [1]:
from __future__ import print_function
from dataPreprocessing import Items
import tensorflow as tf
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from astropy.table import Table
import math 

# Data set: author records prepared by the project's own preprocessing
# module (dataPreprocessing.Items), sourced from the Mondaq database.
# Task: binary classification - is the entry an author or not.
# NOTE(review): judging by the recorded run output, createData() prompts
# interactively for the path to authorData.csv - confirm in dataPreprocessing.
my_items = Items([],[])
my_items.createData()

# Split into a training set and a held-out test set; both objects expose
# .data and .labels, which the rest of the script reads.
train_items, test_items = my_items.splitTrainTest()

# Training parameters
learning_rate = 0.009  # step size passed to the Adam optimizer
training_epochs = 1200  # number of passes of the training loop
batch_size = 100  # examples requested per training step
display_step = 1  # print the average cost every `display_step` epochs
logs_path = '/tmp/tensorflow_logs/example'  # Tensorboard event directory
beta = 0.005  # weight of the L2 penalty on the output-layer weights
# Network parameters
n_hidden_1 = 35 # width of the 1st hidden layer
n_hidden_2 = 25 # width of the 2nd hidden layer
n_input = 10 # number of input features per author record
n_classes = 2 # author classes 0 - no ; 1 - yes

# Graph inputs: a batch of feature rows and the matching label rows.
# The leading None leaves the batch dimension unspecified.
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes])

#model inspired by this tutorial
#https://github.com/aymericdamien/TensorFlow-Examples/blob/master/notebooks/3_NeuralNetworks/multilayer_perceptron.ipynb
#but the tutorial is generic and found that most of them are similar

# Create model
def multilayer_perceptron(x, weights, biases):
    """Build a two-hidden-layer perceptron and return its output logits.

    Args:
        x: input tensor fed to the first layer.
        weights: dict with the weight tensors under 'h1', 'h2' and 'out'.
        biases: dict with the bias tensors under 'b1', 'b2' and 'out'.

    Returns:
        The linear (un-activated) output of the last layer.
    """
    # First hidden layer: affine transform followed by ReLU
    hidden_1 = tf.nn.relu(tf.add(tf.matmul(x, weights['h1']), biases['b1']))
    # Second hidden layer: same pattern, fed by the first
    hidden_2 = tf.nn.relu(
        tf.add(tf.matmul(hidden_1, weights['h2']), biases['b2']))
    # Output layer stays linear; softmax is applied by the loss
    return tf.matmul(hidden_2, weights['out']) + biases['out']

# Store layers weight & bias
# Every variable is drawn uniformly from [-limit, +limit] with
# limit = sqrt(6 / k). The ratios use float literals on purpose: the file
# only enables print_function from __future__, so under Python 2 a bare
# `6/7` is integer division (== 0, i.e. an all-zero output layer) and
# `6/5` collapses to 1. Python 3 results are unchanged.
# NOTE(review): the denominators 5, 6, 7 do not match the usual Glorot
# fan_in + fan_out term for these layer sizes - confirm they are intended.
weights = {
    'h1': tf.Variable(tf.random_uniform([n_input, n_hidden_1], minval=-math.sqrt(6.0 / 5), maxval=math.sqrt(6.0 / 5))),
    'h2': tf.Variable(tf.random_uniform([n_hidden_1, n_hidden_2], minval=-math.sqrt(6.0 / 6), maxval=math.sqrt(6.0 / 6))),
    'out': tf.Variable(tf.random_uniform([n_hidden_2, n_classes], minval=-math.sqrt(6.0 / 7), maxval=math.sqrt(6.0 / 7)))
}
biases = {
    'b1': tf.Variable(tf.random_uniform([n_hidden_1], minval=-math.sqrt(6.0 / 5), maxval=math.sqrt(6.0 / 5))),
    'b2': tf.Variable(tf.random_uniform([n_hidden_2], minval=-math.sqrt(6.0 / 5), maxval=math.sqrt(6.0 / 5))),
    'out': tf.Variable(tf.random_uniform([n_classes], minval=-math.sqrt(6.0 / 5), maxval=math.sqrt(6.0 / 5)))
}

# Assemble the graph. Each stage is wrapped in its own name scope so that
# Tensorboard's graph view groups the ops per stage.
with tf.name_scope('Model'):
    # Output logits of the perceptron
    pred = multilayer_perceptron(x, weights, biases)

with tf.name_scope('Loss'):
    # Per-example softmax cross entropy ...
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y)
    # ... plus a beta-scaled L2 penalty on the output-layer weights only
    l2_penalty = beta * tf.nn.l2_loss(weights['out'])
    # The scalar penalty broadcasts over the batch before averaging
    cost = tf.reduce_mean(cross_entropy + l2_penalty)

with tf.name_scope('Adam'):
    # Adam update step that minimizes the regularized cost
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Track the cost tensor as a scalar summary named "loss"
tf.summary.scalar("loss", cost)

# Single op that evaluates every registered summary
merged_summary_op = tf.summary.merge_all()

# Op that initializes all variables created above
init = tf.global_variables_initializer()

# Launch the graph: train the network, then report accuracy on the test set
with tf.Session() as sess:
    sess.run(init)
    # Writer that stores the graph and the per-step loss summaries under
    # logs_path for Tensorboard
    summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())

    # Close the writer in a finally clause so that pending events are
    # flushed to disk even if training is interrupted or raises.
    try:
        # Training cycle
        for epoch in range(training_epochs):
            avg_cost = 0.
            # Number of full batches per epoch (a trailing partial batch is
            # not counted)
            total_batch = len(train_items.data) // batch_size
            # Loop over all batches
            for i in range(total_batch):
                batch_x, batch_y = train_items.next(batch_size)
                # Run one optimization step and fetch the loss value and
                # the merged summary for the same batch
                _, c, summary = sess.run([optimizer, cost, merged_summary_op],
                                         feed_dict={x: batch_x, y: batch_y})
                # Log every iteration under a global step index
                summary_writer.add_summary(summary, epoch * total_batch + i)
                # Accumulate the epoch's mean loss
                avg_cost += c / total_batch
            # Display logs per epoch step
            if epoch % display_step == 0:
                print("Epoch:", '%04d' % (epoch+1), "cost=",
                      "{:.9f}".format(avg_cost))
    finally:
        summary_writer.close()
    print("Optimization Finished!")

    # Test model: a prediction is correct when the arg-max of the logits
    # matches the arg-max of the label row
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # Accuracy is the mean of the 0/1 correctness indicator
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy:", accuracy.eval({x: test_items.data, y: test_items.labels}))

def calculate_stats(labels_data):
    """Return the percentage (0-100) of entries in ``labels_data`` equal to 1.

    Args:
        labels_data: sized collection of labels; entries that compare equal
            to 1 are counted, everything else is treated as label 0.

    Returns:
        The share of 1-labels as a float percentage. An empty collection
        yields 0.0 instead of raising ZeroDivisionError.
    """
    total = len(labels_data)
    if total == 0:
        return 0.0
    one_label = sum(1 for label in labels_data if label == 1)
    # 100.0 forces float division, so Python 2 does not truncate the result
    return one_label * 100.0 / total

def one_hot_decode(y_data):
    """Collapse one-hot style rows into a flat list of 0/1 labels.

    A row maps to 1 when its first element equals 1, and to 0 otherwise.

    Args:
        y_data: iterable of indexable rows.

    Returns:
        A list with one integer label (0 or 1) per input row.
    """
    return [1 if row[0] == 1 else 0 for row in y_data]

# Report two astropy tables: the label balance of each data split and the
# test accuracy of three scikit-learn baseline classifiers.

# Flatten the one-hot label rows into plain 0/1 lists
train_labels = one_hot_decode(train_items.labels)
test_labels = one_hot_decode(test_items.labels)
total_labels = train_labels + test_labels

# Share of 1-labels per split
train_labels_one_percentage = calculate_stats(train_labels)
test_labels_one_percentage = calculate_stats(test_labels)
total_labels_one_percentage = calculate_stats(total_labels)

# Columns of the label statistics table; the 0-label share is the complement
a3 = ['Train', 'Test', 'Total']
b3 = [
    train_labels_one_percentage,
    test_labels_one_percentage,
    total_labels_one_percentage,
]
c3 = [100 - one_percentage for one_percentage in b3]

t3 = Table(
    [a3, b3, c3],
    names=('Labels Sample', 'Label-1-Percentage', 'Label-0-Percentage'),
    meta={'name': 'first table'},
)
print('')
print('Data Binary Label Statistics')
print('')
print(t3)

# Baseline 1: k-nearest neighbours with k = 20
neigh = KNeighborsClassifier(n_neighbors=20)
neigh.fit(train_items.data, train_labels)
y_pred = neigh.predict(test_items.data)
knn_accuracy = accuracy_score(test_labels, y_pred)

# Baseline 2: support vector classifier with default settings
clf = svm.SVC()
y_pred2 = clf.fit(train_items.data, train_labels).predict(test_items.data)
svm_accuracy = accuracy_score(test_labels, y_pred2)

# Baseline 3: Gaussian naive Bayes
gnb = GaussianNB()
gnb.fit(train_items.data, train_labels)
y_pred3 = gnb.predict(test_items.data)
gnb_accuracy = accuracy_score(test_labels, y_pred3)

# Columns of the accuracy table, in the same order as the baselines above
a4 = ['KNN-20', 'SVM', 'GaussianNB']
b4 = [knn_accuracy, svm_accuracy, gnb_accuracy]

t4 = Table(
    [a4, b4],
    names=('ML Algorithm', 'Prediction Accuracy'),
    meta={'name': 'second table'},
)
print('Evaluating the model for 3 ML Algorithms')
print('')
print(t4)



Please enter the file path of the authorData.csv provided: /home/teo/Desktop/Untitled Folder/experimentAuthor
Cannot open file | File I|O Error | Try Again | Error as  Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.
Please enter the file path of the authorData.csv provided: /home/teo/Desktop/Untitled Folder/experimentAuthor/authorData.csv
Total number of the population:  77186
Total training number of entries:  46311
Epoch: 0001 cost= 0.081312190
Epoch no:  1
Epoch: 0002 cost= 0.044684742
Epoch no:  2
Epoch: 0003 cost= 0.038809388
Epoch no:  3
Epoch: 0004 cost= 0.036565254
Epoch no:  4
Epoch: 0005 cost= 0.035433033
Epoch no:  5
Epoch: 0006 cost= 0.034882529
Epoch no:  6
Epoch: 0007 cost= 0.034418351
Epoch no:  7
Epoch: 0008 cost= 0.034273084
Epoch no:  8
Epoch: 0009 cost= 0.034204302
Epoch no:  9
Epoch: 0010 cost= 0.033553952
Epoch no:  10
Epoch: 0011 cost= 0.033758136
Epoch no:  11
Epoch: 0012 cost= 0.033615025
Epoch no:  12
Epoch: 0013 cost

Epoch: 0176 cost= 0.028281650
Epoch no:  176
Epoch: 0177 cost= 0.028492171
Epoch no:  177
Epoch: 0178 cost= 0.028278852
Epoch no:  178
Epoch: 0179 cost= 0.028448371
Epoch no:  179
Epoch: 0180 cost= 0.029166847
Epoch no:  180
Epoch: 0181 cost= 0.028556049
Epoch no:  181
Epoch: 0182 cost= 0.028377204
Epoch no:  182
Epoch: 0183 cost= 0.028262003
Epoch no:  183
Epoch: 0184 cost= 0.028248465
Epoch no:  184
Epoch: 0185 cost= 0.028384643
Epoch no:  185
Epoch: 0186 cost= 0.028579608
Epoch no:  186
Epoch: 0187 cost= 0.028363597
Epoch no:  187
Epoch: 0188 cost= 0.028462538
Epoch no:  188
Epoch: 0189 cost= 0.028222802
Epoch no:  189
Epoch: 0190 cost= 0.028243024
Epoch no:  190
Epoch: 0191 cost= 0.028218515
Epoch no:  191
Epoch: 0192 cost= 0.028843625
Epoch no:  192
Epoch: 0193 cost= 0.028326737
Epoch no:  193
Epoch: 0194 cost= 0.028355034
Epoch no:  194
Epoch: 0195 cost= 0.028411798
Epoch no:  195
Epoch: 0196 cost= 0.028098137
Epoch no:  196
Epoch: 0197 cost= 0.028233144
Epoch no:  197
Epoch: 019

Epoch no:  358
Epoch: 0359 cost= 0.027157856
Epoch no:  359
Epoch: 0360 cost= 0.027320732
Epoch no:  360
Epoch: 0361 cost= 0.027460034
Epoch no:  361
Epoch: 0362 cost= 0.027099994
Epoch no:  362
Epoch: 0363 cost= 0.027286655
Epoch no:  363
Epoch: 0364 cost= 0.027507756
Epoch no:  364
Epoch: 0365 cost= 0.027387044
Epoch no:  365
Epoch: 0366 cost= 0.027264188
Epoch no:  366
Epoch: 0367 cost= 0.028352311
Epoch no:  367
Epoch: 0368 cost= 0.027754061
Epoch no:  368
Epoch: 0369 cost= 0.027880058
Epoch no:  369
Epoch: 0370 cost= 0.027044395
Epoch no:  370
Epoch: 0371 cost= 0.027268514
Epoch no:  371
Epoch: 0372 cost= 0.027210969
Epoch no:  372
Epoch: 0373 cost= 0.027278433
Epoch no:  373
Epoch: 0374 cost= 0.027395337
Epoch no:  374
Epoch: 0375 cost= 0.027214516
Epoch no:  375
Epoch: 0376 cost= 0.027128898
Epoch no:  376
Epoch: 0377 cost= 0.027422979
Epoch no:  377
Epoch: 0378 cost= 0.027342113
Epoch no:  378
Epoch: 0379 cost= 0.027441571
Epoch no:  379
Epoch: 0380 cost= 0.027149288
Epoch no: 

Epoch: 0542 cost= 0.026698054
Epoch no:  541
Epoch: 0543 cost= 0.027613394
Epoch no:  542
Epoch: 0544 cost= 0.026633583
Epoch no:  543
Epoch: 0545 cost= 0.026707640
Epoch no:  544
Epoch: 0546 cost= 0.026597827
Epoch no:  545
Epoch: 0547 cost= 0.026434382
Epoch no:  546
Epoch: 0548 cost= 0.026592872
Epoch no:  547
Epoch: 0549 cost= 0.027554013
Epoch no:  548
Epoch: 0550 cost= 0.028377397
Epoch no:  549
Epoch: 0551 cost= 0.027268071
Epoch no:  550
Epoch: 0552 cost= 0.026792929
Epoch no:  551
Epoch: 0553 cost= 0.026686831
Epoch no:  552
Epoch: 0554 cost= 0.026857880
Epoch no:  553
Epoch: 0555 cost= 0.026722989
Epoch no:  554
Epoch: 0556 cost= 0.026583178
Epoch no:  555
Epoch: 0557 cost= 0.026806169
Epoch no:  556
Epoch: 0558 cost= 0.027049588
Epoch no:  557
Epoch: 0559 cost= 0.026821076
Epoch no:  558
Epoch: 0560 cost= 0.026925304
Epoch no:  559
Epoch: 0561 cost= 0.026543289
Epoch no:  560
Epoch: 0562 cost= 0.026514526
Epoch no:  561
Epoch: 0563 cost= 0.027426537
Epoch no:  562
Epoch: 056

Epoch no:  723
Epoch: 0725 cost= 0.026379016
Epoch no:  724
Epoch: 0726 cost= 0.026657704
Epoch no:  725
Epoch: 0727 cost= 0.028411465
Epoch no:  726
Epoch: 0728 cost= 0.026798425
Epoch no:  727
Epoch: 0729 cost= 0.026251560
Epoch no:  728
Epoch: 0730 cost= 0.026464345
Epoch no:  729
Epoch: 0731 cost= 0.026213053
Epoch no:  730
Epoch: 0732 cost= 0.026477773
Epoch no:  731
Epoch: 0733 cost= 0.026482566
Epoch no:  732
Epoch: 0734 cost= 0.026296616
Epoch no:  733
Epoch: 0735 cost= 0.026949088
Epoch no:  734
Epoch: 0736 cost= 0.027075605
Epoch no:  735
Epoch: 0737 cost= 0.026506904
Epoch no:  736
Epoch: 0738 cost= 0.026591062
Epoch no:  737
Epoch: 0739 cost= 0.026142939
Epoch no:  738
Epoch: 0740 cost= 0.026655724
Epoch no:  739
Epoch: 0741 cost= 0.026221411
Epoch no:  740
Epoch: 0742 cost= 0.026827774
Epoch no:  741
Epoch: 0743 cost= 0.027184598
Epoch no:  742
Epoch: 0744 cost= 0.026603461
Epoch no:  743
Epoch: 0745 cost= 0.026466260
Epoch no:  744
Epoch: 0746 cost= 0.026468765
Epoch no: 

Epoch no:  906
Epoch: 0908 cost= 0.026976318
Epoch no:  907
Epoch: 0909 cost= 0.026636610
Epoch no:  908
Epoch: 0910 cost= 0.026325057
Epoch no:  909
Epoch: 0911 cost= 0.026492836
Epoch no:  910
Epoch: 0912 cost= 0.026300622
Epoch no:  911
Epoch: 0913 cost= 0.026492421
Epoch no:  912
Epoch: 0914 cost= 0.026574845
Epoch no:  913
Epoch: 0915 cost= 0.027174029
Epoch no:  914
Epoch: 0916 cost= 0.028288145
Epoch no:  915
Epoch: 0917 cost= 0.028383916
Epoch no:  916
Epoch: 0918 cost= 0.027041300
Epoch no:  917
Epoch: 0919 cost= 0.026451993
Epoch no:  918
Epoch: 0920 cost= 0.026124335
Epoch no:  919
Epoch: 0921 cost= 0.026195971
Epoch no:  920
Epoch: 0922 cost= 0.026171526
Epoch no:  921
Epoch: 0923 cost= 0.026379203
Epoch no:  922
Epoch: 0924 cost= 0.026258934
Epoch no:  923
Epoch: 0925 cost= 0.027046584
Epoch no:  924
Epoch: 0926 cost= 0.026566439
Epoch no:  925
Epoch: 0927 cost= 0.026886606
Epoch: 0928 cost= 0.027277696
Epoch no:  926
Epoch: 0929 cost= 0.028105619
Epoch no:  927
Epoch: 093

Epoch: 1089 cost= 0.026195128
Epoch no:  1087
Epoch: 1090 cost= 0.026658942
Epoch no:  1088
Epoch: 1091 cost= 0.026362068
Epoch no:  1089
Epoch: 1092 cost= 0.026763721
Epoch no:  1090
Epoch: 1093 cost= 0.028399727
Epoch no:  1091
Epoch: 1094 cost= 0.026408390
Epoch no:  1092
Epoch: 1095 cost= 0.026362439
Epoch no:  1093
Epoch: 1096 cost= 0.026228691
Epoch no:  1094
Epoch: 1097 cost= 0.026454727
Epoch no:  1095
Epoch: 1098 cost= 0.026206496
Epoch no:  1096
Epoch: 1099 cost= 0.026846598
Epoch no:  1097
Epoch: 1100 cost= 0.026744880
Epoch no:  1098
Epoch: 1101 cost= 0.026646302
Epoch no:  1099
Epoch: 1102 cost= 0.029213309
Epoch no:  1100
Epoch: 1103 cost= 0.029606638
Epoch no:  1101
Epoch: 1104 cost= 0.029176546
Epoch no:  1102
Epoch: 1105 cost= 0.026605818
Epoch no:  1103
Epoch: 1106 cost= 0.026429871
Epoch no:  1104
Epoch: 1107 cost= 0.025994451
Epoch no:  1105
Epoch: 1108 cost= 0.026168635
Epoch no:  1106
Epoch: 1109 cost= 0.026328879
Epoch no:  1107
Epoch: 1110 cost= 0.026236717
Epoc