In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import json
from tqdm import tqdm
plt.style.use('fivethirtyeight')

%matplotlib inline

In [2]:
input_X = np.load("../outputs/10_user_X_Nov_11.npy")
input_y = np.load("../outputs/10_user_y_Nov_11.npy")
num_classes_for_prediction = 10

# put X in the right format
X = np.array([_ for _ in input_X])

# put y in the right format
def ohe(n):
    vec = np.zeros(num_classes_for_prediction)
    vec[n] = 1
    return vec

y = np.array([ohe(n) for n in input_y])
with open('../outputs/Nov11_char_dic.json', 'r') as fp:
    charToIndex = json.load(fp)

# constants
vocab_size = len(charToIndex)
embedding_dims = 300
tweet_size = 140
batch_size = 100
epochs = 10

In [3]:
# Model

# Placeholders for inputs
train_inputs = tf.placeholder(tf.int32, shape=[None, tweet_size], name='train_inputs')
train_labels = tf.placeholder(tf.int32, shape=[None, num_classes_for_prediction], name='train_labels')

# Create the embedding matrix
embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_dims], -1.0, 1.0), name='embeddings')

# Extract the character embeddings for the tweet
embed =  tf.reshape(tf.nn.embedding_lookup(embeddings, train_inputs), [-1, tweet_size, embedding_dims, 1],
                    name='embed')

# The convolutional part
n_gram = 2
num_filters = 32
W_conv = tf.Variable(tf.truncated_normal([n_gram, embedding_dims, 1, num_filters], stddev=0.05), name="W_conv")
b_conv = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b_conv")

conv = tf.nn.conv2d(
        embed,
        W_conv, 
        strides=[1, 1, 1, 1], 
        padding="VALID", 
        name="convolution")

h = tf.nn.tanh(tf.nn.bias_add(conv, b_conv), name="relu")

pooled = tf.nn.max_pool(
        h,
        ksize=[1, h.get_shape()[1].value, 1, 1],
        strides=[1, 1, 1, 1],
        padding='VALID',
        name="max_pool")

# reshape and add FC
h_pool_flat = tf.reshape(pooled, [-1, num_filters], name='flatten')

W_fc = tf.Variable(tf.truncated_normal([num_filters, num_classes_for_prediction], stddev=0.05), name="W_fc")
b_fc = tf.Variable(tf.constant(0.1, shape=[num_classes_for_prediction]), name="b_fc")
fc_output = tf.nn.tanh(tf.matmul(h_pool_flat, W_fc) + b_fc, name="fc")

# cross entropy loss function
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(
    _sentinel=None,
    labels=train_labels,
    logits=fc_output,
    dim=-1,
    name='loss'
))

preds = tf.argmax(fc_output, 1, name='predictions')
true_vals = tf.argmax(train_labels, 1, name='true_vals')
acc = tf.reduce_mean(tf.cast(tf.equal(preds, true_vals), tf.float32), name = 'acc')

# SGD optimizer
learning_rate = 0.5
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate, name='optimizer')
train_step = optimizer.minimize(loss, name='train_step')

In [4]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

In [5]:
# Train data
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True)

cv_accuracy_val = []
cv_loss_val = []

split_num = 0
for train_index, test_index in kf.split(X):
    
    # reset the network parameters
    sess.run(init)
    
    print("Split: {}".format(split_num))
    split_num += 1
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # around 2 minutes per epoch
    for e in range(epochs):
        print("Epoch: {}".format(e))
        pairs = zip(np.split(X, indices_or_sections=np.arange(0, len(X), batch_size)), 
                                    np.split(y, indices_or_sections=np.arange(0, len(X), batch_size)))
        train_acc = []
        train_loss = []
        for batch_x, batch_y in tqdm(pairs):
            if len(batch_x)==batch_size:
                feed_dict = {train_inputs: batch_x, train_labels: batch_y}
                _, train_l, train_a = sess.run([train_step, loss, acc], feed_dict=feed_dict)
                train_acc.append(train_a)
                train_loss.append(train_l)
        
        val_acc, val_loss = sess.run([acc, loss], feed_dict={train_inputs: X_test, train_labels: y_test})
        
        print("Training accuracy: {}".format(np.mean(train_acc)))
        print("Training loss: {}".format(np.mean(train_loss)))
        print("Validation accuracy: {}".format(val_acc))
        print("Validation loss: {}".format(val_loss))
    
    cv_loss_val.append(val_loss)
    cv_accuracy_val.append(val_acc)

0it [00:00, ?it/s]

Split: 0
Epoch: 0


901it [02:14,  6.71it/s]
0it [00:00, ?it/s]

Training accuracy: 0.621757447719574
Training loss: 1.3595110177993774
Validation accuracy: 0.7850789427757263
Validation loss: 1.114120602607727
Epoch: 1


901it [02:00,  7.48it/s]
0it [00:00, ?it/s]

Training accuracy: 0.8522024750709534
Training loss: 1.0158101320266724
Validation accuracy: 0.8150433897972107
Validation loss: 1.049970269203186
Epoch: 2


901it [02:08,  7.03it/s]
0it [00:00, ?it/s]

Training accuracy: 0.866718590259552
Training loss: 0.9897243976593018
Validation accuracy: 0.8405047655105591
Validation loss: 1.0179898738861084
Epoch: 3


901it [02:12,  6.78it/s]
0it [00:00, ?it/s]

Training accuracy: 0.872157871723175
Training loss: 0.9735020995140076
Validation accuracy: 0.8538470268249512
Validation loss: 1.0044142007827759
Epoch: 4


901it [02:06,  7.15it/s]
0it [00:00, ?it/s]

Training accuracy: 0.8806562423706055
Training loss: 0.9659802913665771
Validation accuracy: 0.8691905736923218
Validation loss: 0.982292652130127
Epoch: 5


901it [01:59,  7.54it/s]
2it [00:00, 10.65it/s]

Training accuracy: 0.886329174041748
Training loss: 0.9555267691612244
Validation accuracy: 0.8670780658721924
Validation loss: 0.9787936806678772
Epoch: 6


901it [01:59,  7.53it/s]
0it [00:00, ?it/s]

Training accuracy: 0.8901891112327576
Training loss: 0.9460486173629761
Validation accuracy: 0.8756393194198608
Validation loss: 0.9686955809593201
Epoch: 7


901it [01:57,  7.69it/s]
2it [00:00, 10.70it/s]

Training accuracy: 0.8898220658302307
Training loss: 0.9416287541389465
Validation accuracy: 0.8808094263076782
Validation loss: 0.9594376087188721
Epoch: 8


901it [01:57,  7.66it/s]
2it [00:00, 10.54it/s]

Training accuracy: 0.8906006217002869
Training loss: 0.9394266605377197
Validation accuracy: 0.8749166131019592
Validation loss: 0.9695035219192505
Epoch: 9


901it [01:58,  7.61it/s]


Training accuracy: 0.892892062664032
Training loss: 0.9345294833183289
Validation accuracy: 0.8778630495071411
Validation loss: 0.9682967662811279
Split: 1


2it [00:00, 11.70it/s]

Epoch: 0


901it [01:56,  7.70it/s]
0it [00:00, ?it/s]

Training accuracy: 0.5957174897193909
Training loss: 1.365028738975525
Validation accuracy: 0.7740715742111206
Validation loss: 1.120204210281372
Epoch: 1


901it [01:57,  7.66it/s]
0it [00:00, ?it/s]

Training accuracy: 0.8433815240859985
Training loss: 1.0049628019332886
Validation accuracy: 0.8348343372344971
Validation loss: 1.0165742635726929
Epoch: 2


901it [01:57,  7.69it/s]
2it [00:00, 12.63it/s]

Training accuracy: 0.8690322637557983
Training loss: 0.9668928980827332
Validation accuracy: 0.8542361855506897
Validation loss: 0.9963545799255371
Epoch: 3


901it [01:56,  7.74it/s]
2it [00:00, 12.95it/s]

Training accuracy: 0.8775194883346558
Training loss: 0.953532338142395
Validation accuracy: 0.8615744113922119
Validation loss: 0.9811009168624878
Epoch: 4


901it [01:55,  7.78it/s]
2it [00:00, 12.51it/s]

Training accuracy: 0.8825473189353943
Training loss: 0.9502291083335876
Validation accuracy: 0.8680230975151062
Validation loss: 0.975006103515625
Epoch: 5


901it [01:56,  7.74it/s]
2it [00:00, 13.28it/s]

Training accuracy: 0.8909121155738831
Training loss: 0.943412184715271
Validation accuracy: 0.8769179582595825
Validation loss: 0.9732341170310974
Epoch: 6


901it [01:56,  7.75it/s]
2it [00:00, 12.40it/s]

Training accuracy: 0.8919243812561035
Training loss: 0.9380257725715637
Validation accuracy: 0.8795307874679565
Validation loss: 0.9605904817581177
Epoch: 7


901it [02:00,  7.47it/s]
2it [00:00, 12.85it/s]

Training accuracy: 0.892625093460083
Training loss: 0.9334219694137573
Validation accuracy: 0.8793084025382996
Validation loss: 0.9618098139762878
Epoch: 8


901it [01:56,  7.74it/s]
2it [00:00, 13.53it/s]

Training accuracy: 0.894137978553772
Training loss: 0.9310691356658936
Validation accuracy: 0.8808094263076782
Validation loss: 0.9579906463623047
Epoch: 9


901it [01:56,  7.75it/s]


Training accuracy: 0.8956395983695984
Training loss: 0.9282199144363403
Validation accuracy: 0.8900933861732483
Validation loss: 0.9409518241882324
Split: 2


2it [00:00, 13.78it/s]

Epoch: 0


901it [01:55,  7.81it/s]
2it [00:00, 12.83it/s]

Training accuracy: 0.5846050977706909
Training loss: 1.3852206468582153
Validation accuracy: 0.7433702349662781
Validation loss: 1.1890841722488403
Epoch: 1


901it [01:55,  7.78it/s]
2it [00:00, 11.43it/s]

Training accuracy: 0.8218130469322205
Training loss: 1.0470205545425415
Validation accuracy: 0.7858453392982483
Validation loss: 1.1201415061950684
Epoch: 2


901it [01:55,  7.79it/s]
2it [00:00, 13.00it/s]

Training accuracy: 0.836507260799408
Training loss: 1.00812828540802
Validation accuracy: 0.8092511296272278
Validation loss: 1.0638507604599
Epoch: 3


901it [01:55,  7.81it/s]
2it [00:00, 11.70it/s]

Training accuracy: 0.8465072512626648
Training loss: 0.9890084862709045
Validation accuracy: 0.8220381140708923
Validation loss: 1.0325642824172974
Epoch: 4


901it [01:56,  7.74it/s]
2it [00:00, 12.54it/s]

Training accuracy: 0.8492324352264404
Training loss: 0.988732635974884
Validation accuracy: 0.8076388239860535
Validation loss: 1.0451560020446777
Epoch: 5


901it [02:01,  7.43it/s]
2it [00:00, 10.59it/s]

Training accuracy: 0.8485984802246094
Training loss: 0.9890775084495544
Validation accuracy: 0.8162561655044556
Validation loss: 1.0373412370681763
Epoch: 6


901it [01:56,  7.70it/s]
0it [00:00, ?it/s]

Training accuracy: 0.8563514947891235
Training loss: 0.9797482490539551
Validation accuracy: 0.8374937176704407
Validation loss: 0.9956410527229309
Epoch: 7


901it [01:58,  7.62it/s]
0it [00:00, ?it/s]

Training accuracy: 0.8653948903083801
Training loss: 0.9754776358604431
Validation accuracy: 0.8495579957962036
Validation loss: 1.0011910200119019
Epoch: 8


901it [01:57,  7.64it/s]
0it [00:00, ?it/s]

Training accuracy: 0.8759732842445374
Training loss: 0.9688833951950073
Validation accuracy: 0.8734085559844971
Validation loss: 0.9849368333816528
Epoch: 9


901it [02:20,  6.41it/s]


Training accuracy: 0.881145715713501
Training loss: 0.962029755115509
Validation accuracy: 0.864791214466095
Validation loss: 0.9859438538551331
Split: 3


2it [00:00, 11.58it/s]

Epoch: 0


901it [02:07,  7.09it/s]
0it [00:00, ?it/s]

Training accuracy: 0.61815345287323
Training loss: 1.3470797538757324
Validation accuracy: 0.7785623073577881
Validation loss: 1.1477216482162476
Epoch: 1


901it [02:18,  6.49it/s]
2it [00:00, 12.38it/s]

Training accuracy: 0.838620662689209
Training loss: 1.0034639835357666
Validation accuracy: 0.8261522054672241
Validation loss: 1.0127010345458984
Epoch: 2


901it [02:05,  7.16it/s]
0it [00:00, ?it/s]

Training accuracy: 0.8554727435112
Training loss: 0.9653735756874084
Validation accuracy: 0.837215781211853
Validation loss: 0.994674563407898
Epoch: 3


901it [02:00,  7.48it/s]
2it [00:00, 11.80it/s]

Training accuracy: 0.8590100407600403
Training loss: 0.9565171003341675
Validation accuracy: 0.8516150712966919
Validation loss: 0.9739634990692139
Epoch: 4


901it [01:57,  7.65it/s]
2it [00:00, 10.26it/s]

Training accuracy: 0.8605005741119385
Training loss: 0.9489232301712036
Validation accuracy: 0.8548951745033264
Validation loss: 0.9661468863487244
Epoch: 5


901it [01:58,  7.61it/s]
2it [00:00, 10.28it/s]

Training accuracy: 0.8614126443862915
Training loss: 0.9481722712516785
Validation accuracy: 0.8468894362449646
Validation loss: 0.9727153182029724
Epoch: 6


901it [01:58,  7.63it/s]
2it [00:00, 11.38it/s]

Training accuracy: 0.8613458871841431
Training loss: 0.9432447552680969
Validation accuracy: 0.8517818450927734
Validation loss: 0.958952784538269
Epoch: 7


901it [02:04,  7.22it/s]
2it [00:00, 10.71it/s]

Training accuracy: 0.8624026775360107
Training loss: 0.9399082660675049
Validation accuracy: 0.8540056943893433
Validation loss: 0.9602863192558289
Epoch: 8


901it [02:18,  6.50it/s]
2it [00:00, 11.94it/s]

Training accuracy: 0.863092303276062
Training loss: 0.9371693134307861
Validation accuracy: 0.8532273173332214
Validation loss: 0.9553122520446777
Epoch: 9


901it [02:11,  6.87it/s]


Training accuracy: 0.8607452511787415
Training loss: 0.9408138394355774
Validation accuracy: 0.842219352722168
Validation loss: 0.9612152576446533
Split: 4


2it [00:00, 11.23it/s]

Epoch: 0


901it [01:56,  7.75it/s]
2it [00:00, 11.19it/s]

Training accuracy: 0.5940934419631958
Training loss: 1.3903923034667969
Validation accuracy: 0.782120406627655
Validation loss: 1.1528294086456299
Epoch: 1


901it [02:04,  7.24it/s]
0it [00:00, ?it/s]

Training accuracy: 0.8394771814346313
Training loss: 1.027078628540039
Validation accuracy: 0.8382720947265625
Validation loss: 1.036746859550476
Epoch: 2


901it [01:58,  7.62it/s]
0it [00:00, ?it/s]

Training accuracy: 0.864827573299408
Training loss: 0.9877192378044128
Validation accuracy: 0.8461666703224182
Validation loss: 1.0293521881103516
Epoch: 3


901it [01:57,  7.70it/s]
2it [00:00, 10.69it/s]

Training accuracy: 0.8746940493583679
Training loss: 0.9719800353050232
Validation accuracy: 0.8686273694038391
Validation loss: 0.9884239435195923
Epoch: 4


901it [01:56,  7.73it/s]
2it [00:00, 10.63it/s]

Training accuracy: 0.8888097405433655
Training loss: 0.9555898904800415
Validation accuracy: 0.8739645481109619
Validation loss: 0.9778128862380981
Epoch: 5


901it [02:02,  7.33it/s]
0it [00:00, ?it/s]

Training accuracy: 0.891468346118927
Training loss: 0.9507279992103577
Validation accuracy: 0.88213711977005
Validation loss: 0.9642571806907654
Epoch: 6


901it [02:02,  7.39it/s]
0it [00:00, ?it/s]

Training accuracy: 0.8952947854995728
Training loss: 0.948898434638977
Validation accuracy: 0.884249746799469
Validation loss: 0.9675635695457458
Epoch: 7


901it [01:57,  7.66it/s]
2it [00:00, 11.22it/s]

Training accuracy: 0.8912346959114075
Training loss: 0.9487499594688416
Validation accuracy: 0.8835269808769226
Validation loss: 0.9609959125518799
Epoch: 8


901it [01:56,  7.71it/s]
0it [00:00, ?it/s]

Training accuracy: 0.8933815360069275
Training loss: 0.9458463788032532
Validation accuracy: 0.8795797228813171
Validation loss: 0.968978762626648
Epoch: 9


901it [01:57,  7.65it/s]


Training accuracy: 0.8939710259437561
Training loss: 0.9431775808334351
Validation accuracy: 0.8815255165100098
Validation loss: 0.9644759893417358


In [11]:
print("CV Accuracy: {}".format(np.mean(cv_loss_val)))
print("CV Loss: {}".format(np.mean(cv_accuracy_val)))

CV Accuracy: 0.9641767740249634
CV Loss: 0.8712984919548035


In [16]:
# Saved Model, DO NOT RERUN
# save_dir = "../models/"
# model_name = "10_class_1_layerconv_89%"
# saver = tf.train.Saver()
# saver.save(sess, save_dir+model_name)

In [17]:
# restore model
saver = tf.train.Saver()
saver.restore(sess, "../models/10_class_1_layerconv_89%")

INFO:tensorflow:Restoring parameters from ../models/10_class_1_layerconv_89%


In [18]:
sess.run([acc, loss], feed_dict={train_inputs: X[:1000], train_labels: y[:1000]})

[0.89999998, 0.9561885]

In [19]:
# Extract the embeddings and save them

[char_embeddings] = sess.run([embeddings])

# save these to a file
np.save("../outputs/10_user_Nov11_embeddings.npy", char_embeddings)