In [5]:
import pandas as pd
from gensim.models.fasttext import FastText

Reading in training data


In [6]:
df = pd.read_csv(r'finnum\train.csv')
df.head(10)

Unnamed: 0,id,idx,tweet,category,subcategory,target_num
0,98221616,4976,$ARNA APD334 for Amyotrophic Lateral Sclerosis...,Product Number,Product Number,334.0
1,82321187,9839,"$OCLR Noob investor that i am, put a 7.38 stop...",Monetary,stop loss,7.38
2,103328840,1455,$ES_F $SPY Bias-2 bearish and the DLT-1 DRR ar...,Product Number,Product Number,1.0
3,104840294,1111,$TMUS its acquisition of Layer3 TV The purchas...,Product Number,Product Number,5.0
4,69935467,2373,$TWTR ^Buy $WSTL 68c up 14% 4 time avg vol. ...,Percentage,relative,14.0
5,69935467,2373,$TWTR ^Buy $WSTL 68c up 14% 4 time avg vol. ...,Quantity,Quantity,4.0
6,69935467,2373,$TWTR ^Buy $WSTL 68c up 14% 4 time avg vol. ...,Monetary,forecast,5.0
7,94249158,1372,$SEED L2 Capital deal is real savvy. It takes ...,Temporal,date,33.0
8,100979260,505,$BTE $BTE.CA $MEG.CA $CPG $CPG.CA $CJ.CA - 4th...,Temporal,date,4.0
9,100775772,1210,$WRN My fav $WRN pattern on my watchlist for 1...,Temporal,date,11.0


Make new column for encoding categories as numbers

In [7]:
df['cat_num'] = df['category'].astype('category').cat.codes

Loading in fastText model

In [8]:
model = FastText.load('fastText1')

Preprocessing tweets by lowercasing

In [9]:
df['lower'] = [x.lower() for x in df.tweet]

In [10]:
df['lower'].head()

0    $arna apd334 for amyotrophic lateral sclerosis...
1    $oclr noob investor that i am, put a 7.38 stop...
2    $es_f $spy bias-2 bearish and the dlt-1 drr ar...
3    $tmus its acquisition of layer3 tv the purchas...
4    $twtr ^buy  $wstl 68c up 14%  4 time avg vol. ...
Name: lower, dtype: object

Replacing target with <num\>

In [11]:
import re
def flagNum(x):
    text = x.lower
    outNum = str(x.target_num)
    text_out = re.sub(r'(?<=\D)'+outNum+'(?=\D)', ' <num> ', text)
    #text_out = text.replace('\D('+outNum+')\D', ' <num> ')
    return text_out

Applying to training, making this into a new column

In [12]:
df['mod'] = df.apply(lambda x: flagNum(x), axis = 1)

In [13]:
df['mod'].head()

0    $arna apd <num>  for amyotrophic lateral scler...
1    $oclr noob investor that i am, put a  <num>  s...
2    $es_f $spy bias-2 bearish and the dlt- <num>  ...
3    $tmus its acquisition of layer3 tv the purchas...
4    $twtr ^buy  $wstl 68c up  <num> %  4 time avg ...
Name: mod, dtype: object

In [14]:
df.lower.head()

0    $arna apd334 for amyotrophic lateral sclerosis...
1    $oclr noob investor that i am, put a 7.38 stop...
2    $es_f $spy bias-2 bearish and the dlt-1 drr ar...
3    $tmus its acquisition of layer3 tv the purchas...
4    $twtr ^buy  $wstl 68c up 14%  4 time avg vol. ...
Name: lower, dtype: object

Filtering out numbers and words less than 3 characters long

In [15]:
import string
def textPuncandNum(text):
    table = str.maketrans('', '', string.punctuation)
    text = re.sub(r'[0-9]+', '', text)
    text = text.split()
    text = [word for word in text if len(word.translate(table))>2]
    return ' '.join(text)
stripped = [textPuncandNum(text) for text in df['mod']]

Creating list of unique words from this processed text, excluding <num\>

In [16]:
low = list(stripped)
low = ' '.join(low)
low = list(set(low.split()))
low.remove('<num>')
len(low)

13107

In [17]:
low

['correction:',
 'robotic',
 'lmao!',
 'expansion',
 'uptake',
 'held.',
 'hit.',
 'why.',
 '$bur',
 'https://finance.yahoo.com/news/credit-suisse-ag-announces-reverse-.html',
 'cryptocurrency',
 'c-suite',
 'slv',
 'you?',
 'http://stks.co/tbnu',
 '$nxtd',
 '$snss',
 'erinn...looks',
 'japan.',
 'wasn&#;t',
 'metrics',
 'days...',
 'delux,',
 '$tfm',
 '$gpor',
 '$amrs-',
 'gtav',
 '@tfnn',
 'cpu',
 'lunch.',
 'doubted',
 '$cmm.v',
 '$omgbtc',
 '$sgoc',
 '$exk',
 'mainstream',
 '$viab',
 'capita',
 'instant',
 'cheaper',
 'needham.',
 'traffic',
 'would',
 'goiing',
 'gdpr',
 '$fmsa',
 'establish',
 'agen',
 'agrawals',
 'price.',
 'quickly?',
 '$jwn.',
 'pricey',
 'limits',
 'confident.',
 'launch.',
 'amzn.',
 'double+',
 'https://www.thecontraaccountant.com/single-post////have-you-forgotten-about-the-',
 'fool.',
 'reforms.',
 'sears',
 'cycle',
 'reach',
 'excellent',
 'tomorrow....capisce?',
 '$aprn',
 '&quot;office',
 'achieved',
 'something!',
 'cap.',
 'posts:&#;',
 'laughing',

Using model to get embeddingss for these words

In [18]:
embed = model.wv[low]

In [19]:
embed.shape

(13107, 100)

Indexing these unqique words in a dictionary 

In [20]:
label_dict = {k: v for v, k in enumerate(low)}
label_dict['<num>'] = len(label_dict) 

Adding <num\> back in 

In [21]:
label_dict['<num>']

13107

Adding filler word to keep all tweets the same length. Then replacing all words with their dictionary equivalent. This is for tenssorflows matrix creation

In [22]:
bufferIndex = len(label_dict) 

In [45]:
import numpy as np
modifiedText = [[label_dict[word] for word in text.split()]for text in stripped]
maxLen = max(map(len, modifiedText))
for item in modifiedText:                # for each item in the list
    while len(item) < maxLen:            # while the item length is smaller than maxLen
        item.append(bufferIndex) 
numpyInp = np.asarray(modifiedText)

In [46]:
numpyInp

array([[13095,  8663, 13107, ..., 13108, 13108, 13108],
       [11365,   729, 10143, ..., 13108, 13108, 13108],
       [ 3805,  2562,   191, ..., 13108, 13108, 13108],
       ...,
       [ 3989, 12604,  9343, ..., 13108, 13108, 13108],
       [12534,  5903,  1038, ..., 13108, 13108, 13108],
       [13107, 12534,  5903, ..., 13108, 13108, 13108]])

Adding in unique embeddings for <num\> and filler

In [47]:
embed = np.vstack((embed, np.zeros(100)+20, np.zeros(100)+25))

In [48]:
embed.shape

(13111, 100)

## Implementing Convolutional Layer

In [49]:
import tensorflow as tf
batch_size = 16

In [50]:
tf.reset_default_graph()

Creating the embedding matrix for any input


In [51]:
labels = pd.get_dummies(df['cat_num'])

Batching and creating iterators

In [52]:
train = (numpyInp, labels)

# create training Dataset and batch it
train_data = tf.data.Dataset.from_tensor_slices(train)
train_data = train_data.shuffle(10000) # if you want to shuffle your data
train_data = train_data.batch(batch_size)

# create one iterator and initialize it with different datasets
iterator = tf.data.Iterator.from_structure(train_data.output_types, 
                                           train_data.output_shapes)
txt, label = iterator.get_next()

train_init = iterator.make_initializer(train_data)

In [53]:
embedding = tf.nn.embedding_lookup(embed, txt, partition_strategy='mod', name=None)
embedded_chars_expanded = tf.expand_dims(embedding, -1)

What does enumerate do?


In [54]:
filter_sizes = [0,3,4]
for i in enumerate(filter_sizes):
    print(i)

(0, 0)
(1, 3)
(2, 4)


Failed attempt using predefined filter
Update: Works now, not using tho

In [55]:
'''
WINDOW_SIZE = 100
STRIDE = int(WINDOW_SIZE/2)
#embedding2 = tf.expand_dims(embedding, axis = 1)
conv = tf.layers.conv2d(embedded_chars_expanded, 2, [2,WINDOW_SIZE], 
               strides=1, padding='SAME') 
conv = tf.nn.relu(conv)   
words = flatten(conv)
'''

"\nWINDOW_SIZE = 100\nSTRIDE = int(WINDOW_SIZE/2)\n#embedding2 = tf.expand_dims(embedding, axis = 1)\nconv = tf.layers.conv2d(embedded_chars_expanded, 2, [2,WINDOW_SIZE], \n               strides=1, padding='SAME') \nconv = tf.nn.relu(conv)   \nwords = flatten(conv)\n"

In [56]:
embedding

<tf.Tensor 'embedding_lookup/Identity:0' shape=(?, 26, 100) dtype=float64>

In [57]:
embedded_chars_expanded

<tf.Tensor 'ExpandDims:0' shape=(?, 26, 100, 1) dtype=float64>

Implementing looped convolution

In [58]:
pooled_outputs = []
filter_sizes = [2, 3, 5]
embedding_size = 100
num_filters = 2
max_length = 26
for filter_size in filter_sizes:
    filter_shape = [filter_size, embedding_size, 1, num_filters]
    W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='W')
    b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name = 'b')
    conv = tf.nn.conv2d(
        embedded_chars_expanded,
        tf.cast(W,tf.float64),
        strides=[1, 1, 1, 1],
        padding='VALID',
        name='conv')
    relu = tf.nn.relu(tf.nn.bias_add(conv, tf.cast(b,tf.float64)), name="relu")
    pooled = tf.nn.max_pool(
        relu,
        ksize=[1, max_length - filter_size + 1, 1, 1],
        strides=[1, 1, 1, 1],
        padding='VALID',
        name="pool")
    pooled_outputs.append(pooled)

In [59]:
pooled_outputs

[<tf.Tensor 'pool:0' shape=(?, 1, 1, 2) dtype=float64>,
 <tf.Tensor 'pool_1:0' shape=(?, 1, 1, 2) dtype=float64>,
 <tf.Tensor 'pool_2:0' shape=(?, 1, 1, 2) dtype=float64>]

Combining separate convolutional layers into 1 feed forward input

In [60]:
num_filters_total = num_filters * len(filter_sizes)
combined = tf.concat(pooled_outputs, 3)
combined_flat = tf.reshape(combined, [-1, num_filters_total])

In [61]:
combined_flat

<tf.Tensor 'Reshape:0' shape=(?, 6) dtype=float64>

Adding dense layers

In [62]:
conn = tf.layers.dense(combined_flat, 100, activation = 'relu')
conn2 = tf.layers.dense(conn, len(set(df.cat_num)))

In [63]:
conn

<tf.Tensor 'dense/Relu:0' shape=(?, 100) dtype=float64>

Implementing cross entropy, loss, and optimization

In [64]:
n_epochs = 50
entropy = tf.nn.softmax_cross_entropy_with_logits(labels = label, logits = conn2)
loss = tf.reduce_mean(entropy)
optimizer = tf.train.AdamOptimizer(learning_rate = 0.001).minimize(loss)

Prediction setup

In [65]:
preds = tf.nn.softmax(conn2)
correct_preds = tf.equal(tf.argmax(preds, 1), tf.argmax(label, 1))
accuracy = tf.reduce_mean(tf.cast(correct_preds, tf.float32))

Running Neural Net

In [67]:
with tf.Session() as sess:
    #start_time = time.time()
    sess.run(tf.global_variables_initializer())
    # train the model n_epochs times

    for i in range(n_epochs): 
        
        sess.run(train_init)# drawing samples from train_data
        total_loss = 0
        total_right = 0
        n_batches = 0
        totalright = 0
        try:
            while True:
                #summary,acc,_, l = sess.run([summary_op,accuracy,optimizer, loss]) #use with scalar summary
                acc,_, l = sess.run([accuracy, optimizer, loss])                
                total_loss += l
                total_right += acc
                n_batches += 1
        except tf.errors.OutOfRangeError:
            pass

        print('Average loss epoch {0}: {1}'.format(i, total_loss/n_batches))
        print('Accuracy {0}: {1}'.format(i, total_right/n_batches))    
    prediction = sess.run(preds, feed_dict={txt: numpyInp})
    prediction = np.asarray(prediction)


Average loss epoch 0: 1.5219268212773858
Average accuracy 0: 0.40580357142857143
Average loss epoch 1: 1.4015838796841649
Average accuracy 1: 0.46577380952380953
Average loss epoch 2: 1.3362521960858027
Average accuracy 2: 0.4924107142857143
Average loss epoch 3: 1.2936875659424627
Average accuracy 3: 0.5086309523809524
Average loss epoch 4: 1.2510061093359401
Average accuracy 4: 0.5215773809523809
Average loss epoch 5: 1.2153263375953338
Average accuracy 5: 0.5367559523809524
Average loss epoch 6: 1.1814804627139763
Average accuracy 6: 0.5488095238095239
Average loss epoch 7: 1.1550278235643308
Average accuracy 7: 0.5629464285714286
Average loss epoch 8: 1.1312451021707328
Average accuracy 8: 0.5669642857142857
Average loss epoch 9: 1.1153681253860992
Average accuracy 9: 0.578422619047619
Average loss epoch 10: 1.0893853429656508
Average accuracy 10: 0.58125
Average loss epoch 11: 1.0798366108340913
Average accuracy 11: 0.5889880952380953
Average loss epoch 12: 1.068692325122413
Avera

Calculating accuracy for train

In [68]:
out_preds = np.equal(np.argmax(prediction, 1), labels.idxmax(axis = 1))
acc = np.mean(out_preds)

In [69]:
accuracy

<tf.Tensor 'Mean_1:0' shape=() dtype=float32>

Fooling around with numpy

In [None]:
f = np.array([[[1,2,3], [1,2,3]], [[1,2,3], [1,2,3]], [[1,2,3], [1,2,3]]])
#f = np.array([[1,2,3], [1,2,3]])
#print(f)
g = np.expand_dims(f, axis = 1)
g = np.reshape(f, [-1,2])
print(g)
print(g.shape)
print(np.squeeze(g))