In [2]:
import pandas as pd
from gensim.models.fasttext import FastText

Reading in training data


In [3]:
df = pd.read_csv(r'finnum\train.csv')
df.head(10)

Unnamed: 0,id,idx,tweet,category,subcategory,target_num
0,98221616,4976,$ARNA APD334 for Amyotrophic Lateral Sclerosis...,Product Number,Product Number,334.0
1,82321187,9839,"$OCLR Noob investor that i am, put a 7.38 stop...",Monetary,stop loss,7.38
2,103328840,1455,$ES_F $SPY Bias-2 bearish and the DLT-1 DRR ar...,Product Number,Product Number,1.0
3,104840294,1111,$TMUS its acquisition of Layer3 TV The purchas...,Product Number,Product Number,5.0
4,69935467,2373,$TWTR ^Buy $WSTL 68c up 14% 4 time avg vol. ...,Percentage,relative,14.0
5,69935467,2373,$TWTR ^Buy $WSTL 68c up 14% 4 time avg vol. ...,Quantity,Quantity,4.0
6,69935467,2373,$TWTR ^Buy $WSTL 68c up 14% 4 time avg vol. ...,Monetary,forecast,5.0
7,94249158,1372,$SEED L2 Capital deal is real savvy. It takes ...,Temporal,date,33.0
8,100979260,505,$BTE $BTE.CA $MEG.CA $CPG $CPG.CA $CJ.CA - 4th...,Temporal,date,4.0
9,100775772,1210,$WRN My fav $WRN pattern on my watchlist for 1...,Temporal,date,11.0


Make new column for encoding categories as numbers

In [40]:
df['cat_num'] = df['category'].astype('category').cat.codes

Loading in fastText model

In [5]:
model = FastText.load('fastText1')

Preprocessing tweets by lowercasing

In [41]:
df['lower'] = [x.lower() for x in df.tweet]

In [42]:
df['lower'].head()

0    $arna apd334 for amyotrophic lateral sclerosis...
1    $oclr noob investor that i am, put a 7.38 stop...
2    $es_f $spy bias-2 bearish and the dlt-1 drr ar...
3    $tmus its acquisition of layer3 tv the purchas...
4    $twtr ^buy  $wstl 68c up 14%  4 time avg vol. ...
Name: lower, dtype: object

Replacing target with <num\>

In [43]:
import re
def flagNum(x):
    text = x.lower
    outNum = str(x.target_num)
    text_out = re.sub(r'(?<=\D)'+outNum+'(?=\D)', ' <num> ', text)
    #text_out = text.replace('\D('+outNum+')\D', ' <num> ')
    return text_out

Applying to training, making this into a new column

In [44]:
df['mod'] = df.apply(lambda x: flagNum(x), axis = 1)

In [45]:
df['mod'].head()

0    $arna apd <num>  for amyotrophic lateral scler...
1    $oclr noob investor that i am, put a  <num>  s...
2    $es_f $spy bias-2 bearish and the dlt- <num>  ...
3    $tmus its acquisition of layer3 tv the purchas...
4    $twtr ^buy  $wstl 68c up  <num> %  4 time avg ...
Name: mod, dtype: object

In [46]:
df.lower.head()

0    $arna apd334 for amyotrophic lateral sclerosis...
1    $oclr noob investor that i am, put a 7.38 stop...
2    $es_f $spy bias-2 bearish and the dlt-1 drr ar...
3    $tmus its acquisition of layer3 tv the purchas...
4    $twtr ^buy  $wstl 68c up 14%  4 time avg vol. ...
Name: lower, dtype: object

Filtering out numbers and words less than 3 characters long

In [47]:
import string
def textPuncandNum(text):
    table = str.maketrans('', '', string.punctuation)
    text = re.sub(r'[0-9]+', '', text)
    text = text.split()
    text = [word for word in text if len(word.translate(table))>2]
    return ' '.join(text)
stripped = [textPuncandNum(text) for text in df['mod']]

Creating list of unique words from this processed text, excluding <num\>

In [48]:
low = list(stripped)
low = ' '.join(low)
low = list(set(low.split()))
low.remove('<num>')
len(low)

13107

In [49]:
low

['popped',
 'light',
 'okay',
 '@thenewsguy',
 'sessions.',
 'micron',
 'exhaust',
 '$intc',
 'controls',
 'falls',
 'ending',
 '$trxc',
 'horizontals',
 '$gluu,',
 'responded',
 'buyer@',
 'increase',
 'res',
 '$mdr',
 'fib',
 '&quot;pullback&quot;',
 '$cbl',
 'hurt-but',
 'anywhere',
 '$mro',
 'vix',
 'loss',
 '(especially',
 'accumulation.',
 'ridge',
 '{weekly}',
 '$ge--&gt;&#;s',
 'jagx',
 '$acst$acst',
 '$czr',
 'science',
 '$gene',
 'shareholder',
 'opinion',
 '$kite',
 'rising',
 '-year-low',
 'millions',
 'maturities',
 'https://www.fda.gov/newsevents/newsroom/pressannouncements/ucm.htm',
 '$hsgx',
 'total?',
 'autonomous',
 'coming?',
 'char',
 'gets.',
 'patience...',
 'hackers',
 'expectation.',
 '$dgly',
 'buying,',
 'achieve',
 'forecasts',
 'shorts,bearish',
 '$surry',
 'back?',
 'bitcoin-',
 'lower..',
 'first-in-human',
 'storage',
 '$ebio.',
 'assistance,',
 '@hardassets',
 'haven&#;t',
 'september.might',
 'cleared',
 '$adx',
 'savvy.',
 'disconnected',
 'hike,',
 '$

Using model to get embeddingss for these words

In [50]:
embed = model.wv[low]

In [51]:
embed.shape

(13107, 100)

Indexing these unqique words in a dictionary 

In [52]:
label_dict = {k: v for v, k in enumerate(low)}
label_dict['<num>'] = len(label_dict) 

Adding <num\> back in 

In [53]:
label_dict['<num>']

13107

Adding filler word to keep all tweets the same length. Then replacing all words with their dictionary equivalent. This is for tenssorflows matrix creation

In [54]:
bufferIndex = len(label_dict) 

In [55]:
import numpy as np
modifiedText = [[label_dict[word] for word in text.split()]for text in stripped]
maxLen = max(map(len, modifiedText))
for item in modifiedText:                # for each item in the list
    while len(item) < maxLen:            # while the item length is smaller than maxLen
        item.append(bufferIndex) 
numpyInp = np.asarray(modifiedText)

In [56]:
numpyInp

array([[10826,  9214, 13107, ..., 13108, 13108, 13108],
       [12526,  5153,  3347, ..., 13108, 13108, 13108],
       [ 1416, 12376,  4108, ..., 13108, 13108, 13108],
       ...,
       [  849, 10790,  4043, ..., 13108, 13108, 13108],
       [ 7649,  9443,  4842, ..., 13108, 13108, 13108],
       [13107,  7649,  9443, ..., 13108, 13108, 13108]])

Adding in unique embeddings for <num\> and filler

In [57]:
embed = np.vstack((embed, np.zeros(100)+20, np.zeros(100)+25))

In [58]:
embed.shape

(13109, 100)

## Implementing Convolutional Layer

In [59]:
import tensorflow as tf

In [60]:
tf.reset_default_graph()

Creating the embedding matrix for any input


In [61]:
batch_size = 16

In [79]:
train = (numpyInp, df['cat_num'])

# create training Dataset and batch it
train_data = tf.data.Dataset.from_tensor_slices(train)
train_data = train_data.shuffle(10000) # if you want to shuffle your data
train_data = train_data.batch(batch_size)

# create one iterator and initialize it with different datasets
iterator = tf.data.Iterator.from_structure(train_data.output_types, 
                                           train_data.output_shapes)
txt, label = iterator.get_next()

train_init = iterator.make_initializer(train_data)

In [77]:
train_inputs = tf.placeholder(tf.int64, shape=[batch_size])
train_labels = tf.placeholder(tf.int64, shape=[batch_size, 1])

In [80]:
embedding = tf.nn.embedding_lookup(embed, txt, partition_strategy='mod', name=None)
embedded_chars_expanded = tf.expand_dims(embedding, -1)

What does enumerate do?


In [70]:
filter_sizes = [0,3,4]
for i in enumerate(filter_sizes):
    print(i)

(0, 0)
(1, 3)
(2, 4)


Failed attempt using predefined filter

In [71]:
'''WINDOW_SIZE = 100
STRIDE = int(WINDOW_SIZE/2)
#embedding = tf.reshape(embedding, [53,1,26,100])
#conv = tf.layers.conv2d(embedding, 2, [2,WINDOW_SIZE], 
#               strides=1, padding='SAME') 
filter1 = np.array([1,2,100]).astype(np.float64)
conv = tf.nn.conv2d(embedded_chars_expanded, [1,2,100,1], strides = [1,2,2,1], padding = 'SAME')
conv = tf.nn.relu(conv)   
words = tf.squeeze(conv, [2]) '''

"WINDOW_SIZE = 100\nSTRIDE = int(WINDOW_SIZE/2)\n#embedding = tf.reshape(embedding, [53,1,26,100])\n#conv = tf.layers.conv2d(embedding, 2, [2,WINDOW_SIZE], \n#               strides=1, padding='SAME') \nfilter1 = np.array([1,2,100]).astype(np.float64)\nconv = tf.nn.conv2d(embedded_chars_expanded, [1,2,100,1], strides = [1,2,2,1], padding = 'SAME')\nconv = tf.nn.relu(conv)   \nwords = tf.squeeze(conv, [2]) "

Implementing convolution

In [88]:
filter_size = 2
embedding_size = 100
num_filters = 2
filter_shape = [filter_size, embedding_size, 1, num_filters]
W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='W')
b = tf.Variable(tf.constant(0.1, shape=[num_filters]))
conv = tf.nn.conv2d(
    embedded_chars_expanded,
    tf.cast(W,tf.float64),
    strides=[1, 1, 1, 1],
    padding='VALID',
    name='conv')
conv = tf.nn.relu(conv)   
words = tf.squeeze(conv, [2])

In [96]:
def flatten(t):
    t = tf.reshape(t, [1, -1])
    t = tf.squeeze(t)
    return t

In [97]:
flattened = flatten(conv)

Add a feedforward layer

In [89]:
words

<tf.Tensor 'Squeeze_3:0' shape=(?, 25, 2) dtype=float64>

In [90]:
conv

<tf.Tensor 'Relu_3:0' shape=(?, 25, 1, 2) dtype=float64>

In [104]:
flattened

<tf.Tensor 'Squeeze_4:0' shape=<unknown> dtype=float64>