In [1]:
import pandas as pd
from gensim.models.fasttext import FastText

Reading in training data


In [35]:
df = pd.read_csv(r'finnum\train.csv')
df.head(10)

Unnamed: 0,id,idx,tweet,category,subcategory,target_num
0,98221616,4976,$ARNA APD334 for Amyotrophic Lateral Sclerosis...,Product Number,Product Number,334.0
1,82321187,9839,"$OCLR Noob investor that i am, put a 7.38 stop...",Monetary,stop loss,7.38
2,103328840,1455,$ES_F $SPY Bias-2 bearish and the DLT-1 DRR ar...,Product Number,Product Number,1.0
3,104840294,1111,$TMUS its acquisition of Layer3 TV The purchas...,Product Number,Product Number,5.0
4,69935467,2373,$TWTR ^Buy $WSTL 68c up 14% 4 time avg vol. ...,Percentage,relative,14.0
5,69935467,2373,$TWTR ^Buy $WSTL 68c up 14% 4 time avg vol. ...,Quantity,Quantity,4.0
6,69935467,2373,$TWTR ^Buy $WSTL 68c up 14% 4 time avg vol. ...,Monetary,forecast,5.0
7,94249158,1372,$SEED L2 Capital deal is real savvy. It takes ...,Temporal,date,33.0
8,100979260,505,$BTE $BTE.CA $MEG.CA $CPG $CPG.CA $CJ.CA - 4th...,Temporal,date,4.0
9,100775772,1210,$WRN My fav $WRN pattern on my watchlist for 1...,Temporal,date,11.0


Loading in fastText model

In [2]:
model = FastText.load('fastText1')

Preprocessing tweets by lowercasing

In [39]:
df['lower'] = [x.lower() for x in df.tweet]

In [54]:
df['lower'].head()

0    $arna apd334 for amyotrophic lateral sclerosis...
1    $oclr noob investor that i am, put a 7.38 stop...
2    $es_f $spy bias-2 bearish and the dlt-1 drr ar...
3    $tmus its acquisition of layer3 tv the purchas...
4    $twtr ^buy  $wstl 68c up 14%  4 time avg vol. ...
Name: lower, dtype: object

Replacing target with <num\>

In [217]:
import re
def flagNum(x):
    text = x.lower
    outNum = str(x.target_num)
    text_out = re.sub(r'(?<=\D)'+outNum+'(?=\D)', ' <num> ', text)
    #text_out = text.replace('\D('+outNum+')\D', ' <num> ')
    return text_out

Applying to training, making this into a new column

In [218]:
df['mod'] = df.apply(lambda x: flagNum(x), axis = 1)

In [380]:
df['mod'].head()

0    $arna apd <num>  for amyotrophic lateral scler...
1    $oclr noob investor that i am, put a  <num>  s...
2    $es_f $spy bias-2 bearish and the dlt- <num>  ...
3    $tmus its acquisition of layer3 tv the purchas...
4    $twtr ^buy  $wstl 68c up  <num> %  4 time avg ...
Name: mod, dtype: object

In [50]:
df.lower.head()

0    $arna apd334 for amyotrophic lateral sclerosis...
1    $oclr noob investor that i am, put a 7.38 stop...
2    $es_f $spy bias-2 bearish and the dlt-1 drr ar...
3    $tmus its acquisition of layer3 tv the purchas...
4    $twtr ^buy  $wstl 68c up 14%  4 time avg vol. ...
Name: lower, dtype: object

Filtering out numbers and words less than 3 characters long

In [113]:
import string
def textPuncandNum(text):
    table = str.maketrans('', '', string.punctuation)
    text = re.sub(r'[0-9]+', '', text)
    text = text.split()
    text = [word for word in text if len(word.translate(table))>2]
    return ' '.join(text)
stripped = [textPuncandNum(text) for text in df['mod']]

Creating list of unique words from this processed text, excluding <num\>

In [223]:
low = list(stripped)
low = ' '.join(low)
low = list(set(low.split()))
low.remove('<num>')
len(low)

13126

In [224]:
low

['gentlemen,',
 'closures',
 'backing',
 '(low',
 'pipe',
 'behavior',
 'ago..',
 'questions.',
 'pr/er',
 'lot',
 'gave',
 'char',
 'hurts,',
 'rush',
 '$mkc.',
 '(but',
 '%.....almost',
 'pre-earnings',
 'stop.',
 'wage',
 'gimme',
 'balance',
 'standby.over',
 'nhl,',
 'bottom.',
 '$once',
 'driving',
 'comml',
 '&quot;no',
 'day..hmm.',
 'employment,',
 'parade',
 'action',
 'nest,',
 'updated',
 'order?',
 '$rox',
 'sma.',
 'million-dollars.html',
 'upside',
 'brands/costs',
 'collectively',
 '@nautica',
 'steal!',
 '$covalbtc',
 'crossover',
 'snatched',
 'adding',
 'core',
 'fidelity',
 'pharmas',
 'down!!!',
 'proof',
 'neo',
 'yet-',
 'position?',
 'viral',
 'longer.',
 '$drio',
 'purchase!!',
 '$tops',
 'wells',
 'https://t.co/wfeoisai',
 'rating.',
 'direction',
 'value=$mm,',
 'guidance',
 'day,',
 'couple',
 'possible',
 '$cafd',
 'http://ibankcoin.com/raul////breaking-blog-silence-to-clarify-my-bullish-beliefs/',
 '$abio',
 'arriving',
 'thanksgiving.',
 '@april',
 '$idra

Using model to get embeddingss for these words

In [272]:
embed = model.wv[low]

In [273]:
embed.shape

(13126, 100)

Indexing these unqique words in a dictionary 

In [274]:
label_dict = {k: v for v, k in enumerate(low)}
label_dict['<num>'] = len(label_dict) 

Adding <num\> back in 

In [275]:
label_dict['<num>']

13126

Adding filler word to keep all tweets the same length. Then replacing all words with their dictionary equivalent. This is for tenssorflows matrix creation

In [276]:
bufferIndex = len(label_dict) 

In [277]:
import numpy as np
modifiedText = [[label_dict[word] for word in text.split()]for text in stripped]
maxLen = max(map(len, modifiedText))
for item in modifiedText:                # for each item in the list
    while len(item) < maxLen:            # while the item length is smaller than maxLen
        item.append(bufferIndex) 
numpyInp = np.asarray(modifiedText)

In [278]:
numpyInp

array([[ 4381, 13126, 12330, ..., 13127, 13127, 13127],
       [ 5569, 12353,  6595, ..., 13127, 13127, 13127],
       [ 5898,  7543,  6293, ..., 13127, 13127, 13127],
       ...,
       [ 3457,  7809, 10988, ..., 13127, 13127, 13127],
       [ 4890,  1276,  9428, ..., 13127, 13127, 13127],
       [13126,  4890,  1276, ..., 13127, 13127, 13127]])

Adding in unique embeddings for <num\> and filler

In [281]:
embed = np.vstack((embed, np.zeros(100)+20, np.zeros(100)+25))

In [282]:
embed.shape

(13128, 100)

## Implementing Convolutional Layer

In [257]:
import tensorflow as tf

In [371]:
tf.reset_default_graph()

Creating the embedding matrix for any input


In [372]:
embedding = tf.nn.embedding_lookup(embed, numpyInp[2:55], partition_strategy='mod', name=None)
embedded_chars_expanded = tf.expand_dims(embedding, -1)

In [366]:
with tf.Session() as sess:
    x = embedding.eval()
    b = embedded_chars_expanded.eval()

In [327]:
x.shape

(53, 26, 100)

What does enumerate do?


In [304]:
filter_sizes = [0,3,4]
for i in enumerate(filter_sizes):
    print(i)

(0, 0)
(1, 3)
(2, 4)


Failed attempt using predefined filter

In [370]:
'''WINDOW_SIZE = 100
STRIDE = int(WINDOW_SIZE/2)
#embedding = tf.reshape(embedding, [53,1,26,100])
#conv = tf.layers.conv2d(embedding, 2, [2,WINDOW_SIZE], 
#               strides=1, padding='SAME') 
filter1 = np.array([1,2,100]).astype(np.float64)
conv = tf.nn.conv2d(embedded_chars_expanded, [1,2,100,1], strides = [1,2,2,1], padding = 'SAME')
conv = tf.nn.relu(conv)   
words = tf.squeeze(conv, [2]) '''

ValueError: Shape must be rank 4 but is rank 1 for 'Conv2D' (op: 'Conv2D') with input shapes: [53,26,100,1], [4].

Implementing convolution

In [379]:
filter_size = 2 
embedding_size = 100
num_filters = 2
filter_shape = [filter_size, embedding_size, 1, num_filters]
W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='W')
b = tf.Variable(tf.constant(0.1, shape=[num_filters]))
conv = tf.nn.conv2d(
    embedded_chars_expanded,
    tf.cast(W,tf.float64),
    strides=[1, 1, 1, 1],
    padding='VALID',
    name='conv')
conv = tf.nn.relu(conv)   
words = tf.squeeze(conv, [2])