Import libraries

In [1]:
import pandas as pd
from gensim.models.fasttext import FastText
import numpy as np
import tensorflow as tf

  from ._conv import register_converters as _register_converters


Read in data

In [2]:
df = pd.read_csv('finnum/train.csv')
x = df.index
#df2 = pd.read_csv('finnum/dev.csv')
#df = pd.concat([df1,df2], ignore_index = True)
#df.head(10)
dfval = pd.read_csv('finnum/dev.csv')

Add column for coded category

In [3]:
df['cat_num'] = df['category'].astype('category').cat.codes
dfval['cat_num'] = dfval['category'].astype('category').cat.codes

Load FastText model previously generated

In [4]:
model = FastText.load('fastText1')

Lowercase tweets

In [5]:
df['lower'] = [x.lower() for x in df.tweet]
dfval['lower'] = [x.lower() for x in dfval.tweet]

Define function for finding and replacing numeral with flag

In [6]:
import re
def flagNum(x):
    text = x.lower
    outNum = str(x.target_num)
    text_out = re.sub(r'(?<=\D)'+outNum+'(?=\D)', ' <num> ', text)
    #text_out = text.replace('\D('+outNum+')\D', ' <num> ')
    return text_out

Apply the function

In [7]:
df['mod'] = df.apply(lambda x: flagNum(x), axis = 1)
dfval['mod'] = dfval.apply(lambda x: flagNum(x), axis = 1)

Define function for cleaning string data

In [8]:
import string
def textPuncandNum(text):
    table = str.maketrans('', '', string.punctuation)
    text = re.sub(r'[0-9]+', '', text)
    text = text.split()
    text = [word for word in text if len(word.translate(table))>2]
    return ' '.join(text)
stripped = [textPuncandNum(text) for text in df['mod']]

Define functions for computing flattened version or computing average

In [9]:
def computeflat(tweet):
    splits = tweet.split()
    ysplit = model.wv[splits]
    flat = ysplit.flatten()
    tot = len(flat)
    N = 2600-tot

    flat = np.pad(flat, (0, N), 'constant')
    return flat

def computeavg(tweet):
    splits = tweet.split()
    ysplit = model.wv[splits]
    flat = ysplit.mean(axis = 0)
    return flat

In [10]:
computeavg(df['mod'].iloc[0]).shape

(100,)

Add column for filtered text then apply function for computing average

In [11]:
df['filt'] = df['mod'].apply(textPuncandNum)
df['avg'] = df['filt'].apply(computeavg)
dfval['filt'] = dfval['mod'].apply(textPuncandNum)
dfval['avg'] = dfval['filt'].apply(computeavg)

Create dataframes for just the computed average

In [12]:
df3 = pd.DataFrame(df['avg'].values.tolist())
df3val = pd.DataFrame(dfval['avg'].values.tolist())

In [13]:
df3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.42697,-0.572228,0.692746,-1.641321,-0.938701,0.415754,2.714524,0.326606,-0.850521,0.246746,...,-0.291254,0.788507,1.797049,-0.007852,0.146058,-1.785609,0.574501,1.66089,-0.313506,0.427466
1,0.905482,0.333876,-0.775448,-0.885197,-2.007056,-0.500664,2.951405,0.365051,0.122484,-0.637362,...,0.683859,1.370171,0.985893,0.291526,-0.771181,-1.796481,-0.637774,0.062626,0.337553,0.366458
2,0.434677,-0.017279,-0.079473,-1.012586,-1.699872,-0.331523,1.768947,-0.259095,-0.43722,-0.153384,...,-0.018965,1.480115,0.914381,0.517442,-0.369128,-1.541616,-0.680517,0.952408,-0.247289,0.294054
3,0.440035,-0.59382,1.425271,-1.835119,-0.634127,-0.415725,2.999693,0.768278,-0.843966,0.560807,...,0.005934,1.337318,2.114948,0.650907,-0.14215,-1.774773,1.162506,1.391888,0.198146,-0.065348
4,0.738987,-0.369429,-1.39298,-1.412612,-1.317773,-0.896656,1.892733,-0.217716,-0.441089,0.076812,...,0.173795,0.899918,1.195099,1.098592,-0.471012,-0.202368,0.365563,-0.034328,-0.40191,0.579763


Create series for keeping track of (coded) category labels 

In [14]:
yfull = df['category']
yfull = pd.get_dummies(yfull)
yfullval = dfval['category']
yfullval = pd.get_dummies(yfullval)
yfull.head()

Unnamed: 0,Indicator,Monetary,Option,Percentage,Product Number,Quantity,Temporal
0,0,0,0,0,1,0,0
1,0,1,0,0,0,0,0
2,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0
4,0,0,0,1,0,0,0


Convert to correct type (float32)

In [15]:
yfullnp = yfull.values
yfullnp = yfullnp.astype('float32')
yfullvalnp = yfullval.values
yfullvalnp = yfullvalnp.astype('float32')
print(type(yfullnp))

<class 'numpy.ndarray'>


In [16]:
df3np = df3.values
df3np = df3np.astype('float32')
df3valnp = df3val.values
df3valnp = df3valnp.astype('float32')

In [17]:
data = (df3np, yfullnp)
dataval = (df3valnp, yfullvalnp)

Batching and initialization

In [18]:
batch_size = 64
train_data = tf.data.Dataset.from_tensor_slices(data)
train_data = train_data.batch(batch_size)
iterator = tf.data.Iterator.from_structure(train_data.output_types, train_data.output_shapes)
feature, label = iterator.get_next()
train_init = iterator.make_initializer(train_data)	# initializer for train_data

val_data = tf.data.Dataset.from_tensor_slices(dataval)
val_data = val_data.batch(batch_size)
val_init = iterator.make_initializer(val_data)


Define weights and biases variables

In [19]:
#variables for weights and biases
inp_size = int(feature.get_shape()[1])
w_h = tf.get_variable('weight_h', initializer = tf.random_normal([inp_size,600], mean = 0.0, stddev = .01),dtype = tf.float32)
b_h = tf.get_variable('bias_h', initializer = tf.constant(0.0, shape = [1,600]), dtype = tf.float32)

w2 = tf.get_variable('weight2', initializer = tf.random_normal([600,100], mean = 0.0, stddev = .01),dtype = tf.float32)
b2 = tf.get_variable('bias2', initializer = tf.constant(0.0, shape = [1,100]), dtype = tf.float32)

w3 = tf.get_variable('weight3', initializer = tf.random_normal([100,600], mean = 0.0, stddev = .01),dtype = tf.float32)
b3 = tf.get_variable('bias3', initializer = tf.constant(0.0, shape = [1,600]), dtype = tf.float32)

w = tf.get_variable('weight', initializer = tf.random_normal([600,7], mean = 0.0, stddev = .01),dtype = tf.float32)
b = tf.get_variable('bias', initializer = tf.constant(0.0, shape = [1,7]), dtype = tf.float32)

Model definition

In [20]:
#define model
h2 = tf.nn.leaky_relu(tf.matmul(feature, w_h) + b_h)

drop1 = tf.nn.dropout(h2, keep_prob = .6)

h = tf.nn.leaky_relu(tf.matmul(drop1, w2) + b2)

drop2 = tf.nn.dropout(h, keep_prob = .6)

h0 = tf.nn.leaky_relu(tf.matmul(drop2, w3) + b3)

drop0 = tf.nn.dropout(h0, keep_prob = .6)

In [21]:
logits = tf.matmul(drop0,w) + b

In [22]:
entropy = tf.nn.softmax_cross_entropy_with_logits(labels=label, logits=logits)

loss = tf.reduce_mean(entropy)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [23]:
optimizer = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(loss)
preds = tf.nn.softmax(logits)
correct_preds = tf.equal(tf.argmax(preds, 1), tf.argmax(label, 1))
accuracy = tf.reduce_mean(tf.cast(correct_preds, tf.float32))

Run the model

In [24]:
n_epochs = 100

        
loss_summary = tf.summary.scalar('loss', loss)
accuracy_summary = tf.summary.scalar('accuracy', accuracy)

merged_summary_op = tf.summary.merge_all()

writer1 = tf.summary.FileWriter('./Pgraphs/train', graph = tf.get_default_graph())
writer2 = tf.summary.FileWriter('./Pgraphs/val', graph = tf.get_default_graph())


with tf.Session() as sess:
    
    sess.run(tf.global_variables_initializer())

    # train the model n_epochs times
    for i in range(n_epochs): 
        	
        sess.run(train_init)	# drawing samples from train_data
        total_loss = 0
        n_batches = 0
        totalright = 0

        try:
            while True:
                #summary,acc,_, l = sess.run([summary_op,accuracy,optimizer, loss]) #use with scalar summary
                acc,_, l = sess.run([accuracy,optimizer, loss])

                totalright += acc
                
                total_loss += l
                n_batches += 1
        except tf.errors.OutOfRangeError:
            pass

        train_summary = sess.run(merged_summary_op, feed_dict = {feature:data[0], label:data[1]})
        val_summary = sess.run(merged_summary_op, feed_dict= {feature: dataval[0], label: dataval[1]})

        writer1.add_summary(train_summary, i)
        writer1.flush()
        writer2.add_summary(val_summary, i)
        writer2.flush()
        
        
        print('Average loss epoch {0}: {1}'.format(i, total_loss/n_batches))
        print('Average acc epoch {0}: {1}'.format(i, totalright/n_batches))
        
        sess.run(val_init)			# drawing samples from test_data
        total_correct_preds = 0
        totval_loss = 0
        n_batches = 0
        try:
            while True:
                accuracy_batch, valloss = sess.run([accuracy, loss])
                total_correct_preds += accuracy_batch
                
                totval_loss += valloss
                n_batches += 1
                
        except tf.errors.OutOfRangeError:
            pass

        
        print('Average val loss epoch {0}: {1}'.format(i, totval_loss/n_batches))
        print('Average val acc epoch {0}: {1}'.format(i, total_correct_preds/n_batches))

Average loss epoch 0: 1.683920551481701
Average acc epoch 0: 0.3511785714399247
Average val loss epoch 0: 1.5270078480243683
Average val acc epoch 0: 0.3583333343267441
Average loss epoch 1: 1.5062649102438064
Average acc epoch 1: 0.34966666670072644
Average val loss epoch 1: 1.5193987886110942
Average val acc epoch 1: 0.35755208134651184
Average loss epoch 2: 1.4964026394344512
Average acc epoch 2: 0.35672619058972316
Average val loss epoch 2: 1.5083454350630443
Average val acc epoch 2: 0.3507812519868215
Average loss epoch 3: 1.4834633872622536
Average acc epoch 3: 0.3540476191611517
Average val loss epoch 3: 1.4948620895544689
Average val acc epoch 3: 0.37447916467984516
Average loss epoch 4: 1.4723164217812674
Average acc epoch 4: 0.36048809516997565
Average val loss epoch 4: 1.4749723474184673
Average val acc epoch 4: 0.35364583134651184
Average loss epoch 5: 1.456106526511056
Average acc epoch 5: 0.3523035713604518
Average val loss epoch 5: 1.4708104232947032
Average val acc epoc

Average loss epoch 48: 1.1187939763069152
Average acc epoch 48: 0.5539285716556367
Average val loss epoch 48: 1.3055898646513622
Average val acc epoch 48: 0.47864583134651184
Average loss epoch 49: 1.1159899314244588
Average acc epoch 49: 0.5555654764175415
Average val loss epoch 49: 1.3130838970343273
Average val acc epoch 49: 0.4812499980131785
Average loss epoch 50: 1.115200192020053
Average acc epoch 50: 0.5565833335831052
Average val loss epoch 50: 1.3128591279188793
Average val acc epoch 50: 0.4752604166666667
Average loss epoch 51: 1.1069773378826322
Average acc epoch 51: 0.5620714284124828
Average val loss epoch 51: 1.3221977750460308
Average val acc epoch 51: 0.47838541865348816
Average loss epoch 52: 1.108038976646605
Average acc epoch 52: 0.5545892857369923
Average val loss epoch 52: 1.3120707670847576
Average val acc epoch 52: 0.4921875
Average loss epoch 53: 1.0989974566868372
Average acc epoch 53: 0.5608571427209037
Average val loss epoch 53: 1.3313762843608856
Average va

Average loss epoch 96: 0.9047724956557864
Average acc epoch 96: 0.6495654764629546
Average val loss epoch 96: 1.4555570284525554
Average val acc epoch 96: 0.4684895823399226
Average loss epoch 97: 0.8942488738468715
Average acc epoch 97: 0.6553095238549369
Average val loss epoch 97: 1.4680818319320679
Average val acc epoch 97: 0.46536458532015484
Average loss epoch 98: 0.8903773915200006
Average acc epoch 98: 0.653244047505515
Average val loss epoch 98: 1.4655911028385162
Average val acc epoch 98: 0.49010416865348816
Average loss epoch 99: 0.8855218660263788
Average acc epoch 99: 0.6565595240820021
Average val loss epoch 99: 1.5007044573624928
Average val acc epoch 99: 0.46822916467984516
