In [28]:
import tensorflow as tf
import numpy as np
import re
import nltk
import string
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gensim
from nltk.stem.porter import *

INPUT_DIMS = (55, 200, 1)
REGION_SIZES = [2,3,4]
FILTERS_PER_REGION = 2



In [29]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
    return input_txt

def tidy_tweet(dataset, min_len=3):
    dataset['tidy_tweet'] = np.vectorize(remove_pattern)(dataset['Tweet'], "@[\w]*")
    dataset.tidy_tweet = dataset.tidy_tweet.str.replace("[^a-zA-Z#]", " ")
    dataset.tidy_tweet = dataset.tidy_tweet.apply(lambda x: ' '.join([w for w in x.split() if len(w) > min_len]))
    tokenized_tweet = dataset.tidy_tweet.apply(lambda x: x.split())

    stemmer = PorterStemmer()
    tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x])
    tokenized_tweet.head()
    for i in range(len(tokenized_tweet)):
        tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
    dataset['tidy_tweet'] = tokenized_tweet
    return dataset

def tokenized_tweet(dataframe):
    return dataframe['tidy_tweet'].apply(lambda x: x.split())

def train_vectorizer(tokenized_tweet): 
    model_w2v = gensim.models.Word2Vec(
            tokenized_tweet,
            size = 200, #desired number of features, 200 seems to be a common width, no idea why
            window = 5, #context window size
            min_count =2, #ignores all words with total freq lower than 2
            sg = 1, #encoding for skip-gram model
            hs = 0,
            negative = 10, #for negative sampling
            workers = 2, #no. of cores
            seed = 34
    )

    model_w2v.train(tokenized_tweet, total_examples = len(train['tidy_tweet']), epochs=20)
    return model_w2v

def apply_model(model, token, dim=200):
    try:
        return model[token]
    except(KeyError):
        return np.zeros(dim)



In [30]:
def vectorize_inputs(tokenized_tweet, model_w2v):
    inputs = np.zeros((len(tokenized_tweet), 55, 200, 1))
    for tweet_ind in range(len(tokenized_tweet)):
        feature_map = inputs[tweet_ind]
        for word_ind, word in enumerate(tokenized_tweet[tweet_ind]):
            if(word_ind<55):
                feature_map[word_ind] = apply_model(model_w2v, word).reshape(200,1)
    return inputs

In [31]:
names = ['TweetID', 'Sentiment', 'Tweet']

train = pd.read_csv('../Datasets/dataset/train/aggregate.csv', names=names)


train = tidy_tweet(train)
tokenized_tweet = tokenized_tweet(train)
model_w2v = train_vectorizer(tokenized_tweet)

x_train = vectorize_inputs(tokenized_tweet, model_w2v)

y_train = []
for sentiment in train['Sentiment']:
    if sentiment == 'positive':
        y_train.append(1.0)
    elif sentiment == 'negative':
        y_train.append(0.0)
    else:
        y_train.append(0.5)
y_train = np.array(y_train)



https://stackoverflow.com/questions/43151775/how-to-have-parallel-convolutional-layers-in-keras

In [33]:
x_train.shape

(50132, 55, 200, 1)

In [34]:
input_layer = tf.keras.Input(shape=INPUT_DIMS)

parallel_layers = []
for size in REGION_SIZES:
    parallel_layer = tf.keras.layers.Conv2D(FILTERS_PER_REGION, (INPUT_DIMS[0]), activation='relu')(input_layer)
    parallel_layer = tf.keras.layers.MaxPool2D(pool_size=(1, size))(parallel_layer)
    parallel_layers.append(parallel_layer)
merged = tf.keras.layers.concatenate(parallel_layers, axis=2)
flatten = tf.keras.layers.Flatten()(merged)
hidden = tf.keras.layers.Dense(4, activation='relu')(flatten)
output = tf.keras.layers.Dense(3, activation='softmax')(hidden)

model = tf.keras.Model(input_layer, output)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit(x_train,
                   y_train,
                   batch_size=32,
                   epochs=100,
                   validation_split=0.1,
                   )

Epoch 1/100
 233/1410 [===>..........................] - ETA: 6:59 - loss: 0.8210 - accuracy: 0.1589

KeyboardInterrupt: 