In [167]:
import pandas as pd
import os
import numpy as np
from spellchecker import SpellChecker
import re

from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Flatten, Bidirectional, \
                            Dropout, Embedding, Conv2D, MaxPool2D, Reshape, \
                            TimeDistributed, Activation, BatchNormalization, Input
from tensorflow.keras.optimizers import RMSprop, Adam

# TO RUN ON GPU, UNCOMMENT
# import tensorflow as tf
# config = tf.compat.v1.ConfigProto(device_count = {'GPU':2})
# sess = tf.compat.v1.Session(config=config)
# tf.compat.v1.keras.backend.set_session(sess)

from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [2]:
# Get the data into a pandas dataframe
train_file_path = "/Users/yigitatay/Desktop/SentNLP/data/train.csv"
df = pd.read_csv(train_file_path)
test_file_path = "/Users/yigitatay/Desktop/SentNLP/data/test.csv"
df_test = pd.read_csv(test_file_path)

In [203]:
# separate labels and data
texts = df["text"]
selected_texts = df["selected_text"]
sentiments = df["sentiment"]
# lists to hold text, sentiment
train_list = []
train_sentiment_list = []
# a list to hold the labels
label_list = []
for text, data, label in zip(texts, sentiments, selected_texts):
    text = str(text).split()
    train_list.append(text)
    label = str(label).split()
    label_list.append(label)
    train_sentiment_list.append(data)
    
# separate labels and data
texts = df_test["text"]
sentiments = df_test["sentiment"]
# lists to hold text, sentiment
test_list = []
test_sentiment_list = []
for text, data in zip(texts, sentiments):
    text = str(text).split()
    test_list.append(text)
    test_sentiment_list.append(data)

In [4]:
print(len(train_list))

27481


In [5]:
## LOAD THE WORD VECTORS FROM THE PRETRAINED WORD2VEC MODEL
model = Word2Vec.load("word2vec_checkpoints/word2vec_80.model")
word_vectors = model.wv

The maximum number of words in a tweet in this dataset is 33, so 40 would be a good standard for sentence length.

In [42]:
## UNCOMMENT TO RECREATE THE DATASET IF IT'S NOT SAVED

# spell = SpellChecker()
# input_dataset = np.zeros((27481, 40, 80), dtype=float)
# for i, sentence in enumerate(train_list):
#     list_to_add = np.array([])
#     for word in sentence:
#         # if it's a link, add a vector of 0's
#         if word[0:4] == "http":
#             list_to_add = np.append(list_to_add, np.zeros(80))
#             continue
#         if "****" in word:
#             word = word.replace('****', 'censored')
#         str1 = ''
#         str2 = ''
#         switch = False
#         for char in word:
#             if char.isalpha() or char=="`" or char=="-":
#                 if switch:
#                     str2 = str2 + char
#                 else:
#                     str1 = str1 + char
#             else:
#                 switch = True
#         count = 1
#         tempChar = ''
#         newStr = ''
#         for char in str1:
#             if char == tempChar:
#                 count += 1
#             tempChar = char
#             if count < 3:
#                 newStr += char
#             else:
#                 newStr = newStr[:-1]
#                 continue
#         count = 1
#         tempChar = ''
#         newStr2 = ''
#         for char in str2:
#             if char == tempChar:
#                 count += 1
#             tempChar = char
#             if count < 3:
#                 newStr2 += char
#             else:
#                 newStr2 = newStr[:-1]
#                 continue
                
#         if newStr != "" and newStr != "`" and newStr != "-":
#             list_to_add = np.append(list_to_add, np.array(word_vectors[spell.correction(newStr.lower())]))
#         if newStr2 != "" and newStr2 != "`" and newStr2 != "-":
#             list_to_add = np.append(list_to_add, word_vectors[spell.correction(newStr2.lower())])
#     while list_to_add.shape[0] != 3200:
#         list_to_add = np.append(list_to_add, np.zeros(80))
#     list_to_add = np.reshape(list_to_add, [40, 80])
#     input_dataset[i] = list_to_add
#     if i % 100 == 0:
#         print("Addition number: %d" % i)

In [43]:
# print(len(input_dataset))
# input_array = []
# input_add = []
# for array in input_dataset:
#     for item in array:
#         input_add.append(item.tolist())
#     input_array.append(input_add)
# input_array = np.asarray(input_array)
# np.save("input_without_sentiment", input_dataset)

In [222]:
count_pos = 0
count_neg = 0
count_neut = 0
for sent in train_sentiment_list:
    if sent == "positive":
        count_pos += 1
    if sent == "negative":
        count_neg += 1
    else:
        count_neut += 1
print("Positive tweets: " + str(count_pos))
print("Negative tweets: " + str(count_neg))
print("Neutral tweets: " + str(count_neut))
print("Total non-neutral tweets: " + str(count_pos + count_neg))

Positive tweets: 8582
Negative tweets: 7781
Neutral tweets: 19700
Total non-neutral tweets: 16363


In [219]:
# TO LOAD THE DATASET (WITHOUT SENTIMENT)
train_array = np.load("input_without_sentiment.npy")

In [102]:
## UNCOMMENT TO CREATE THE DATASET WITH THE SENTIMENTS
# Concatanate the sentiment on each word
# # pos_array = np.ones((40, 80), dtype=float)
# # neg_array = -np.ones((40, 80), dtype=float)
# # neutral_array = np.zeros((40, 80), dtype=float)

# train_array_with_sent = np.zeros((train_array.shape[0], 40, 80, 2))
# pos_neg_array = 
# for i in range(train_array.shape[0]):
#     if train_sentiment_list[i] == "positive":
#         result = np.dstack((train_array[i], neutral_array))
#     elif train_sentiment_list[i] == "negative":
#         result = np.dstack((train_array[i], neutral_array))
#     else:
#         result = np.dstack((train_array[i], neutral_array))
#     train_array_with_sent[i] = result   

In [226]:
pos_neg_array = np.zeros((count_pos+count_neg, 40, 80))
neutral_array = np.zeros((count_neut, 40, 80))
pos_neg_index = 0
neut_index = 0
for i in range(train_array.shape[0]):
    if train_sentiment_list[i] == "positive" or train_sentiment_list[i] == "negative":
        pos_neg_array[pos_neg_index] = train_array[i]
        pos_neg_index += 1
    else:
        neutral_array[neut_index] = train_array[i]
        neut_index += 1
print(neutral_array.shape)
print(pos_neg_array.shape)

(19700, 40, 80)
(16363, 40, 80)


In [106]:
# np.save("input_with_sentiment", train_array_with_sent)

In [209]:
## UNCOMMENT TO CREATE THE LABELS FOR TRAINING
# label_train = np.zeros((len(train_list), 40), dtype=np.float32)
# label_train.fill(.2)
# for (num, item), label in zip(enumerate(train_list), label_list):
#     loc = 0
#     for i, word in enumerate(item):
#         if loc != len(label) and (label[loc] == word[0:len(label[loc])] or label[loc] == word[-len(label[loc]):]):
#             label_train[num][i] = .8
#             loc += 1
#         else:
#             loc = 0
# np.save("labels", label_train)

In [227]:
label_train = np.zeros((len(train_list), 40), dtype=np.float32)
label_train.fill(0.2)
for (num, item), label in zip(enumerate(train_list), label_list):
    loc = 0
    for i, word in enumerate(item):
        if loc != len(label) and (label[loc] == word[0:len(label[loc])] or label[loc] == word[-len(label[loc]):]):
            label_train[num][i] = 0.8
            loc += 1
        else:
            loc = 0
label_pos_neg = np.zeros((count_pos+count_neg, 40), dtype=np.float32)
label_neut = np.zeros((count_neut, 40), dtype=np.float32)
label_pos_neg.fill(0.2)
label_neut.fill(0.2)
neut_index = 0
pos_neg_index = 0
for i in range(len(label_list)):
    if train_sentiment_list[i] == "positive" or train_sentiment_list[i] == "negative":
        label_pos_neg[pos_neg_index] = label_train[i]
        pos_neg_index += 1
    else:
        label_neut[neut_index] = label_train[i]
        neut_index += 1
np.save("labels_pos_neg", label_pos_neg)
np.save("labels_neut", label_neut)

In [231]:
# labels = np.load("labels.npy")
# #train = np.load("input_wit_sentiment.npy")
# train = np.load("input_without_sentiment.npy")
pos_neg_labels = np.load("labels_pos_neg.npy")
neut_labels = np.load("labels_neut.npy")

pos_neg_train = pos_neg_array
neut_train = neutral_array

In [None]:
model = Sequential()

# model.add(Input(shape=(40, 80, 2), name='model_input'))
# model.add(Conv2D(filters=8, kernel_size=3, strides=2, padding='same', activation='relu'))
# model.add(BatchNormalization())
# model.add(Conv2D(filters=16, kernel_size=3, strides=2, padding='same', activation='relu'))
# model.add(BatchNormalization())
# model.add(Flatten())
# model.add(Dense(40, activation='relu'))

model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(20, return_sequences=True)))

model.add(Flatten())
model.add(Dense(40, activation='relu'))

model.build(input_shape=(None, 40, 80))
model.summary()

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['binary_crossentropy', 'accuracy'])

model.fit(pos_neg_train, pos_neg_labels,
          batch_size=2,
          epochs=50, 
          validation_split=0.1)
model.save("pos_neg_model.h5")

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_20 (Bidirectio multiple                  28928     
_________________________________________________________________
dropout_15 (Dropout)         multiple                  0         
_________________________________________________________________
bidirectional_21 (Bidirectio multiple                  24832     
_________________________________________________________________
dropout_16 (Dropout)         multiple                  0         
_________________________________________________________________
bidirectional_22 (Bidirectio multiple                  24832     
_________________________________________________________________
dropout_17 (Dropout)         multiple                  0         
_________________________________________________________________
bidirectional_23 (Bidirectio multiple                 

In [212]:
print(train[0].shape)
result = model.predict(np.reshape(train[1125], [1, 40, 80]))

(40, 80)


In [213]:
print(result)
print(label_list[1125])
print(train_list[1125])
print(train_sentiment_list[1125])

[[0.30501285 0.30679423 0.31108868 0.31498742 0.30972064 0.35368824
  0.42279556 0.54031336 0.69489515 0.54391897 0.50401616 0.24578306
  0.18780696 0.18869683 0.19251756 0.19301684 0.19562441 0.18737917
  0.18658817 0.19227588 0.19187032 0.18781409 0.18255234 0.18702105
  0.18850556 0.19334635 0.19075967 0.19518043 0.19530396 0.19848514
  0.19878377 0.19895178 0.2008979  0.20091653 0.2007593  0.20156424
  0.2008959  0.20103452 0.20172264 0.20042361]]
['the', 'free', 'fillin`', 'app', 'on', 'my', 'ipod', 'is', 'fun,', 'im', 'addicted']
['the', 'free', 'fillin`', 'app', 'on', 'my', 'ipod', 'is', 'fun,', 'im', 'addicted']
positive


In [214]:
## UNCOMMENT TO RECREATE THE TEST DATASET IF IT'S NOT SAVED

spell = SpellChecker()
test_dataset = np.zeros((len(test_list), 40, 80), dtype=float)
for i, sentence in enumerate(test_list):
    list_to_add = np.array([])
    for word in sentence:
        # if it's a link, add a vector of 0's
        if word[0:4] == "http":
            list_to_add = np.append(list_to_add, np.zeros(80))
            continue
        if "****" in word:
            word = word.replace('****', 'censored')
        str1 = ''
        str2 = ''
        switch = False
        for char in word:
            if char.isalpha() or char=="`" or char=="-":
                if switch:
                    str2 = str2 + char
                else:
                    str1 = str1 + char
            else:
                switch = True
        count = 1
        tempChar = ''
        newStr = ''
        for char in str1:
            if char == tempChar:
                count += 1
            tempChar = char
            if count < 3:
                newStr += char
            else:
                newStr = newStr[:-1]
                continue
        count = 1
        tempChar = ''
        newStr2 = ''
        for char in str2:
            if char == tempChar:
                count += 1
            tempChar = char
            if count < 3:
                newStr2 += char
            else:
                newStr2 = newStr[:-1]
                continue
                
        if newStr != "" and newStr != "`" and newStr != "-":
            list_to_add = np.append(list_to_add, np.array(word_vectors[spell.correction(newStr.lower())]))
        if newStr2 != "" and newStr2 != "`" and newStr2 != "-":
            list_to_add = np.append(list_to_add, word_vectors[spell.correction(newStr2.lower())])
    while list_to_add.shape[0] != 3200:
        list_to_add = np.append(list_to_add, np.zeros(80))
    list_to_add = np.reshape(list_to_add, [40, 80])
    test_dataset[i] = list_to_add
    if i % 100 == 0:
        print("Addition number: %d" % i)

Addition number: 0
Addition number: 100
Addition number: 200
Addition number: 300
Addition number: 400
Addition number: 500
Addition number: 600
Addition number: 700
Addition number: 800
Addition number: 900
Addition number: 1000
Addition number: 1100
Addition number: 1200
Addition number: 1300
Addition number: 1400
Addition number: 1500
Addition number: 1600
Addition number: 1700
Addition number: 1800
Addition number: 1900
Addition number: 2000
Addition number: 2100
Addition number: 2200
Addition number: 2300
Addition number: 2400
Addition number: 2500
Addition number: 2600
Addition number: 2700
Addition number: 2800
Addition number: 2900
Addition number: 3000
Addition number: 3100
Addition number: 3200
Addition number: 3300
Addition number: 3400
Addition number: 3500


In [217]:
print(test_dataset[0].shape)
result = model.predict(np.reshape(test_dataset[125], [1, 40, 80]))

(40, 80)


In [218]:
print(result)
print(test_list[125])
print(test_sentiment_list[125])

[[0.45378008 0.52638525 0.5755636  0.7686279  0.41885033 0.36972898
  0.35467526 0.36100286 0.36500722 0.34273344 0.35559586 0.3651407
  0.37919408 0.39935568 0.38550937 0.24221063 0.20775586 0.19622347
  0.19004185 0.18969296 0.19104597 0.19016522 0.1878237  0.19124797
  0.19144896 0.19519086 0.18997876 0.19505893 0.19710167 0.19690603
  0.19945234 0.1990623  0.2011739  0.20041107 0.20048621 0.20102806
  0.20071213 0.20041916 0.20153853 0.20101057]]
['man', 'im', 'so', 'sad', 'school', 'is', 'ending', 'but', 'then', 'again', 'high', 'school', 'might', 'be', 'better', ':O']
neutral
