In [40]:
import pandas as pd
import os
import numpy as np
from spellchecker import SpellChecker
import re

from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Flatten, Bidirectional, \
                            Dropout, Embedding, Conv2D, MaxPool2D, Reshape, \
                            TimeDistributed, Activation, BatchNormalization, Input
from tensorflow.keras.optimizers import RMSprop, Adam

# TO RUN ON GPU, UNCOMMENT
# import tensorflow as tf
# config = tf.compat.v1.ConfigProto(device_count = {'GPU':2})
# sess = tf.compat.v1.Session(config=config)
# tf.compat.v1.keras.backend.set_session(sess)

from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [101]:
# Get the data into a pandas dataframe
train_file_path = "/Users/yigitatay/Desktop/SentNLP/data/train.csv"
df = pd.read_csv(train_file_path)
test_file_path = "/Users/yigitatay/Desktop/SentNLP/data/test.csv"
df_test = pd.read_csv(test_file_path)

In [102]:
# separate labels and data
texts = df["text"]
selected_texts = df["selected_text"]
sentiments = df["sentiment"]
# lists to hold text, sentiment
train_list = []
train_sentiment_list = []
# a list to hold the labels
label_list = []
for text, data, label in zip(texts, sentiments, selected_texts):
    text = str(text).split()
    train_list.append(text)
    label = str(label).split()
    label_list.append(label)
    train_sentiment_list.append(data)
    
# separate labels and data
texts = df_test["text"]
sentiments = df_test["sentiment"]
# lists to hold text, sentiment
test_list = []
test_list_not_tokenized = []
test_sentiment_list = []
for text, data in zip(texts, sentiments):
    test_list_not_tokenized.append(text)
    text = str(text).split()
    test_list.append(text)
    test_sentiment_list.append(data)

In [43]:
print(len(train_list))

27481


In [44]:
## LOAD THE WORD VECTORS FROM THE PRETRAINED WORD2VEC MODEL
model = Word2Vec.load("word2vec_checkpoints/word2vec_80.model")
word_vectors = model.wv

The maximum number of words in a tweet in this dataset is 33, so 40 would be a good standard for sentence length.

In [45]:
## UNCOMMENT TO RECREATE THE DATASET IF IT'S NOT SAVED

# spell = SpellChecker()
# input_dataset = np.zeros((27481, 40, 80), dtype=float)
# for i, sentence in enumerate(train_list):
#     list_to_add = np.array([])
#     for word in sentence:
#         # if it's a link, add a vector of 0's
#         if word[0:4] == "http":
#             list_to_add = np.append(list_to_add, np.zeros(80))
#             continue
#         if "****" in word:
#             word = word.replace('****', 'censored')
#         str1 = ''
#         str2 = ''
#         switch = False
#         for char in word:
#             if char.isalpha() or char=="`" or char=="-":
#                 if switch:
#                     str2 = str2 + char
#                 else:
#                     str1 = str1 + char
#             else:
#                 switch = True
#         count = 1
#         tempChar = ''
#         newStr = ''
#         for char in str1:
#             if char == tempChar:
#                 count += 1
#             tempChar = char
#             if count < 3:
#                 newStr += char
#             else:
#                 newStr = newStr[:-1]
#                 continue
#         count = 1
#         tempChar = ''
#         newStr2 = ''
#         for char in str2:
#             if char == tempChar:
#                 count += 1
#             tempChar = char
#             if count < 3:
#                 newStr2 += char
#             else:
#                 newStr2 = newStr[:-1]
#                 continue
                
#         if newStr != "" and newStr != "`" and newStr != "-":
#             list_to_add = np.append(list_to_add, np.array(word_vectors[spell.correction(newStr.lower())]))
#         if newStr2 != "" and newStr2 != "`" and newStr2 != "-":
#             list_to_add = np.append(list_to_add, word_vectors[spell.correction(newStr2.lower())])
#     while list_to_add.shape[0] != 3200:
#         list_to_add = np.append(list_to_add, np.zeros(80))
#     list_to_add = np.reshape(list_to_add, [40, 80])
#     input_dataset[i] = list_to_add
#     if i % 100 == 0:
#         print("Addition number: %d" % i)
# np.save("input_without_sentiment", input_dataset)

In [46]:
## TO FIND THE COUNTS OF POSITIVE, NEGATIVE AND NEUTRAL TWEETS
count_pos = 0
count_neg = 0
count_neut = 0
for sent in train_sentiment_list:
    if sent == "positive":
        count_pos += 1
    if sent == "negative":
        count_neg += 1
    else:
        count_neut += 1
print("Positive tweets: " + str(count_pos))
print("Negative tweets: " + str(count_neg))
print("Neutral tweets: " + str(count_neut))
print("Total non-neutral tweets: " + str(count_pos + count_neg))

Positive tweets: 8582
Negative tweets: 7781
Neutral tweets: 19700
Total non-neutral tweets: 16363


In [47]:
# TO LOAD THE DATASET (WITHOUT SENTIMENT)
# train_array = np.load("input_without_sentiment.npy")

In [48]:
## UNCOMMENT TO CREATE THE DATASET WITH THE SENTIMENTS
# Concatanate the sentiment on each word
# # pos_array = np.ones((40, 80), dtype=float)
# # neg_array = -np.ones((40, 80), dtype=float)
# # neutral_array = np.zeros((40, 80), dtype=float)

# train_array_with_sent = np.zeros((train_array.shape[0], 40, 80, 2))
# pos_neg_array = 
# for i in range(train_array.shape[0]):
#     if train_sentiment_list[i] == "positive":
#         result = np.dstack((train_array[i], neutral_array))
#     elif train_sentiment_list[i] == "negative":
#         result = np.dstack((train_array[i], neutral_array))
#     else:
#         result = np.dstack((train_array[i], neutral_array))
#     train_array_with_sent[i] = result   

In [49]:
## CREATE DATASET FOR POS-NEG and NEUTRAL SEPARATELY
pos_neg_array = np.zeros((count_pos+count_neg, 40, 80))
neutral_array = np.zeros((count_neut, 40, 80))
pos_neg_index = 0
neut_index = 0
for i in range(train_array.shape[0]):
    if train_sentiment_list[i] == "positive" or train_sentiment_list[i] == "negative":
        pos_neg_array[pos_neg_index] = train_array[i]
        pos_neg_index += 1
    else:
        neutral_array[neut_index] = train_array[i]
        neut_index += 1
print(neutral_array.shape)
print(pos_neg_array.shape)

(19700, 40, 80)
(16363, 40, 80)


In [50]:
# np.save("input_with_sentiment", train_array_with_sent)

In [51]:
## UNCOMMENT TO CREATE THE LABELS FOR TRAINING
# label_train = np.zeros((len(train_list), 40), dtype=np.float32)
# label_train.fill(.2)
# for (num, item), label in zip(enumerate(train_list), label_list):
#     loc = 0
#     for i, word in enumerate(item):
#         if loc != len(label) and (label[loc] == word[0:len(label[loc])] or label[loc] == word[-len(label[loc]):]):
#             label_train[num][i] = .8
#             loc += 1
#         else:
#             loc = 0
# np.save("labels", label_train)

In [53]:
## CREATING LABELS FOR POS-NEG and NEUTRAL SEPARATELY

# The reason to fill with 0.2s and 0.8s instead of 0s and 1s is to
# make the training better and prevent vanishing gradients
label_train = np.zeros((len(train_list), 40), dtype=np.float32)
label_train.fill(0.2)
for (num, item), label in zip(enumerate(train_list), label_list):
    loc = 0
    for i, word in enumerate(item):
        if loc != len(label) and (label[loc] == word[0:len(label[loc])] or label[loc] == word[-len(label[loc]):]):
            label_train[num][i] = 0.8
            loc += 1
        else:
            loc = 0
label_pos_neg = np.zeros((count_pos+count_neg, 40), dtype=np.float32)
label_neut = np.zeros((count_neut, 40), dtype=np.float32)
label_pos_neg.fill(0.2)
label_neut.fill(0.2)
neut_index = 0
pos_neg_index = 0
for i in range(len(label_list)):
    if train_sentiment_list[i] == "positive" or train_sentiment_list[i] == "negative":
        label_pos_neg[pos_neg_index] = label_train[i]
        pos_neg_index += 1
    else:
        label_neut[neut_index] = label_train[i]
        neut_index += 1
np.save("labels_pos_neg", label_pos_neg)
np.save("labels_neut", label_neut)

In [54]:
# SOME OTHER SAVED ARRAYS WITHOUT POS-NEG / NEUT SEPARATION
# labels = np.load("labels.npy")
# train = np.load("input_wit_sentiment.npy")
# train = np.load("input_without_sentiment.npy")

pos_neg_labels = np.load("labels_pos_neg.npy")
neut_labels = np.load("labels_neut.npy")

pos_neg_train = pos_neg_array
neut_train = neutral_array

In [55]:
model = Sequential()

model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(20, return_sequences=True)))

model.add(Flatten())
model.add(Dense(40, activation='relu'))

# A CONVOLUTIONAL MODEL THAT DOESN'T WORK AS WELL
# model.add(Input(shape=(40, 80, 2), name='model_input'))
# model.add(Conv2D(filters=8, kernel_size=3, strides=2, padding='same', activation='relu'))
# model.add(BatchNormalization())
# model.add(Conv2D(filters=16, kernel_size=3, strides=2, padding='same', activation='relu'))
# model.add(BatchNormalization())
# model.add(Flatten())
# model.add(Dense(40, activation='relu'))

model.build(input_shape=(None, 40, 80))
model.summary()

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['binary_crossentropy', 'accuracy'])

model.fit(neut_train, neut_labels,
          batch_size=2,
          epochs=6, 
          validation_split=0.1)
model.save("neut_model.h5")

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_24 (Bidirectio multiple                  28928     
_________________________________________________________________
dropout_18 (Dropout)         multiple                  0         
_________________________________________________________________
bidirectional_25 (Bidirectio multiple                  24832     
_________________________________________________________________
dropout_19 (Dropout)         multiple                  0         
_________________________________________________________________
bidirectional_26 (Bidirectio multiple                  24832     
_________________________________________________________________
dropout_20 (Dropout)         multiple                  0         
_________________________________________________________________
bidirectional_27 (Bidirectio multiple                 

In [56]:
## UNCOMMENT TO RECREATE THE TEST DATASET IF IT'S NOT SAVED

# spell = SpellChecker()
# test_dataset = np.zeros((len(test_list), 40, 80), dtype=float)
# for i, sentence in enumerate(test_list):
#     list_to_add = np.array([])
#     for word in sentence:
#         # if it's a link, add a vector of 0's
#         if word[0:4] == "http":
#             list_to_add = np.append(list_to_add, np.zeros(80))
#             continue
#         if "****" in word:
#             word = word.replace('****', 'censored')
#         str1 = ''
#         str2 = ''
#         switch = False
#         for char in word:
#             if char.isalpha() or char=="`" or char=="-":
#                 if switch:
#                     str2 = str2 + char
#                 else:
#                     str1 = str1 + char
#             else:
#                 switch = True
#         count = 1
#         tempChar = ''
#         newStr = ''
#         for char in str1:
#             if char == tempChar:
#                 count += 1
#             tempChar = char
#             if count < 3:
#                 newStr += char
#             else:
#                 newStr = newStr[:-1]
#                 continue
#         count = 1
#         tempChar = ''
#         newStr2 = ''
#         for char in str2:
#             if char == tempChar:
#                 count += 1
#             tempChar = char
#             if count < 3:
#                 newStr2 += char
#             else:
#                 newStr2 = newStr[:-1]
#                 continue
                
#         if newStr != "" and newStr != "`" and newStr != "-":
#             list_to_add = np.append(list_to_add, np.array(word_vectors[spell.correction(newStr.lower())]))
#         if newStr2 != "" and newStr2 != "`" and newStr2 != "-":
#             list_to_add = np.append(list_to_add, word_vectors[spell.correction(newStr2.lower())])
#     while list_to_add.shape[0] != 3200:
#         list_to_add = np.append(list_to_add, np.zeros(80))
#     list_to_add = np.reshape(list_to_add, [40, 80])
#     test_dataset[i] = list_to_add
#     if i % 100 == 0:
#         print("Addition number: %d" % i)
# np.save("test_dataset", test_dataset)

In [57]:
## LOAD THE TESTING DATA AND THE MODELS

test_data = np.load("test_dataset.npy")
pos_neg_model = Sequential()
pos_neg_model.add(Bidirectional(LSTM(32, return_sequences=True)))
pos_neg_model.add(Dropout(0.2))
pos_neg_model.add(Bidirectional(LSTM(32, return_sequences=True)))
pos_neg_model.add(Dropout(0.2))
pos_neg_model.add(Bidirectional(LSTM(32, return_sequences=True)))
pos_neg_model.add(Dropout(0.2))
pos_neg_model.add(Bidirectional(LSTM(20, return_sequences=True)))
pos_neg_model.add(Flatten())
pos_neg_model.add(Dense(40, activation='relu'))
pos_neg_model.build(input_shape=(None, 40, 80))

neutral_model = Sequential()
neutral_model.add(Bidirectional(LSTM(32, return_sequences=True)))
neutral_model.add(Dropout(0.2))
neutral_model.add(Bidirectional(LSTM(32, return_sequences=True)))
neutral_model.add(Dropout(0.2))
neutral_model.add(Bidirectional(LSTM(32, return_sequences=True)))
neutral_model.add(Dropout(0.2))
neutral_model.add(Bidirectional(LSTM(20, return_sequences=True)))
neutral_model.add(Flatten())
neutral_model.add(Dense(40, activation='relu'))
neutral_model.build(input_shape=(None, 40, 80))

pos_neg_model.load_weights("pos_neg_model.h5")
neutral_model.load_weights("neut_model.h5")

In [58]:
predictions = []
for i, test in enumerate(test_data):
    if test_sentiment_list[i] == "positive" or test_sentiment_list[i] == "negative":
        predictions.append(pos_neg_model.predict(np.reshape(test, [1, 40, 80])))
    else:
        predictions.append(neutral_model.predict(np.reshape(test, [1, 40, 80])))

In [87]:
test_selected_texts = []
# print(test_list[2007])
# print(test_sentiment_list[2007])
# print(predictions[2007])
for i, sentence in enumerate(test_list):
    string_to_add = ""
    max_elem = np.amax(predictions[i][0]) # if everything is under 0.5, just add to most likely word
    for j, word in enumerate(sentence):
        if max_elem < 0.5:
            if predictions[i][0][j] == max_elem:
                string_to_add = string_to_add + word
        if predictions[i][0][j] >= 0.5:
            if len(string_to_add) == 0:
                string_to_add = string_to_add + word
            else:
                string_to_add = string_to_add + " " + word
    test_selected_texts.append(string_to_add)

In [104]:
# Get the data into a pandas dataframe
submission_file_path = "/Users/yigitatay/Desktop/SentNLP/data/sample_submission.csv"
df = pd.read_csv(submission_file_path)
textIDs = df["textID"]

input_to_df = list(zip(textIDs, test_selected_texts))
final_df = pd.DataFrame(input_to_df, columns=["textID", "selected_text"])
final_df.to_csv("test_results.csv", index=False)

input_with_text = list(zip(textIDs, test_list_not_tokenized, test_selected_texts, test_sentiment_list))
with_text_df = pd.DataFrame(input_with_text, columns=['textID', 'text', 'selected_text', 'sentiment'])
with_text_df.to_csv("test_results_with_text.csv", index=False)