In [59]:
import pandas as pd
import os
import numpy as np
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Flatten, Bidirectional, Dropout, Embedding, Conv2D, MaxPool2D, Reshape, TimeDistributed
from tensorflow.keras.optimizers import RMSprop, Adam

In [17]:
# Get the data into a pandas dataframe
train_file_path = "/Users/yigitatay/Desktop/SentNLP/data/train.csv"
df = pd.read_csv(train_file_path)

In [23]:
# separate labels and data
texts = df["text"]
selected_texts = df["selected_text"]
sentiments = df["sentiment"]

# a list to hold text, sentiment dictionaries
train_list = []
# a list to hold the labels
label_list = []

for text, data, label in zip(texts, sentiments, selected_texts):
    dict_to_add = dict()
    dict_to_add[text] = data
    train_list.append(dict_to_add)
    label_list.append(label)
    
# a list to hold specific text and label lists 
positive_train_list = []
negative_train_list = []
neutral_train_list = []
# a list to hold the labels
positive_label_list = []
negative_label_list = []
neutral_label_list = []

i = 0
for text, data, label in zip(texts, sentiments, selected_texts):
    if data == "positive":
        positive_train_list.append(text)
        positive_label_list.append(label)
    elif data == "negative":
        negative_train_list.append(text)
        negative_label_list.append(label)
    else:
        neutral_train_list.append(text)
        neutral_label_list.append(label)

In [6]:
# get strings with all text, positive, neutral and negative test
# for non-deep-learning analysis
all_text = ""
all_selected_text = ""
positive_text = ""
positive_selected_text = ""
negative_text = ""
negative_selected_text = ""
neutral_text = ""
neutral_selected_text = ""
for item, label in zip(train_list, label_list):
    all_selected_text += (" " + str(label))
    for key in item:
        all_text += (" " + str(key))
        if(item[key] == "positive"):
            positive_text += (" " + str(key))
            positive_selected_text += (" " + str(label))
        elif(item[key] == "negative"):
            negative_text += (" " + str(key))
            negative_selected_text += (" " + str(label))
        else:
            neutral_text += (" " + str(key))
            neutral_selected_text += (" " + str(label))


chars = sorted(list(set(all_text)))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [24]:
# set maximum tweet size to seqlen characters (dataset seems to have max about 140 chars)
# so we pad with spaces from the beginning for each tweet
seqlen = 160
modified_positive_train_list = []
i = 0
for tweet in positive_train_list:
    num_spaces = seqlen - len(str(tweet))
    addition = ""
    for i in range(num_spaces):
        addition += " "
    new_tweet = addition + str(tweet)
    modified_positive_train_list.append(new_tweet)
modified_positive_label_list = []
i = 0
for tweet in positive_label_list:
    num_spaces = seqlen - len(str(tweet))
    addition = ""
    for i in range(num_spaces):
        addition += " "
    new_tweet = addition + str(tweet)
    modified_positive_label_list.append(new_tweet)
    
modified_negative_train_list = []
i = 0
for tweet in negative_train_list:
    num_spaces = seqlen - len(str(tweet))
    addition = ""
    for i in range(num_spaces):
        addition += " "
    new_tweet = addition + str(tweet)
    modified_negative_train_list.append(new_tweet)
modified_negative_label_list = []
i = 0
for tweet in negative_label_list:
    num_spaces = seqlen - len(str(tweet))
    addition = ""
    for i in range(num_spaces):
        addition += " "
    new_tweet = addition + str(tweet)
    modified_negative_label_list.append(new_tweet)
    
modified_neutral_train_list = []
i = 0
for tweet in neutral_train_list:
    num_spaces = seqlen - len(str(tweet))
    addition = ""
    for i in range(num_spaces):
        addition += " "
    new_tweet = addition + str(tweet)
    modified_neutral_train_list.append(new_tweet)
modified_neutral_label_list = []
i = 0
for tweet in neutral_label_list:
    num_spaces = seqlen - len(str(tweet))
    addition = ""
    for i in range(num_spaces):
        addition += " "
    new_tweet = addition + str(tweet)
    modified_neutral_label_list.append(new_tweet)

In [31]:
# we add one extra dimension to the end
x_positive = np.zeros((len(modified_positive_train_list), seqlen, len(chars)+1), dtype=np.float32)
y_positive = np.zeros((len(modified_positive_label_list), seqlen, len(chars)+1), dtype=np.float32)
for i, tweet in enumerate(modified_positive_train_list):
    for t, char in enumerate(tweet):
        x_positive[i, t, char_indices[char]] = 1
for i, tweet in enumerate(modified_positive_label_list):
    for t, char in enumerate(tweet):
        y_positive[i, t, char_indices[char]] = 1      

In [60]:
model = Sequential()

model.add(Bidirectional(LSTM(128, input_shape=(160, 102), return_sequences=True)))
model.add(Dropout(0.2))
# model.add(Bidirectional(LSTM(32, return_sequences=True)))
# model.add(Dropout(0.2))
# model.add(Bidirectional(LSTM(32, return_sequences=True)))
# model.add(Dropout(0.2))
        
model.add(Bidirectional(LSTM(51, return_sequences=True)))


model.build(input_shape=(None, 160, 102))
model.summary()

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.0002),
    metrics=['binary_crossentropy', 'accuracy'])

model.fit(x_positive, y_positive,
          batch_size=8,
          epochs=50, 
          validation_split=0.1)
model.save("pos_model.h5")

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_22 (Bidirectio multiple                  236544    
_________________________________________________________________
dropout_16 (Dropout)         multiple                  0         
_________________________________________________________________
bidirectional_23 (Bidirectio multiple                  125664    
Total params: 362,208
Trainable params: 362,208
Non-trainable params: 0
_________________________________________________________________
Train on 7723 samples, validate on 859 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
 888/7723 [==>...........................] - ETA: 3:55 - loss: 0.0074 - binary_crossentropy: 0.0074 - accuracy: 0.9984

KeyboardInterrupt: 

In [33]:
# Get the data into a pandas dataframe
test_file_path = "/Users/yigitatay/Desktop/SentNLP/data/test.csv"
df = pd.read_csv(test_file_path)
# separate labels and data
texts = df["text"]
sentiments = df["sentiment"]

# a list to hold specific text and label lists 
positive_test_list = []
negative_test_list = []
neutral_test_list = []

for text, data in zip(texts, sentiments):
    if data == "positive":
        positive_test_list.append(text)
    elif data == "negative":
        negative_test_list.append(text)
    else:
        neutral_test_list.append(text)
            
modified_positive_test_list = []
i = 0
for tweet in positive_test_list:
    num_spaces = seqlen - len(str(tweet))
    addition = ""
    for i in range(num_spaces):
        addition += " "
    new_key = addition + str(tweet)
    modified_positive_test_list.append(new_key)
    
modified_negative_test_list = []
i = 0
for tweet in negative_test_list:
    num_spaces = seqlen - len(str(tweet))
    addition = ""
    for i in range(num_spaces):
        addition += " "
    new_key = addition + str(tweet)
    modified_negative_test_list.append(new_key)
    
modified_neutral_test_list = []
i = 0
for tweet in neutral_test_list:
    num_spaces = seqlen - len(str(tweet))
    addition = ""
    for i in range(num_spaces):
        addition += " "
    new_key = addition + str(tweet)
    modified_neutral_test_list.append(new_key)
    
# we add one extra dimension to the end
x_positive_test = np.zeros((len(modified_positive_test_list), seqlen, len(chars)+1), dtype=np.float32)
for j, tweet in enumerate(modified_positive_test_list):
    for t, char in enumerate(tweet):
        try:
            x_test[j, t, char_indices[char]] = 1
        except:
            continue
            
x_negative_test = np.zeros((len(modified_negative_test_list), seqlen, len(chars)+1), dtype=np.float32)
for j, tweet in enumerate(modified_negative_test_list):
    for t, char in enumerate(tweet):
        try:
            x_test[j, t, char_indices[char]] = 1
        except:
            continue
            
x_neutral_test = np.zeros((len(modified_neutral_test_list), seqlen, len(chars)+1), dtype=np.float32)
for j, tweet in enumerate(modified_neutral_test_list):
    for t, char in enumerate(tweet):
        try:
            x_test[j, t, char_indices[char]] = 1
        except:
            continue

In [57]:
#result = model.predict(x_positive_test[29])
result = model.predict(np.reshape(x_positive_test[1000], [1, 160, 102]))
print(result.shape)
#print(result)
result = np.reshape(result, [160, 102])
print(result)
result_str = ""
for line in result:
    #print(line)
    max_val = max(line)
    char_to_add = ''
    for i, val in enumerate(line):
        if val == max_val:
            try:
                char_to_add = indices_char[i]
            except:
                continue
    result_str = result_str + char_to_add
print(result_str)
print(modified_positive_test_list[1000])
print(modified_positive_label_list[1499])

(1, 160, 102)
[[ 2.99328216e-03  4.87615198e-01 -1.09308705e-01 ... -5.31909987e-03
  -3.69941853e-02 -1.07982345e-02]
 [-1.74443808e-03  7.89926767e-01 -1.26157522e-01 ... -1.56946294e-03
  -2.27546021e-02 -3.27158556e-03]
 [-1.34106469e-03  8.82185161e-01 -1.23498417e-01 ... -5.99721796e-04
  -1.56494994e-02 -1.22815336e-03]
 ...
 [-2.80891676e-02 -1.96931195e-02 -1.18359588e-02 ...  3.03440116e-04
  -8.36973730e-03  2.26170760e-05]
 [-3.42174396e-02 -9.45454240e-02  3.73802148e-02 ...  4.47577331e-04
  -7.42122391e-03  3.58385769e-05]
 [-3.51306498e-02 -1.26791731e-01  1.20272800e-01 ...  2.97072897e-04
  -5.86702954e-03 -6.97372598e-05]]
                                                                                                                                                            oooe
                                                                                                                           i`m done.haha. HOUSE MD marathon ulet
                            