In [20]:
import pandas as pd
import os
import numpy as np
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Flatten, Bidirectional, \
                            Dropout, Embedding, Conv2D, MaxPool2D, Reshape, \
                            TimeDistributed, Activation, BatchNormalization, Input
from tensorflow.keras.optimizers import RMSprop, Adam

# TO RUN ON GPU, UNCOMMENT
# import tensorflow as tf
# config = tf.compat.v1.ConfigProto(device_count = {'GPU':2})
# sess = tf.compat.v1.Session(config=config)
# tf.compat.v1.keras.backend.set_session(sess)

In [2]:
# Read the training CSV into a DataFrame. The cells below read its
# "text", "selected_text" and "sentiment" columns.
train_file_path = "/Users/yigitatay/Desktop/SentNLP/data/train.csv"
df = pd.read_csv(filepath_or_buffer=train_file_path)

In [3]:
# Pull the three columns used by the rest of the notebook.
texts = df["text"]
selected_texts = df["selected_text"]
sentiments = df["sentiment"]

# One single-entry dict per tweet, mapping the tweet text to its sentiment.
train_list = [{tweet_text: sentiment} for tweet_text, sentiment in zip(texts, sentiments)]

# The label of each tweet is its "selected_text" value, kept in row order.
label_list = list(selected_texts)

In [4]:
# Build space-separated strings holding all tweet text (and all selected
# text), overall and per sentiment (positive / negative / neutral),
# for non-deep-learning analysis.
text_parts = {"all": [], "positive": [], "negative": [], "neutral": []}
selected_parts = {"all": [], "positive": [], "negative": [], "neutral": []}

for tweet, selected in zip(train_list, label_list):
    selected_piece = " " + str(selected)
    selected_parts["all"].append(selected_piece)
    for tweet_text, sentiment in tweet.items():
        text_piece = " " + str(tweet_text)
        # anything that is neither "positive" nor "negative" counts as neutral
        bucket = sentiment if sentiment in ("positive", "negative") else "neutral"
        text_parts["all"].append(text_piece)
        text_parts[bucket].append(text_piece)
        selected_parts[bucket].append(selected_piece)

all_text = "".join(text_parts["all"])
all_selected_text = "".join(selected_parts["all"])
positive_text = "".join(text_parts["positive"])
positive_selected_text = "".join(selected_parts["positive"])
negative_text = "".join(text_parts["negative"])
negative_selected_text = "".join(selected_parts["negative"])
neutral_text = "".join(text_parts["neutral"])
neutral_selected_text = "".join(selected_parts["neutral"])



In [5]:
# Character vocabulary: every distinct character seen in the training text,
# in sorted order, plus lookup tables in both directions.
chars = sorted(set(all_text))
char_indices = {char: index for index, char in enumerate(chars)}
indices_char = dict(enumerate(chars))

In [6]:
# Fix the tweet width to `seqlen` characters (the dataset seems to have a
# maximum of about 140 characters per tweet) by left-padding with spaces.
# `str.rjust` leaves a tweet that is already `seqlen` or longer unchanged,
# which matches padding with a non-positive number of spaces.
seqlen = 160

# Same structure as `train_list`: one {padded_text: sentiment} dict per tweet.
modified_train_list = [
    {str(tweet_text).rjust(seqlen): sentiment}
    for tweet in train_list
    for tweet_text, sentiment in tweet.items()
]

In [7]:
# Turn every selected-text label into a 0/1 mask of length `seqlen` over the
# padded tweet: 1 at the character positions covered by the first occurrence
# of the selected text, 0 everywhere else.
modified_label_list = []
for row, (selected, original) in enumerate(zip(label_list, modified_train_list)):
    selected = str(selected)
    # Each dict holds a single key: the padded tweet text.
    original_str = next(iter(original), "")

    # First occurrence of the selected text inside the padded tweet.
    start = original_str.find(selected)
    if start < 0:
        # A character-by-character scan would never terminate here, so fail
        # loudly and name the offending row instead.
        raise ValueError(
            f"selected text {selected!r} of row {row} does not occur in its tweet"
        )
    end = start + len(selected)

    # Zeros before the span, ones on the span, zeros after it. The final
    # slice keeps the mask at exactly `seqlen` entries even if the span
    # runs past the fixed width.
    modified_label = ([0] * start + [1] * len(selected) + [0] * (seqlen - end))[:seqlen]
    modified_label_list.append(modified_label)

In [8]:
# Stack the per-tweet 0/1 masks into one 2-D array (one row per tweet).
modified_label_list = np.array(modified_label_list)

In [15]:
# One-hot encode every padded tweet character by character. The last axis has
# one slot per vocabulary character plus one extra slot (index len(chars))
# that carries the tweet's sentiment code at every position:
# 2 = positive, 1 = negative, 0 = anything else.
sentiment_slot = len(chars)
sentiment_codes = {"positive": 2, "negative": 1}

x = np.zeros((len(modified_train_list), seqlen, sentiment_slot + 1), dtype=np.float32)
for row, tweet in enumerate(modified_train_list):
    for padded_text, sentiment in tweet.items():
        code = sentiment_codes.get(sentiment, 0)
        for pos, char in enumerate(padded_text):
            x[row, pos, char_indices[char]] = 1
            x[row, pos, sentiment_slot] = code

# The targets are the 0/1 selection masks built earlier.
y = modified_label_list

In [17]:
print(y.shape)
# Append a trailing channel axis of size 1 so the Conv2D layers accept `x`.
# The shape is derived from `x` itself instead of hard-coded numbers, so the
# cell keeps working when the dataset size or the vocabulary changes, and
# re-running it on an already reshaped `x` is a no-op.
x = np.reshape(x, (x.shape[0], x.shape[1], x.shape[2], 1))
print(x.shape)

(27481, 160)
(27481, 160, 102, 1)


In [21]:
# Small CNN that maps an encoded tweet of shape (seqlen, len(chars) + 1, 1)
# to `seqlen` sigmoid outputs, one per character position, trained against
# the 0/1 selection masks in `y`.
#
# The Input layer already fixes the input shape, so no explicit
# `model.build(...)` call is needed (the old call also passed a 3-D shape
# that contradicted the 4-D input).

# Training hyper-parameters.
# batch_size=1 meant one optimizer step per tweet (tens of thousands of steps
# per epoch); a regular mini-batch keeps training time practical.
BATCH_SIZE = 32
EPOCHS = 50

model = Sequential()

model.add(Input(shape=(seqlen, len(chars) + 1, 1), name='model_input'))
model.add(Conv2D(filters=8, kernel_size=3, strides=2, padding='same', activation='relu'))
model.add(BatchNormalization())
model.add(Conv2D(filters=16, kernel_size=3, strides=2, padding='same', activation='relu'))
model.add(BatchNormalization())
model.add(Flatten())
# One independent probability per character position.
model.add(Dense(seqlen, activation='sigmoid'))

model.summary()

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['binary_crossentropy', 'accuracy'])

print(y.shape)
model.fit(x, y,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.1)
model.save("sentiment_model_big_y_with_sigmoid_50_epochs.h5")

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 80, 51, 8)         80        
_________________________________________________________________
batch_normalization (BatchNo (None, 80, 51, 8)         32        
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 40, 26, 16)        1168      
_________________________________________________________________
batch_normalization_1 (Batch (None, 40, 26, 16)        64        
_________________________________________________________________
flatten (Flatten)            (None, 16640)             0         
_________________________________________________________________
dense (Dense)                (None, 160)               2662560   
Total params: 2,663,904
Trainable params: 2,663,856
Non-trainable params: 48
___________________________________________

KeyboardInterrupt: 

In [44]:
# Write the current state of `model` to an HDF5 file in the working directory.
checkpoint_name = "sentiment_model_small_y_50_epochs.h5"
model.save(checkpoint_name)

In [166]:
# Load the test split and encode it the same way as the training data.
# NOTE: this cell rebinds `df`, `texts`, `sentiments`, `all_text`,
# `positive_text`, `negative_text` and `neutral_text`, which until now held
# the training data.
test_file_path = "/Users/yigitatay/Desktop/SentNLP/data/test.csv"
df = pd.read_csv(test_file_path)
# separate sentiments and text
texts = df["text"]
sentiments = df["sentiment"]
# one {text: sentiment} dict per test tweet
test_list = [{tweet_text: sentiment} for tweet_text, sentiment in zip(texts, sentiments)]

# (text, sentiment) pairs in row order, shared by the steps below
test_pairs = [pair for item in test_list for pair in item.items()]

# get strings with all text, positive, neutral and negative text
# for non-deep-learning analysis
all_text = "".join(" " + str(tweet_text) for tweet_text, _ in test_pairs)
positive_text = "".join(
    " " + str(tweet_text) for tweet_text, sentiment in test_pairs if sentiment == "positive")
negative_text = "".join(
    " " + str(tweet_text) for tweet_text, sentiment in test_pairs if sentiment == "negative")
neutral_text = "".join(
    " " + str(tweet_text) for tweet_text, sentiment in test_pairs
    if sentiment not in ("positive", "negative"))

# left-pad every tweet with spaces to `seqlen` characters
modified_test_list = [
    {str(tweet_text).rjust(seqlen): sentiment} for tweet_text, sentiment in test_pairs
]

# we add one extra dimension to the end: slot len(chars) carries the
# sentiment code (2 = positive, 1 = negative, 0 = anything else)
print(char_indices['i'])
sentiment_codes = {"positive": 2, "negative": 1}
x_test = np.zeros((len(modified_test_list), seqlen, len(chars)+1), dtype=np.float32)
for j, tweet in enumerate(modified_test_list):
    for key, sentiment in tweet.items():
        sentiment_code = sentiment_codes.get(sentiment, 0)
        # Only the first `seqlen` characters fit into the array.
        for t, char in enumerate(key[:seqlen]):
            # Write the sentiment into row j, the tweet being encoded, at
            # every position, exactly as the training encoding does.
            x_test[j, t, len(chars)] = sentiment_code
            # Characters that never occurred in the training vocabulary have
            # no one-hot slot and are left as all zeros.
            char_index = char_indices.get(char)
            if char_index is not None:
                x_test[j, t, char_index] = 1
print(x_test.shape)

73
(3534, 160, 102)


In [24]:
# Predict the selection mask for one training tweet and decode it to text.
sample_index = 191
# Positions whose predicted probability reaches this value count as selected.
threshold = 0.5

sample_input = np.reshape(x[sample_index], [1, seqlen, len(chars) + 1, 1])
result = model.predict(sample_input)
print(result.shape)

print(result)

# The model emits one probability per character position of the padded tweet,
# so the prediction is decoded by keeping the tweet characters at the selected
# positions. (Indexing the character vocabulary with a position is
# meaningless, and fails for positions >= len(chars).)
padded_tweet = next(iter(modified_train_list[sample_index]))
result_str = ""
for line in result:
    result_str += "".join(
        char for char, probability in zip(padded_tweet, line) if probability >= threshold
    )
print(result_str)

(1, 160)
[[1.4732123e-08 7.9621767e-09 1.8425641e-08 3.8488550e-09 9.8294075e-09
  2.1445988e-08 9.4358619e-09 1.4148192e-08 1.8811814e-09 1.7070340e-08
  1.6420739e-08 9.1883177e-09 7.6642079e-09 4.2274384e-09 3.8533430e-09
  1.4433185e-08 9.3230144e-09 2.1185951e-08 9.2920001e-09 7.9632549e-09
  1.8301545e-08 3.0111995e-08 9.9489844e-06 2.1975047e-04 2.9175042e-03
  5.0318553e-03 6.3640155e-02 6.2087335e-02 1.7028089e-01 9.2650406e-02
  7.9224652e-01 8.9297527e-01 9.3339097e-01 9.3311417e-01 9.7983974e-01
  8.4829724e-01 4.8988199e-01 8.2814246e-01 7.7367580e-01 5.8356589e-01
  6.7528450e-01 7.3535204e-01 6.2919730e-01 5.7524884e-01 5.9493023e-01
  4.7282970e-01 5.5339354e-01 4.6021205e-01 3.3774406e-01 2.8125969e-01
  6.7550969e-01 7.1228999e-01 5.4452294e-01 4.6292257e-01 2.1744452e-01
  2.1123379e-01 1.3862301e-01 4.6564028e-02 1.7438577e-02 1.5830830e-02
  4.1960400e-02 5.4077387e-02 6.3835241e-02 1.0427474e-01 7.1503490e-02
  6.7876332e-02 3.0412534e-02 1.2042502e-02 3.3279501e-

In [23]:
# Print the raw text of training tweet 29. The slice selects just that row
# (and prints nothing if the list is shorter) without scanning the whole list.
for tweet in train_list[29:30]:
    for key in tweet:
        print(key)

# NOTE(review): the label printed here belongs to row 126, not to row 29
# above - confirm that comparing two different rows is intended.
print(label_list[126])

Went to sleep and there is a power cut in Noida  Power back up not working too
at dads, watching some mtv and am going on sims2 in a minutee
