In [None]:
pip install spacy

In [8]:
# import necessary libraries
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense, SimpleRNN, Activation, Dropout, Conv1D
from tensorflow.keras.layers import Embedding, Flatten, LSTM, GRU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
import pandas as pd
import numpy as np
import spacy
from sklearn.metrics import classification_report

In [9]:
data = pd.read_csv('/Users/venkatavarunnelakuditi/Downloads/training1600000.csv', header=None, encoding='latin-1')
data.head()

The shape of the original dataset is (1600000, 6)


Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [10]:
data.isnull().any()

0    False
1    False
2    False
3    False
4    False
5    False
dtype: bool

In [12]:
def load_glove_model(glove_file):
    print("[INFO]Loading GloVe Model...")
    model = {}
    with open(glove_file, 'r') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embeddings = [float(val) for val in split_line[1:]]
            model[word] = embeddings
    return model
nlp = spacy.load("en_core_web_sm")

def remove_stopwords(sentence):
    new = []
    # tokenize sentence
    sentence = nlp(sentence)
    for tk in sentence:
        if (tk.is_stop == False) & (tk.pos_ !="PUNCT"):
            new.append(tk.string.strip())
    # convert back to sentence string
    c = " ".join(str(x) for x in new)
    return c


def lemmatize(sentence):
    sentence = nlp(sentence)
    s = ""
    for w in sentence:
        s +=" "+w.lemma_
    return nlp(s)

def sent_vectorizer(sent, model):
    sent_vector = np.zeros(200)
    num_w = 0
    for w in sent.split():
        try:
            # add up all token vectors to a sent_vector
            sent_vector = np.add(sent_vector, model[str(w)])
            num_w += 1
        except:
            pass
    return sent_vector

In [13]:
data_X = data[data.columns[5]].to_numpy()
data_y = data[data.columns[0]]
data_y = pd.get_dummies(data_y).to_numpy()

In [14]:
# load the glove model
glove_model = load_glove_model("/Users/venkatavarunnelakuditi/Downloads/glovetwitter27B100d.txt")
# number of vocab to keep
max_vocab = 18000
# length of sequence that will generate
max_len = 15

tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(data_X)
sequences = tokenizer.texts_to_sequences(data_X)
word_index = tokenizer.word_index
data_keras = pad_sequences(sequences, maxlen=max_len, padding="post")

[INFO]Loading GloVe Model...
[INFO] Done...1193514 words loaded!
Found 690960 unique tokens.


In [15]:
from sklearn.model_selection import train_test_split
train_X, valid_X, train_y, valid_y = train_test_split(data_keras, data_y, test_size = 0.3, random_state=42)

In [16]:
# calcultaete number of words
nb_words = len(tokenizer.word_index) + 1

# obtain the word embedding matrix
embedding_matrix = np.zeros((nb_words, 100))
for word, i in word_index.items():
    embedding_vector = glove_model.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Null word embeddings: 560169


In [17]:
def build_model(nb_words, embedding_matrix=None):
    '''
    build_model function:
    inputs: 
        rnn_model - which type of RNN layer to use, choose in (SimpleRNN, LSTM, GRU)
        embedding_matrix - whether to use pretrained embeddings or not
    '''
    model = Sequential()
    # add an embedding layer
    if embedding_matrix is not None:
        model.add(Embedding(nb_words, 
                        100, 
                        weights=[embedding_matrix], 
                        input_length= max_len,
                        trainable = False))
    else:
        model.add(Embedding(nb_words, 
                        100, 
                        input_length= max_len,
                        trainable = False))
        
    # add an RNN layer according to rnn_model
    model.add(LSTM(200))
    # model.add(Dense(500,activation='relu'))
    # model.add(Dense(500, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', 
                optimizer='adam',
                metrics=['accuracy'])
    return model

In [18]:
model_rnn = build_model(nb_words, embedding_matrix)
model_rnn.fit(train_X, train_y, epochs=20, batch_size=120,
          validation_data=(valid_X, valid_y), callbacks=EarlyStopping(monitor='val_accuracy', mode='max',patience=3))
predictions = model_rnn.predict(valid_X)
predictions = predictions.argmax(axis=1)
print(classification_report(valid_y.argmax(axis=1), predictions))

Epoch 1/20

KeyboardInterrupt: 

In [16]:
model_rnn.save("lstm.h5")

In [19]:
from numpy import loadtxt
from keras.models import load_model
# Load the previously saved weights
model=load_model("lstm.h5")

In [20]:
data = pd.read_csv('output.csv')
data.head()

Unnamed: 0,tweet,location,favorites,followers,timestamp,sentiment,prediction
0,Financial records reveal Joe Biden had $5.2mil...,Null,1.0,0.0,Sat Apr 30 19:52:31 +0000 2022,4.0,tech
1,Glory be to Ukraines dedicated military,Null,1.0,102.0,Sat Apr 30 19:52:31 +0000 2022,4.0,entertainment
2,RT @Osinttechnical: Ukrainian forces continue ...,Null,1.0,42.0,Sat Apr 30 19:52:31 +0000 2022,4.0,business
3,RT @oryxspioenkop: Answering The Call: Heavy W...,,,,,4.0,business
4,Updated with:,,,,,4.0,tech


In [21]:
data2 = pd.read_csv('output2.csv')
data2.head()

Unnamed: 0,tweet,location,favorites,followers,timestamp,sentiment,prediction
0,RT @letvar5: A thread about interesting things...,fs0c131y@protonmail.com,1.0,243185.0,Sat Apr 30 21:15:34 +0000 2022,4.0,tech
1,RT @JesseKellyDC: Remember when we vaporized 1...,Null,1.0,9.0,Sat Apr 30 21:15:34 +0000 2022,4.0,politics
2,RT @RonnyJacksonTX: This White House refuses t...,Null,1.0,1824.0,Sat Apr 30 21:15:34 +0000 2022,4.0,politics
3,RT @Jim_Jordan: Will Joe Biden’s “disinformati...,SOL System PLANET Earth,1.0,3.0,Sat Apr 30 21:15:34 +0000 2022,4.0,entertainment
4,RT @anders_aslund: This is at least the 9th ge...,Null,1.0,13.0,Sat Apr 30 21:15:34 +0000 2022,4.0,business


In [22]:
data3 = pd.read_csv('output3.csv')
data3.head()

Unnamed: 0,tweet,location,favorites,followers,timestamp,sentiment,prediction
0,RT @BarrysComputer: Trump diehards in the Hous...,"California, USA",1.0,5683.0,Sat Apr 30 23:13:31 +0000 2022,4.0,politics
1,"@washingtonpost Good, hold Biden and DHS respo...",Null,1.0,11.0,Sat Apr 30 23:13:31 +0000 2022,4.0,politics
2,@jmtreymerritt @SchmouPoo1 @AaronParnas HORSE ...,,,,,4.0,sport
3,The Saudis are now siding with Putin over Bide...,"Nebraska, USA",1.0,3839.0,Sat Apr 30 23:13:34 +0000 2022,4.0,business
4,RT @chefjoseandres: .@WCKitchen way! #ChefsFor...,Null,1.0,352.0,Sat Apr 30 23:13:34 +0000 2022,4.0,tech


In [23]:
df=pd.concat([data,data2,data3])

In [24]:
df.count()

tweet         330979
location      229303
favorites     229267
followers     229243
timestamp     229243
sentiment     330979
prediction    330979
dtype: int64

In [31]:
data_X = df["tweet"].to_numpy()
sequences = tokenizer.texts_to_sequences(data_X)
data_keras = pad_sequences(sequences, maxlen=max_len, padding="post")

In [32]:
predictions = model.predict(data_keras)
predictions = predictions.argmax(axis=1)

In [33]:
df["LSTM_LEARNED_SENTIMENTS"]=predictions

In [35]:
df.to_csv("outputFromLSTM")

In [None]:
df.