Let's train a model to predict the sentiment of IMDb reviews and see how it does on our tweets (which have been self-classified). If it does well, I would rather use this model, since it has been verified by more users.

In [1]:
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *
import pandas as pd
import numpy as np
import tensorflow.keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences




In [2]:
# Load the two labelled CSV splits; each is read for its 'text' and 'label' columns.
train = pd.read_csv('~/Downloads/drive-download-20210309T002106Z-001/Train.csv')
valid = pd.read_csv('~/Downloads/drive-download-20210309T002106Z-001/Test.csv')

# Features and targets as NumPy arrays (Test.csv serves as the validation split).
x_tr = train['text'].values
y_tr = train['label'].values
x_val = valid['text'].values
y_val = valid['label'].values

# Build the word -> integer vocabulary from the training text only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(x_tr))

# Encode each text as an integer sequence, then pad/truncate every sequence
# to a fixed length of 100 so they can be batched together.
x_tr_seq = pad_sequences(tokenizer.texts_to_sequences(x_tr), maxlen=100)
x_val_seq = pad_sequences(tokenizer.texts_to_sequences(x_val), maxlen=100)

In [3]:
# embeddings_index = dict()
# f = open('/content/drive/MyDrive/UHP 195/crawl-300d-2M-subword.vec')
# for line in f:
#     values = line.split()
#     word = values[0]
#     coefs = np.asarray(values[1:], dtype='float32')
#     embeddings_index[word] = coefs

# f.close()
# print('Loaded %s word vectors.' % len(embeddings_index))

In [4]:

# Embedding-layer inputs: one 300-dimensional row per vocabulary index.
# The +1 adds one row beyond the tokenizer's word count (presumably for
# index 0, the padding value -- confirm against the Tokenizer docs).
size_of_vocabulary = 1 + len(tokenizer.word_index)

# The matrix stays all zeros here: the loop that would copy pretrained
# vectors into it is commented out further down in this cell.
embedding_matrix = np.zeros(shape=(size_of_vocabulary, 300))

# none_vals = 0
# for word, i in tokenizer.word_index.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector
#     else:
#         none_vals+=1
# print(none_vals/size_of_vocabulary)

In [5]:

# Binary sentiment classifier:
# frozen embedding -> LSTM -> max-pool over timesteps -> dense head.
model=Sequential()

# Embedding weights are initialised from embedding_matrix and frozen
# (trainable=False); input_length=100 matches the maxlen used when padding.
model.add(Embedding(size_of_vocabulary,300,weights=[embedding_matrix],input_length=100,trainable=False))

# LSTM layer; return_sequences=True keeps the output for every timestep so
# the pooling layer below has a sequence to reduce.
model.add(LSTM(128,return_sequences=True,dropout=0.2))

# Global max pooling collapses the timestep axis: (100, 128) -> (128,)
model.add(GlobalMaxPooling1D())

# Dense head ending in a single sigmoid unit (one score per input text)
model.add(Dense(64,activation='relu'))
model.add(Dense(1,activation='sigmoid'))

# Loss function, metrics, optimizer. The metric is registered as "acc", which
# is why the checkpoint callback below monitors 'val_acc'.
model.compile(optimizer='adam', loss='binary_crossentropy',metrics=["acc"])

# Callbacks: stop after 3 epochs without val_loss improvement, and keep only
# the checkpoint with the best validation accuracy. They are only consumed by
# the model.fit call, which is commented out in the next cell.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=3)
# NOTE(review): this checkpoint path is a Colab Drive location, while the other
# paths in this notebook are local/relative -- confirm before re-enabling training.
mc=ModelCheckpoint('/content/drive/MyDrive/UHP 195/best_model.h5', monitor='val_acc', mode='max', save_best_only=True,verbose=1)

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 300)          33661200  
_________________________________________________________________
lstm (LSTM)                  (None, 100, 128)          219648    
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 33,889,169
Trainable params: 227,969
Non-trainable params: 33,661,200
_________________________________________________________________
None


In [6]:
# history = model.fit(np.array(x_tr_seq),np.array(y_tr),batch_size=128,epochs=10,validation_data=(np.array(x_val_seq),np.array(y_val)),verbose=1,callbacks=[es,mc])

In [7]:
# Restore previously saved weights into the architecture built above instead
# of retraining (the model.fit call in the preceding cell is commented out).
# NOTE(review): embedding_matrix is all zeros in this notebook, so the embedding
# values presumably come from this file -- confirm it was saved from a model
# built with the same tokenizer vocabulary.
sentiment_weights_path = '../models/SentimentModel.h5'
model.load_weights(sentiment_weights_path)

In [8]:
# Tweets to be scored by the sentiment model.
# NOTE(review): the variable is called trump_tweets but the file read here is
# the "Global_Control" set -- confirm the name is intentional.
tweets_csv_path = "../data/Global_Control_Tweets.csv"
trump_tweets = pd.read_csv(tweets_csv_path)

In [9]:
# Keep only the rows whose 'language' column equals 'en'.
is_english = trump_tweets['language'].eq('en')
en_only = trump_tweets[is_english]

In [10]:
# import matplotlib.pyplot as plt


In [11]:
# times = en_only[en_only['date'] == '2020-11-03']['time']

In [12]:
# times

In [13]:
# from collections import Counter
# time_count = Counter(times)

In [14]:
# for row in en_only.iterrows():
#     print(row)
#     break

In [15]:
# new_map = {}
# for time in time_count:
#     minute = int(time[:2])*60 + time[3:5]
#     if minute not in new_map:
#         new_map[minute] = time_count[time]
#     else:
#         new_map[minute] += time_count[time]

# x = []
# y = []
# for key in new_map:
#     x.append(key)
#     y.append(new_map[key])

In [16]:
# en_only['date'][2]

In [17]:
# import datetime
# date_time_str = en_only['date'][2] + ' ' + en_only['time'][2]
# date_time_obj = datetime.datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S')
# date_time_obj

In [18]:
# plt.plot(x,y)

In [19]:
# trump_tweets['tweet'].values.astype(str)

In [20]:
# en_only

In [21]:
# Encode the English tweets with the same tokenizer and the same fixed
# length (100) that were used for the training text.
tweet_texts = en_only['tweet'].values.astype(str)  # cast every entry to str before tokenizing
tweet_sequences = tokenizer.texts_to_sequences(tweet_texts)
t = np.array(pad_sequences(tweet_sequences, maxlen=100))

In [22]:
# len(t)

In [None]:
# Score every encoded tweet. The network ends in a single sigmoid unit, so
# each row of pred holds one value between 0 and 1.
# NOTE(review): which end of that range means "positive" depends on how the
# 'label' column of Train.csv is encoded -- confirm.
pred = model.predict(t)

In [None]:
# Sanity check: number of predictions produced (one row per encoded tweet).
pred.shape[0]

In [None]:
# Attach the model score to each English tweet as a 'Sentiment' column at
# column position 10.
# DataFrame.insert raises ValueError if the column already exists, so drop a
# 'Sentiment' column left over from an earlier run; this keeps the cell
# re-runnable with the same result.
if 'Sentiment' in en_only.columns:
    en_only = en_only.drop(columns='Sentiment')
# pred has a trailing axis of size 1 (the model ends in Dense(1)); flatten it
# so insert receives an unambiguous 1-D column. Values are assigned by
# position, matching the row order of en_only that produced t.
en_only.insert(10, 'Sentiment', np.ravel(pred))

In [None]:
# Display the English-only frame for a visual check.
en_only

In [None]:
# Columns left out of the processed export.
drop_cols = [
    'near', 'geo', 'source',
    'user_rt', 'user_rt_id',
    'retweet_id', 'retweet_date',
    'translate', 'trans_src', 'trans_dest',
]
new_tweets = en_only.drop(columns=drop_cols)
new_tweets

In [None]:
# Write the trimmed, scored tweets to disk (the index is written as well,
# since no index option is passed).
new_tweets.to_csv('../data/Processed_global_control.csv')