In [65]:
import numpy as np
import tensorflow
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Input
from keras.layers import Flatten
from keras.layers.merge import concatenate
from keras.layers import Dropout
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.models import load_model
import pandas as pd
from bs4 import BeautifulSoup
import re
import statistics
from statistics import *
from sklearn.svm import LinearSVC,SVC
from sklearn.naive_bayes import MultinomialNB
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, precision_recall_fscore_support

In [42]:
file = 'trainingObamaRomneytweets.xlsx'
Raw_file = pd.ExcelFile(file)
df_Romney = Raw_file.parse('Romney', skiprows = 1)
df_Romney = Raw_file.parse('Romney', skiprows = 1)
df_Romney = df_Romney[['1: positive, -1: negative, 0: neutral, 2: mixed', 'Class', 'Your class label']]
df_Romney.rename(columns={'1: positive, -1: negative, 0: neutral, 2: mixed': 'Tweets'}, inplace=True)
df_Romney.dropna(subset=['Tweets'], inplace=True)
df_Romney.dropna(subset=['Class'], inplace=True)
df_Romney = df_Romney[(df_Romney.Class == 0) | (df_Romney.Class == 1) | (df_Romney.Class == -1)]
df_Romney.head()

Unnamed: 0,Tweets,Class,Your class label
0,Insidious!<e>Mitt Romney</e>'s Bain Helped Phi...,-1,
2,.@WardBrenda @shortwave8669 @allanbourdius you...,-1,
3,<e>Mitt Romney</e> still doesn't <a>believe</a...,-1,
4,<e>Romney</e>'s <a>tax plan</a> deserves a 2nd...,-1,
5,Hope <e>Romney</e> debate prepped w/ the same ...,1,


In [48]:
df_test_Romney = pd.read_csv("Romney_Test_dataset_NO_Label.csv", encoding = "iso-8859-1")
print(df_test_Romney.head())

   Tweet_ID                                         Tweet_text
0         1  <e>Romney</e> got 3 less minutes and had to de...
1         2  <e>Mitt  </e>is beating him UP!  on his record...
2         3  I actually like  <e>Romney </e>'s response to ...
3         4  Just for that <a>immigration statement </a>tha...
4         5  This man  <e>Romney  </e>is tearing this dude ...


In [49]:
def preprocesstweets(tweet):
    #if type(tweet) is str:
    tweet = tweet.lower()
    tweet = BeautifulSoup(tweet, "html.parser")
    tweet = tweet.get_text()
    tweet = re.sub(r"http\S+", '', tweet)
    tweet = re.sub(r'www.[^ ]+','', tweet)
    tweet = re.sub(r'@[A-Za-z0-9]+','',tweet)
    tweet = re.sub(r"[^A-Za-z\s]+", '', tweet)
    return tweet
    

df_Romney['Tweets'] = df_Romney['Tweets'].apply(preprocesstweets)
df_test_Romney['Tweet_text'] = df_test_Romney["Tweet_text"].apply(preprocesstweets)
print(df_Romney.head())
print(df_test_Romney.head())

                                              Tweets Class  Your class label
0  insidiousmitt romneys bain helped philip morri...    -1               NaN
2            you mean like romney cheated in primary    -1               NaN
3  mitt romney still doesnt believe that we have ...    -1               NaN
4  romneys tax plan deserves a nd look because he...    -1               NaN
5  hope romney debate prepped w the same people a...     1               NaN
   Tweet_ID                                         Tweet_text
0         1  romney got  less minutes and had to debate can...
1         2  mitt  is beating him up  on his record on cred...
2         3  i actually like  romney s response to immigration
3         4  just for that immigration statement that  romn...
4         5  this man  romney  is tearing this dude up on e...


In [50]:
lemmatizer = WordNetLemmatizer()
def create_tokens(tweet):
    #if type(tweet) is str:
    tweet = nltk.word_tokenize(tweet)
    tweet_tokens = []
    for i in tweet:
        temp = lemmatizer.lemmatize(i, pos='v')
        temp = lemmatizer.lemmatize(temp, pos='a')
        if (i not in stopwords.words('english')) & (len(i) > 1):
            tweet_tokens.append(temp)
    return tweet_tokens

def tokens_nonames(tweet):
    name_words = ['mitt','romney','barack','obama','baracks','obamas','mitts','romneys']
    tweet_tokens_nonames = []
    for i in tweet:
        if i not in name_words:
            tweet_tokens_nonames.append(i)
    return tweet_tokens_nonames
                      
df_Romney['Tokens'] = df_Romney['Tweets'].apply(create_tokens)
df_Romney['Tokens_nonames'] = df_Romney['Tokens'].apply(tokens_nonames)

df_test_Romney['Tokens'] = df_test_Romney['Tweet_text'].apply(create_tokens)
df_test_Romney['Tokens_nonames'] = df_test_Romney['Tokens'].apply(tokens_nonames)
df_Romney.head() 

Unnamed: 0,Tweets,Class,Your class label,Tokens,Tokens_nonames
0,insidiousmitt romneys bain helped philip morri...,-1,,"[insidiousmitt, romneys, bain, help, philip, m...","[insidiousmitt, bain, help, philip, morris, ge..."
2,you mean like romney cheated in primary,-1,,"[mean, like, romney, cheat, primary]","[mean, like, cheat, primary]"
3,mitt romney still doesnt believe that we have ...,-1,,"[mitt, romney, still, doesnt, believe, black, ...","[still, doesnt, believe, black, president]"
4,romneys tax plan deserves a nd look because he...,-1,,"[romneys, tax, plan, deserve, nd, look, secret...","[tax, plan, deserve, nd, look, secret, one, th..."
5,hope romney debate prepped w the same people a...,1,,"[hope, romney, debate, prepped, people, last, ...","[hope, debate, prepped, people, last, time]"


In [53]:
def detoken(tweet):
    detkn = ' '.join([i for i in tweet])
    return detkn
                      
df_Romney['Processed Tweets'] = df_Romney['Tokens_nonames'].apply(detoken)

df_test_Romney['Processed Tweets'] = df_test_Romney['Tokens_nonames'].apply(detoken)
df_Romney.head()
df_test_Romney.head()

Unnamed: 0,Tweet_ID,Tweet_text,Tokens,Tokens_nonames,Processed Tweets
0,1,romney got less minutes and had to debate can...,"[romney, get, less, minutes, debate, candy, cr...","[get, less, minutes, debate, candy, crowley, s...",get less minutes debate candy crowley still pe...
1,2,mitt is beating him up on his record on cred...,"[mitt, beat, record, credibility, character]","[beat, record, credibility, character]",beat record credibility character
2,3,i actually like romney s response to immigration,"[actually, like, romney, response, immigration]","[actually, like, response, immigration]",actually like response immigration
3,4,just for that immigration statement that romn...,"[immigration, statement, romney, answer, enoug...","[immigration, statement, answer, enough, get, ...",immigration statement answer enough get vote
4,5,this man romney is tearing this dude up on e...,"[man, romney, tear, dude, economics]","[man, tear, dude, economics]",man tear dude economics


In [56]:
tokenizer = Tokenizer()
bow_romney = tokenizer.fit_on_texts(df_Romney['Processed Tweets'].values)
bow_romney = tokenizer.texts_to_sequences(df_Romney['Processed Tweets'].values)


bow_test_romney = tokenizer.fit_on_texts(df_test_Romney['Processed Tweets'].values)
bow_test_romney = tokenizer.texts_to_sequences(df_test_Romney['Processed Tweets'].values)


In [57]:
pad_romney = pad_sequences(bow_romney, maxlen=300)
pad_test_romney = pad_sequences(bow_test_romney, maxlen=300)
pad_test_romney.shape

(1900, 300)

In [58]:
X_train = pad_romney
Y_train = df_Romney['Class']
Y_train_NN = pd.get_dummies(Y_train).values
X_test = pad_test_romney

In [59]:
embeddings_index = dict()
f = open("glove.6B.50d.txt", encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [60]:
print('Loaded word vectors.',len(embeddings_index))

Loaded word vectors. 400000


In [61]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)
embedding_matrix = np.zeros((vocab_size, 50))
#print(embedding_matrix)

8661


In [64]:
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

(8661, 50)


In [78]:
model = Sequential()
model.add(Embedding(vocab_size, 50, weights=[embedding_matrix], input_length=300, trainable=True))
model.add(LSTM(50, return_sequences=True))
model.add(LSTM(50, return_sequences=True))
model.add(LSTM(30, return_sequences=False))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 300, 50)           433050    
_________________________________________________________________
lstm_16 (LSTM)               (None, 300, 50)           20200     
_________________________________________________________________
lstm_17 (LSTM)               (None, 300, 50)           20200     
_________________________________________________________________
lstm_18 (LSTM)               (None, 30)                9720      
_________________________________________________________________
dense_7 (Dense)              (None, 3)                 93        
Total params: 483,263
Trainable params: 483,263
Non-trainable params: 0
_________________________________________________________________


In [79]:
model.fit(X_train, Y_train_NN, epochs=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x28a6442e438>

In [80]:
model.save('model_LSTM.h5')

In [116]:
model_LSTM = load_model('model_LSTM.h5')

In [117]:
pred_LSTM = model_LSTM.predict(X_test)
ans = np.argmax(pred_LSTM, axis=1)

Y_LSTM= []
for i in ans:
    if i == 0:
        Y_LSTM.append(-1)
    if i == 1:
        Y_LSTM.append(0)
    if i == 2:
        Y_LSTM.append(1)

model_LSTM = load_model('model_LSTM.h5')

In [123]:
print(Y_LSTM)

[-1, -1, -1, -1, -1, -1, 0, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, 0, -1, -1, -1, -1, 0, 0, 1, -1, 1, -1, 1, 0, -1, -1, -1, -1, -1, -1, 1, -1, 0, 0, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, 0, 1, 1, 1, -1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, -1, 1, -1, 1, 1, -1, 0, -1, 1, 0, -1, -1, -1, 0, -1, 0, 0, 1, -1, -1, -1, -1, 1, 0, 0, -1, -1, -1, -1, 0, 0, -1, 1, 1, 1, 0, -1, 1, -1, 0, -1, 1, -1, -1, -1, 1, 0, -1, -1, 1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, 1, 0, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 0, 1, -1, -1, -1, 1, -1, 0, 1, -1, 1, -1, -1, 0, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, 1, -1, 0, 1, -1, -1, -1, -1, -1, -1, 0, -1, 0, 0, -1, 1, 0, 1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 0, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 0, 1, -1, -1, -1, 1, -1, 1, -1, 0, 0, -1, -1, -1, 1, -1, 0, -1, -1, 0, 1, -1, -1, 1, 0, -1, -1, -1, -1, -1, 

In [124]:
Output = "praveen_chandrasekaran_saikrishnan_thiruvarpuneelakantan_Romney.txt"

def Filetxt(fname, output):
    f=open(fname, "w+")
    for i in range(len(output)):
        f.write(str(df_test_Romney.loc[i,"Tweet_ID"]) + ";;" + str(output[i]) + "\n")

Filetxt(Output,Y_LSTM)
