In [429]:
import numpy as np
import tensorflow
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Input
from keras.layers import Flatten
from keras.layers.merge import concatenate
from keras.layers import Dropout
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.models import load_model
import pandas as pd
from bs4 import BeautifulSoup
import re
import statistics
from statistics import *
from sklearn.svm import LinearSVC,SVC
from sklearn.naive_bayes import MultinomialNB
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, precision_recall_fscore_support

In [430]:
file = 'trainingObamaRomneytweets.xlsx'
Raw_file = pd.ExcelFile(file)
df_Obama = Raw_file.parse('Obama',skiprows = 1)
df_Obama = df_Obama[['1: positive, -1: negative, 0: neutral, 2: mixed', 'Class', 'Your class']]
df_Obama.rename(columns={'1: positive, -1: negative, 0: neutral, 2: mixed': 'Tweets'}, inplace=True)
df_Obama.dropna(subset=['Tweets'], inplace=True)
df_Obama.dropna(subset=['Class'], inplace=True)
df_Obama = df_Obama[(df_Obama.Class == 0) | (df_Obama.Class == 1) | (df_Obama.Class == -1)]
df_Obama.head()

Unnamed: 0,Tweets,Class,Your class
0,"Kirkpatrick, who wore a baseball cap embroider...",0,
2,#<e>obama</e> debates that Cracker Ass Cracker...,1,
4,@Hollivan @hereistheanswer Youre missing the ...,0,
6,I was raised as a Democrat left the party yea...,-1,
7,The <e>Obama camp</e> can't afford to lower ex...,0,


In [431]:
df_test_Obama = pd.read_csv("Obama_Test_dataset_NO_Label.csv", encoding = "iso-8859-1")
print(df_test_Obama.head())

   Tweet_ID                                         Tweet_text
0         1  <e>Obama</e> has to maintain his professionali...
1         2  <e>Obama</e> went into the debate swinging and...
2         3  Ditto. I started @247LS 4 years ago. RT @bmorr...
3         4  I absolutely love <e>Obama</e>'s view in <a>im...
4         5  I'm agreeing completely with <e>Obama</e>'s st...


In [226]:
#df_Obama = pd.concat([df_Obama, df_Romney])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


In [432]:
def preprocesstweets(tweet):
    #if type(tweet) is str:
    tweet = tweet.lower()
    tweet = BeautifulSoup(tweet, "html.parser")
    tweet = tweet.get_text()
    tweet = re.sub(r"http\S+", '', tweet)
    tweet = re.sub(r'www.[^ ]+','', tweet)
    tweet = re.sub(r'@[A-Za-z0-9]+','',tweet)
    tweet = re.sub(r"[^A-Za-z\s]+", '', tweet)
    return tweet
    

df_Obama['Tweets'] = df_Obama['Tweets'].apply(preprocesstweets)
df_test_Obama['Tweet_text'] = df_test_Obama['Tweet_text'].apply(preprocesstweets)
df_Obama.head()

Unnamed: 0,Tweets,Class,Your class
0,kirkpatrick who wore a baseball cap embroidere...,0,
2,obama debates that cracker ass cracker tonight...,1,
4,youre missing the point im afraid you do n...,0,
6,i was raised as a democrat left the party yea...,-1,
7,the obama camp cant afford to lower expectatio...,0,


In [433]:
lemmatizer = WordNetLemmatizer()
def create_tokens(tweet):
    #if type(tweet) is str:
    tweet = nltk.word_tokenize(tweet)
    tweet_tokens = []
    for i in tweet:
        temp = lemmatizer.lemmatize(i, pos='v')
        temp = lemmatizer.lemmatize(temp, pos='a')
        if (i not in stopwords.words('english')) & (len(i) > 1):
            tweet_tokens.append(temp)
    return tweet_tokens

def tokens_nonames(tweet):
    name_words = ['mitt','romney','barack','obama','baracks','obamas','mitts','romneys']
    tweet_tokens_nonames = []
    for i in tweet:
        if i not in name_words:
            tweet_tokens_nonames.append(i)
    return tweet_tokens_nonames
                      
df_Obama['Tokens'] = df_Obama['Tweets'].apply(create_tokens)
df_Obama['Tokens_nonames'] = df_Obama['Tokens'].apply(tokens_nonames)
df_test_Obama['Tokens'] = df_test_Obama['Tweet_text'].apply(create_tokens)
df_test_Obama['Tokens_nonames'] = df_test_Obama['Tokens'].apply(tokens_nonames)
df_Obama.head() 

Unnamed: 0,Tweets,Class,Your class,Tokens,Tokens_nonames
0,kirkpatrick who wore a baseball cap embroidere...,0,,"[kirkpatrick, wear, baseball, cap, embroider, ...","[kirkpatrick, wear, baseball, cap, embroider, ..."
2,obama debates that cracker ass cracker tonight...,1,,"[obama, debate, cracker, ass, cracker, tonight...","[debate, cracker, ass, cracker, tonight, tune,..."
4,youre missing the point im afraid you do n...,0,,"[youre, miss, point, im, afraid, understand, b...","[youre, miss, point, im, afraid, understand, b..."
6,i was raised as a democrat left the party yea...,-1,,"[raise, democrat, leave, party, years, ago, li...","[raise, democrat, leave, party, years, ago, li..."
7,the obama camp cant afford to lower expectatio...,0,,"[obama, camp, cant, afford, low, expectations,...","[camp, cant, afford, low, expectations, tonigh..."


In [434]:
def detoken(tweet):
    detkn = ' '.join([i for i in tweet])
    return detkn
                      
df_Obama['Processed Tweets'] = df_Obama['Tokens_nonames'].apply(detoken)
df_test_Obama['Processed Tweets'] = df_test_Obama['Tokens_nonames'].apply(detoken)
df_Obama.head()

Unnamed: 0,Tweets,Class,Your class,Tokens,Tokens_nonames,Processed Tweets
0,kirkpatrick who wore a baseball cap embroidere...,0,,"[kirkpatrick, wear, baseball, cap, embroider, ...","[kirkpatrick, wear, baseball, cap, embroider, ...",kirkpatrick wear baseball cap embroider signat...
2,obama debates that cracker ass cracker tonight...,1,,"[obama, debate, cracker, ass, cracker, tonight...","[debate, cracker, ass, cracker, tonight, tune,...",debate cracker ass cracker tonight tune teamobama
4,youre missing the point im afraid you do n...,0,,"[youre, miss, point, im, afraid, understand, b...","[youre, miss, point, im, afraid, understand, b...",youre miss point im afraid understand big pict...
6,i was raised as a democrat left the party yea...,-1,,"[raise, democrat, leave, party, years, ago, li...","[raise, democrat, leave, party, years, ago, li...",raise democrat leave party years ago lifetime ...
7,the obama camp cant afford to lower expectatio...,0,,"[obama, camp, cant, afford, low, expectations,...","[camp, cant, afford, low, expectations, tonigh...",camp cant afford low expectations tonights deb...


In [435]:
tokenizer = Tokenizer()
bow_obama = tokenizer.fit_on_texts(df_Obama['Processed Tweets'].values)
bow_obama = tokenizer.texts_to_sequences(df_Obama['Processed Tweets'].values)

bow_test_obama = tokenizer.fit_on_texts(df_test_Obama['Processed Tweets'].values)
bow_test_obama = tokenizer.texts_to_sequences(df_test_Obama['Processed Tweets'].values)
#print(tokenizer.word_index)
#bow_obama


In [437]:
pad_obama = pad_sequences(bow_obama, maxlen=300)
pad_test_obama = pad_sequences(bow_test_obama, maxlen=300)
pad_obama.shape

(5470, 300)

In [438]:
X_train = pad_obama
Y_train = df_Obama['Class']
X_test = pad_test_obama
Y_train_NN = pd.get_dummies(Y_train).values

In [439]:
embeddings_index = dict()
f = open("glove.6B.50d.txt", encoding="utf8")
for line in f:
    #print(line)
    values = line.split()
    #print(values)
    word = values[0]
    #print(word)
    coefs = np.asarray(values[1:], dtype='float32')
    #print(coefs)
    embeddings_index[word] = coefs
    #print(embeddings_index)
f.close()

In [440]:
print('Loaded word vectors.',len(embeddings_index))

Loaded word vectors. 400000


In [441]:
vocab_size = len(tokenizer.word_index) + 1
#print(vocab_size)
embedding_matrix = np.zeros((vocab_size, 50))
#print(embedding_matrix)

In [442]:
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    #print(word)
    #print(embedding_vector)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [467]:
model = Sequential()
#print(vocab_size)
#print(embedding_matrix[4])

In [468]:
model.add(Embedding(vocab_size, 50, weights=[embedding_matrix], input_length=300, trainable=True))

In [469]:
model.add(LSTM(30, return_sequences=True))
model.add(LSTM(30, return_sequences=True))
model.add(LSTM(20, return_sequences=False))

In [470]:
model.add(Dense(3, activation='softmax'))

In [471]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [472]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_48 (Embedding)     (None, 300, 50)           454950    
_________________________________________________________________
lstm_43 (LSTM)               (None, 300, 30)           9720      
_________________________________________________________________
lstm_44 (LSTM)               (None, 300, 30)           7320      
_________________________________________________________________
lstm_45 (LSTM)               (None, 20)                4080      
_________________________________________________________________
dense_25 (Dense)             (None, 3)                 63        
Total params: 476,133
Trainable params: 476,133
Non-trainable params: 0
_________________________________________________________________


In [484]:
model.fit(X_train, Y_train_NN, epochs=4, batch_size=64, validation_split= 0.1)

Train on 4923 samples, validate on 547 samples
Epoch 1/4
 128/4923 [..............................] - ETA: 1:19 - loss: 0.6669 - acc: 0.7500

KeyboardInterrupt: 

In [474]:
model.save('model_LSTM_Obama.h5')

In [485]:
model_LSTM = load_model('model_LSTM_Obama.h5')

In [486]:
pred_LSTM = model_LSTM.predict(X_test)
ans = np.argmax(pred_LSTM, axis=1)

Y_LSTM= []
for i in ans:
    if i == 0:
        Y_LSTM.append(-1)
    if i == 1:
        Y_LSTM.append(0)
    if i == 2:
        Y_LSTM.append(1)

In [487]:
print(Y_LSTM)

[0, 1, -1, -1, -1, 0, 1, 0, 0, -1, 0, 1, 1, 0, -1, -1, 0, -1, 1, 0, 0, 0, 0, -1, 1, 1, 1, -1, 1, 0, 1, 0, -1, -1, 0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 1, -1, 0, -1, -1, 0, 0, 0, 1, 0, -1, -1, 0, 1, -1, 1, 1, 1, 0, 0, 1, 1, -1, -1, 1, -1, 0, -1, 1, 1, 1, 0, -1, 1, 0, 1, 1, 0, 1, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, -1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, -1, 0, 0, -1, 0, -1, -1, -1, 1, -1, -1, 1, 0, -1, -1, 0, 0, 0, 1, 1, 1, 0, -1, 0, 1, 0, 1, 1, 0, 0, -1, 1, 1, 1, -1, 1, 0, 1, 0, 1, 0, 0, -1, 1, 1, -1, 0, 1, 1, 1, 1, 0, -1, 0, 1, 0, 0, 1, 0, 0, 1, -1, 0, 1, 1, 0, 0, -1, -1, 1, 1, -1, 1, -1, 0, 0, -1, 1, 0, -1, 1, 1, -1, 0, -1, 1, 0, 1, 0, 1, -1, 0, 1, -1, -1, 1, 1, 0, -1, 1, -1, -1, 0, 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, 0, 1, 0, -1, 0, 1, 0, 1, 1, 1, -1, 1, 0, 0, -1, 0, 0, -1, 0, -1, -1, 1, 0, 1, 0, 1, 0, -1, 0, 1, 0, 1, 1, 1, -1, 0, -1, 0, 1, 0, 0, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 0, 0, 1, 1, 0, -1, 0, 0, -1, -1, 0, 1, -1, 0, 0, 1, 1, 1, 0, -1, 1, 1, 1, 0, 1, 1, 1, -1, 1, 0, -1, 1

In [488]:
Output = "praveen_chandrasekaran_saikrishnan_thiruvarpuneelakantan_Obama.txt"

def Filetxt(fname, output):
    f=open(fname, "w+")
    for i in range(len(output)):
        f.write(str(df_test_Obama.loc[i,"Tweet_ID"]) + ";;" + str(output[i]) + "\n")

Filetxt(Output,Y_LSTM)
