In [1]:
import pickle
import nltk
import string
import pandas as pd
import regex as re
import tensorflow as tf
from nltk.corpus import stopwords, words
from nltk.tokenize import word_tokenize,wordpunct_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from tensorflow.keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

pd.set_option('display.max_colwidth', 140)

Using TensorFlow backend.


In [2]:
f = open("test_tweet.txt", errors = 'ignore')
input_file = f.read()

input_file[0:500]

'1\t50% off ticket price 4 Ruby Revue Melbourne 2nite. 1st 20 buyers!! Only $12.50! Normally $25pp!!  http://www.therubyrevue.com CODE "STAR"\n1\tVote for your TOP 50 Burlesque Performers!!: VOTE NOW! http://bit.ly/7cBoC9\n1\tPlease follow this link to submit your articles for publishing on the Jac Bowie site  http://bit.ly/5zQ4k9\n1\tHi Guys my Facebook account has been disabled. Please show your support by adding this page :) http://bit.ly/611JQe xx Jac\n1\tIMPORTANT MESSAGE TO JAC BOWIE FRIENDS: Please'

In [3]:
parsedData = input_file.replace('\t', '\n').split('\n')
#parsedData[0:8]

textList = parsedData[1::2]
#textList[0:5]

labelList = parsedData[0::2]
#labelList

In [23]:
tweet_df = pd.DataFrame({'Text': textList, 'Label': labelList})

original_df = tweet_df.copy()

In [5]:
words = words.words()
newWords = ['url','retweet','usermention','hashtag']
words.extend(newWords)
words = set(words)
stop_words = set(stopwords.words("english"))
wn = WordNetLemmatizer()

In [11]:
def clean_data(trial_text):
    for i in trial_text.index:
        trial_text.at[i,"Text"] = re.sub(r"http\S+", "url", trial_text.at[i,"Text"])
        trial_text.at[i,"Text"] = re.sub(r"@\S+", "usermention", trial_text.at[i,"Text"])
    trial_text.drop_duplicates(subset='Text', keep='first', inplace=True)
    for i in trial_text.index:
        trial_text.at[i,"Text"] = re.sub(r"#", "", trial_text.at[i,"Text"])
        trial_text.at[i,"Text"] = re.sub(r"\bRT\b", "retweet", trial_text.at[i,"Text"])
        trial_text.at[i,"Text"] = ' '.join(word for word in wordpunct_tokenize(trial_text.at[i,"Text"]) 
                                           if word.lower() not in stop_words)
        trial_text.at[i,"Text"] = ' '.join(word.strip(string.punctuation) for word in trial_text.at[i,"Text"].split())
        trial_text.at[i,"Text"] = wordpunct_tokenize(trial_text.at[i,"Text"].lower())
        trial_text.at[i,"Text"] = ' '.join([wn.lemmatize(word, pos='v') for word in trial_text.at[i,"Text"]])
        trial_text.at[i,"Text"] = ' '.join(word for word in word_tokenize(trial_text.at[i,"Text"]) if word in words)
        trial_text.at[i,"Text"] = " ".join(word for word in word_tokenize(trial_text.at[i,"Text"]) 
                                           if not (word.isalpha() and len(word)<3))
    #Additional loop for processing glove dataset
#      for i in trial_text.index:
#          trial_text.at[i,"Text"] = re.sub(r"\burl\b", "<url>", trial_text.at[i,"Text"])
#          trial_text.at[i,"Text"] = re.sub(r"\busermention\b", "<user>", trial_text.at[i,"Text"]) 
#          trial_text.at[i,"Text"] = re.sub(r"\bhashtag\b", "<hashtag>", trial_text.at[i,"Text"])
#          trial_text.at[i,"Text"] = re.sub(r"\bretweet\b", "rt", trial_text.at[i,"Text"])
    trial_text['num_words']=trial_text["Text"].str.split().str.len()
    trial_text = trial_text.drop(trial_text[trial_text.num_words < 2].index)
    trial_text = trial_text.drop(['num_words'], axis=1)
    return trial_text

In [24]:
clean_tweet_df = clean_data(tweet_df)
clean_tweet_df

Unnamed: 0,Text,Label
0,ticket price ruby revue normally url code star,1
1,vote top burlesque vote url,1
2,please follow link submit article publish bowie site url,1
3,guy account disable please show support add page url,1
4,important message bowie please help join group support much url,1
5,wow chapel chilly morning chapel url,0
6,head tonight see production something wicked way come wan come,0
7,usermention see eye eye everything wicked smart well mean rare combination,0
8,election day please vote yes prop find poll place sample ballot url,0
9,watch daily show jane inspire woman url,0


In [25]:
#Load the tokenizer
with open('tokenizer_improved_260321.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

print("Number of vocabulary: {}\n".format(len(tokenizer.word_index)))
print(tokenizer.word_index)

Number of vocabulary: 44926



In [27]:
def text_processing(df):
    df_list = list(df.values)
    seq = tokenizer.texts_to_sequences(df_list)
    seq_pad = pad_sequences(seq, maxlen=20, padding='post')
    return seq_pad

In [28]:
test_tweet = text_processing(clean_tweet_df['Text'])
test_tweet[0::4]

array([[  427,   280,  3554, 11061,  3608,     3,   519,   188,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [  618,   494,  6055,    64,    63,   257,   396,   323,    55,
            3,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [ 1886,     8,    64,   252,    86,  3126,    73,  1451,   212,
         1639,  6604,     3,     0,     0,     0,     0,     0,     0,
            0,     0]])

In [29]:
tf.random.set_seed(1234)
model = load_model('rnn_improved_260321.hdf5', compile=False)

opt = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 50)            2246350   
_________________________________________________________________
lstm (LSTM)                  (None, 32)                10624     
_________________________________________________________________
dense (Dense)                (None, 32)                1056      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 2,258,063
Trainable params: 2,258,063
Non-trainable params: 0
_________________________________________________________________


In [30]:
y_pred = (model.predict(test_tweet)>0.5).astype("int32")

original_df['Prediction'] = y_pred
original_df

Unnamed: 0,Text,Label,Prediction
0,"50% off ticket price 4 Ruby Revue Melbourne 2nite. 1st 20 buyers!! Only $12.50! Normally $25pp!! http://www.therubyrevue.com CODE ""STAR""",1,1
1,Vote for your TOP 50 Burlesque Performers!!: VOTE NOW! http://bit.ly/7cBoC9,1,1
2,Please follow this link to submit your articles for publishing on the Jac Bowie site http://bit.ly/5zQ4k9,1,1
3,Hi Guys my Facebook account has been disabled. Please show your support by adding this page :) http://bit.ly/611JQe xx Jac,1,1
4,IMPORTANT MESSAGE TO JAC BOWIE FRIENDS: Please help Jac by joining this group! All of your support is so much... http://bit.ly/5CJ34k,1,1
5,"Wow, the chapel is chilly this morning â€” at LDS Chapel http://gowal.la/s/gEq",0,0
6,â€¦Â heading down to UVU tonight to see their production of Something Wicked this Way Comes! Wanna come? :),0,0
7,@AltF4LJDrama â€¦Â we don't see eye-to-eye on everything. But he's wicked-smart and well-meaning. A rare combination.,0,0
8,"â€¦ It's Election Day! If you're in SLC, please vote YES on Prop 1. Find polling places + sample ballots here: http://elections.utah.gov/",0,0
9,Watching The Daily Show with Jane Goodall. Inspiring woman. http://bit.ly/1FYrXq,0,1
