## Data Cleaning and Preprocessing

In [1]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.models import Sequential, Model, load_model
from keras.layers.embeddings import Embedding
from keras.layers import Flatten, Dense, Dropout, Convolution1D, MaxPooling1D, SpatialDropout1D, Input 
from keras.layers import GlobalMaxPooling1D, concatenate, LSTM, Bidirectional
from keras.optimizers import Adam
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import scipy.stats as stats
import string
import re
from sklearn.base import BaseEstimator, TransformerMixin

def reset_graph(seed=42):
    """Reset the default TensorFlow graph and re-seed the random generators.

    Args:
        seed: Integer passed to both ``np.random.seed`` and
            ``tf.set_random_seed``. Defaults to 42.
    """
    # NumPy's global RNG does not depend on the graph, so seed it first.
    np.random.seed(seed)
    # The TensorFlow seed is set after the reset, as in the original order.
    tf.reset_default_graph()
    tf.set_random_seed(seed)

Using TensorFlow backend.


In [2]:
# Text cleaner found on github: https://github.com/martinpella/twitter-airlines/blob/master/utils.py
class TextCleaner(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn style transformer that normalises tweet text.

    ``transform`` pushes every tweet through these steps, in this order:
    remove @mentions -> remove URLs -> drop everything that is not an ASCII
    letter or whitespace -> collapse whitespace -> lower-case -> expand a
    fixed list of slang words and typos.

    All regular expressions are raw strings (the previous ``'\\s'`` style
    literals relied on invalid string escapes) and are compiled once at class
    creation instead of on every call.
    """

    _MENTION_RE = re.compile(r'@\w+')
    _URL_RE = re.compile(r'http.?://[^\s]+[\s]?')
    _NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')
    _WHITESPACE_RE = re.compile(r'\s+')

    # Whole-word substitutions, applied sequentially in the listed order.
    # The input is expected to be lower-case already (see ``transform``).
    _WORD_FIXES = tuple(
        (re.compile(r'\b%s\b' % word), replacement)
        for word, replacement in (
            ('thx', 'thanks'),
            ('u', 'you'),
            ('hrs', 'hours'),
            ('aa', 'a'),
            ('flightr', 'flight'),
            ('ur', 'your'),
            ('hr', 'hour'),
            ('thru', 'through'),
            ('r', 'are'),
            ('ppl', 'people'),
            # "tix" is shorthand for "tickets"; it used to be mapped to
            # "fix", which looks like a typo in the replacement.
            ('tix', 'tickets'),
            ('plz', 'please'),
            ('flightd', 'flighted'),
            ('tmrw', 'tomorrow'),
            ('pls', 'please'),
            ('fyi', 'for your information'),
            ('heyyyy', 'hey'),
            ('guyyyys', 'guys'),
            ('yall', 'you all'),
            ('asap', 'as soon as possible'),
            ('btw', 'by the way'),
            ('dm', 'direct message'),
            ('cudtomers', 'customers'),
            ('wtf', 'what the fuck'),
            ('iphone', 'phone'),
            ('mins', 'minutes'),
            ('tv', 'television'),
            ('okay', 'ok'),
            ('feb', 'february'),
            ('yr', 'year'),
            ('shes', 'she is'),
            ('nope', 'no'),
            ('hes', 'he is'),
            ('till', 'until'),
            ('omg', 'oh my god'),
            ('tho', 'though'),
            ('nothappy', 'not happy'),
            ('thankyou', 'thank you'),
        )
    )

    def remove_mentions(self, text):
        """Return ``text`` with every ``@handle`` removed."""
        return self._MENTION_RE.sub('', text)

    def remove_urls(self, text):
        """Return ``text`` with http(s) URLs (and one trailing space) removed."""
        return self._URL_RE.sub('', text)

    def only_characters(self, text):
        """Return ``text`` keeping only ASCII letters and whitespace."""
        return self._NON_LETTER_RE.sub('', text)

    def remove_extra_spaces(self, text):
        """Collapse whitespace runs to one space and trim both ends."""
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def to_lower(self, text):
        """Return ``text`` lower-cased."""
        return text.lower()

    def fix_words(self, text):
        """Expand the slang/typo words listed in ``_WORD_FIXES``."""
        for pattern, replacement in self._WORD_FIXES:
            text = pattern.sub(replacement, text)
        return text

    def _clean_text(self, text):
        """Run the full cleaning pipeline on a single string."""
        text = self.remove_mentions(text)
        text = self.remove_urls(text)
        text = self.only_characters(text)
        text = self.remove_extra_spaces(text)
        text = self.to_lower(text)
        return self.fix_words(text)

    def fit(self, X, y=None, **fit_params):
        """No-op: the transformer learns nothing. Returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Clean every element of ``X``.

        Args:
            X: Object exposing ``.apply`` element-wise over strings; the
                notebook passes a pandas Series of tweet texts.

        Returns:
            The result of ``X.apply`` with the cleaning pipeline, i.e. the
            cleaned strings in the same order.
        """
        # One pass per tweet instead of six chained passes over the Series.
        return X.apply(self._clean_text)

In [3]:
# load the data
def load_labeled_tweets(path, key, label, binary_label):
    """Read one HDF5 tweet table and attach its class labels.

    Args:
        path: Path of the HDF5 file.
        key: Name of the table inside the file.
        label: String stored in the ``classification`` column.
        binary_label: Integer (0 or 1) stored in the ``binary_class`` column.

    Returns:
        DataFrame with exactly the columns ``classification``,
        ``binary_class`` and ``text``.
    """
    tweets = pd.read_hdf(path, key)
    tweets['classification'] = label
    tweets['binary_class'] = np.full(len(tweets), binary_label, dtype=int)
    return tweets[['classification', 'binary_class', 'text']]

relevant_tweets = load_labeled_tweets(
    'datasets/relevant_tweets.h5', 'relevant_tweets', 'relevant', 1)
irrelevant_tweets = load_labeled_tweets(
    'datasets/not_relevant_tweets.h5', 'not_relevant_tweets', 'irrelevant', 0)

# ignore_index=True renumbers the rows 0..n-1 directly, replacing the former
# reset_index() + "drop the first column by position" two-step.
df = pd.concat([relevant_tweets, irrelevant_tweets], ignore_index=True)
df.head()

Unnamed: 0,classification,binary_class,text
0,relevant,1,"Sun, Sand And Sewage: Report Shows Many U.S. B..."
1,relevant,1,"Many U.S. Beaches Are Unsafe For Swimming, Rep..."
2,relevant,1,"Many U.S. Beaches Are Unsafe For Swimming, Rep..."
3,relevant,1,"Sun, Sand And Sewage: Report Shows Many U.S. B..."
4,relevant,1,"Sun, Sand And Sewage: Report Shows Many U.S. B..."


In [4]:
# Clean the text: run the whole `text` column through the TextCleaner pipeline
# and keep the result next to the raw tweets.
tc = TextCleaner()
df['cleaned_text'] = df['text'].pipe(tc.transform)
df.head()

Unnamed: 0,classification,binary_class,text,cleaned_text
0,relevant,1,"Sun, Sand And Sewage: Report Shows Many U.S. B...",sun sand and sewage report shows many us beach...
1,relevant,1,"Many U.S. Beaches Are Unsafe For Swimming, Rep...",many us beaches are unsafe for swimming report...
2,relevant,1,"Many U.S. Beaches Are Unsafe For Swimming, Rep...",many us beaches are unsafe for swimming report...
3,relevant,1,"Sun, Sand And Sewage: Report Shows Many U.S. B...",sun sand and sewage report shows many us beach...
4,relevant,1,"Sun, Sand And Sewage: Report Shows Many U.S. B...",sun sand and sewage report shows many us beach...


In [5]:
# Character class of every punctuation mark that becomes its own token.
# string.punctuation must be escaped before it is placed inside [...]:
# unescaped, its `\]` is read as an escaped bracket, so the backslash itself
# was never matched and the class only worked by accident for `,-.`.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s):
    """Split ``s`` into whitespace-separated tokens.

    Each punctuation character matched by ``re_tok`` is surrounded with
    spaces first, so it comes out as a separate token.

    Args:
        s: String to tokenize.

    Returns:
        List of token strings; an empty list for an empty or blank string.
    """
    return re_tok.sub(r' \1 ', s).split()

# One token list per cleaned tweet.
df['tokenized'] = df['cleaned_text'].apply(tokenize)
df.head()

Unnamed: 0,classification,binary_class,text,cleaned_text,tokenized
0,relevant,1,"Sun, Sand And Sewage: Report Shows Many U.S. B...",sun sand and sewage report shows many us beach...,"[sun, sand, and, sewage, report, shows, many, ..."
1,relevant,1,"Many U.S. Beaches Are Unsafe For Swimming, Rep...",many us beaches are unsafe for swimming report...,"[many, us, beaches, are, unsafe, for, swimming..."
2,relevant,1,"Many U.S. Beaches Are Unsafe For Swimming, Rep...",many us beaches are unsafe for swimming report...,"[many, us, beaches, are, unsafe, for, swimming..."
3,relevant,1,"Sun, Sand And Sewage: Report Shows Many U.S. B...",sun sand and sewage report shows many us beach...,"[sun, sand, and, sewage, report, shows, many, ..."
4,relevant,1,"Sun, Sand And Sewage: Report Shows Many U.S. B...",sun sand and sewage report shows many us beach...,"[sun, sand, and, sewage, report, shows, many, ..."


In [6]:
# Keep the first occurrence of each cleaned tweet and renumber the rows;
# drop=True discards the old index instead of adding it as a column.
df = df.drop_duplicates(subset = ['cleaned_text']).reset_index(drop = True)
print(len(df))
df.head()

62688


Unnamed: 0,classification,binary_class,text,cleaned_text,tokenized
0,relevant,1,"Sun, Sand And Sewage: Report Shows Many U.S. B...",sun sand and sewage report shows many us beach...,"[sun, sand, and, sewage, report, shows, many, ..."
1,relevant,1,"Many U.S. Beaches Are Unsafe For Swimming, Rep...",many us beaches are unsafe for swimming report...,"[many, us, beaches, are, unsafe, for, swimming..."
2,relevant,1,"Many U.S. Beaches Are Unsafe For Swimming, Rep...",many us beaches are unsafe for swimming report...,"[many, us, beaches, are, unsafe, for, swimming..."
3,relevant,1,"Sun, Sand And Sewage: Report Shows Many U.S. B...",sun sand and sewage report shows many us beach...,"[sun, sand, and, sewage, report, shows, many, ..."
4,relevant,1,"Thanks, EPA.\n\nSun, Sand And Sewage: Report S...",thanks epa sun sand and sewage report shows ma...,"[thanks, epa, sun, sand, and, sewage, report, ..."


In [7]:
from nltk.corpus import stopwords

In [16]:
import collections
# NLTK's English stop words plus three extra tokens ('amp', 'rt', 'cc'),
# minus the negations 'no' and 'not', which are kept in the tweets.
stop = (set(stopwords.words('english')) | {'amp', 'rt', 'cc'}) - {'no', 'not'}

def remove_stopwords(row):
    """Return the tokens of ``row`` that are not in the ``stop`` set.

    Args:
        row: Iterable of token strings.

    Returns:
        New list with the remaining tokens in their original order.
    """
    return [token for token in row if token not in stop]

# Overwrites `tokenized` in place, so re-running this cell filters the
# already-filtered tokens again (a no-op here, but not a fresh start).
df['tokenized'] = df['tokenized'].apply(remove_stopwords)

# `cleaned_text` is dropped from the working frame at this point.
kept_columns = ['classification', 'binary_class', 'text', 'tokenized']
df = df[kept_columns]
vocab_counter = collections.Counter()

In [35]:
# (n_tweets, 1) object array whose single column holds each tweet's token list.
tokens = np.array(df['tokenized']).reshape(-1, 1)

# `words` is one token list per tweet; `indivs` is every token of every tweet
# in corpus order. Both are built directly instead of through nested index
# loops over the reshaped array.
words = list(df['tokenized'])
print(type(words))
indivs = [token for tweet in words for token in tweet]
indivs[:2]

<class 'list'>


['sun', 'sand']

In [38]:
vocabulary_size = 50000
from collections import Counter

# Reserved entries: <UNK> = unknown word (id 0), <THDL> = Twitter handle (id 1).
special_tokens = ["<UNK>", "<THDL>"]

# Leave room for the reserved entries so the vocabulary never exceeds
# vocabulary_size (the former `vocabulary_size - 1` allowed one word too many
# once two special tokens were prepended).
most_common_words = Counter(indivs).most_common(vocabulary_size - len(special_tokens))
vocabulary = np.array(special_tokens + [word for word, _ in most_common_words])
dictionary = {word: code for code, word in enumerate(vocabulary)}
# Words outside the vocabulary fall back to id 0, i.e. <UNK>.
data = np.array([dictionary.get(word, 0) for word in indivs])
print(len(vocabulary))
#print(len(data))

40002


In [48]:
# Number of tokens per tweet, then the longest, shortest and mean length.
lengths = df['tokenized'].map(len)
print(max(lengths), min(lengths), np.mean(lengths))

53 0 13.877185426237876


In [49]:
# set the max_length
max_length = 50

# Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['tokenized'].values, df['binary_class'].values,
                                                   test_size = 0.2, random_state = 42)

KeyError: 'token_int'

In [None]:
# padding our sequences
X_train = pad_sequences(X_train, maxlen = max_length, value = -1)
X_test = pad_sequences(X_test, maxlen = max_length, value = -1)

## Working CNN

In [None]:
# creating the embedding matrix
def embed_matrix(vocab_size, embedding_dimension):
    embedding_matrix = np.zeros((vocab_size, embedding_dimension))
    found = 0
    for word, i in word_to_id.items():
        embedding_vector = .get(word)
        if i < vocab_size:
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                found += 1
            else:
                embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))
    return embedding_matrix, found

# One embedding row per vocabulary entry. `vocab_size` was never defined in
# the cells above, and the function defined above is `embed_matrix`
# (`create_emb_matrix` does not exist).
vocab_size = len(vocabulary)
embedding_matrix, found = embed_matrix(vocab_size, 128)

print(found)

# Idea: run convolutions with several filter sizes over the word vectors,
# concatenate their outputs, and put a classifier on top.
def multilayer_cnn_model():
    """Unfinished sketch of the multi-filter CNN text classifier.

    In the lines visible here the function only creates a Keras ``Input`` and
    a ``Sequential`` stack (Embedding + Convolution1D); nothing is compiled
    or returned.
    """
    # NOTE(review): the input shape uses vocab_size, but the padded sequences
    # have length max_length - presumably this should be (max_length,); confirm.
    graph_input = Input(shape = (vocab_size, ))
    
    # NOTE(review): the Keras Embedding keyword is `input_length`, not
    # `input_len`, and Convolution1D() is called without filter count or
    # kernel size - confirm against the installed Keras version.
    model = Sequential([Embedding(input_dim = vocab_size, output_dim = 128, input_len = max_length),
        Convolution1D()
    ])