###  Irony detection in tweets

```csv
Tweet index     Label   Tweet text
1       1       Sweet United Nations video. Just in time for Christmas. #imagine #NoReligion  http://t.co/fej2v3OUBR
2       1       @mrdahl87 We are rumored to have talked to Erv's agent... and the Angels asked about Ed Escobar... that's hardly nothing    ;)
3       1       Hey there! Nice to see you Minnesota/ND Winter Weather 
4       0       3 episodes left I'm dying over here
```


In [0]:
import pandas as pd
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize
from collections import Counter
from sklearn.model_selection import cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from array import array
from keras import layers
from keras.models import Sequential
from keras.layers import LSTM, TimeDistributed, Dense, Bidirectional, Dropout


In [0]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
filepath = 'SemEval2018-T3-train-taskA.txt'

In [0]:
all_data = []
#All the tweets, index and labels are read from the file and stored in a dataframe
def load_tweets(preserve_case = False):
    file =open(filepath, encoding="UTF-8")
    lines=  file.readlines()
    del lines[0]
    dict = {"Tweet index": [], "Label": [], "Tweet text": []}
    for line in lines:
      values = line.split('\t')
      label = int(values[1])
      dict["Tweet index"].append(values[0])
      dict["Label"].append(label)
      tweet = str.lower(values[2]) if preserve_case is False else values[2]
      dict["Tweet text"].append(tweet)
      values[0] = int(values[0])
      values[1] = label
      values[2] = word_tokenize(tweet)
      all_data.append(tuple(values))
    return pd.DataFrame(dict)

In [0]:
formatted_data = load_tweets()
formatted_data[0:5]

Unnamed: 0,Label,Tweet index,Tweet text
0,1,1,sweet united nations video. just in time for c...
1,1,2,@mrdahl87 we are rumored to have talked to erv...
2,1,3,hey there! nice to see you minnesota/nd winter...
3,0,4,3 episodes left i'm dying over here\n
4,1,5,"""i can't breathe!"" was chosen as the most nota..."


In [0]:
# format used to train the model in task 4
all_data[:3]

[(1,
  1,
  ['sweet',
   'united',
   'nations',
   'video',
   '.',
   'just',
   'in',
   'time',
   'for',
   'christmas',
   '.',
   '#',
   'imagine',
   '#',
   'noreligion',
   'http',
   ':',
   '//t.co/fej2v3oubr']),
 (2,
  1,
  ['@',
   'mrdahl87',
   'we',
   'are',
   'rumored',
   'to',
   'have',
   'talked',
   'to',
   'erv',
   "'s",
   'agent',
   '...',
   'and',
   'the',
   'angels',
   'asked',
   'about',
   'ed',
   'escobar',
   '...',
   'that',
   "'s",
   'hardly',
   'nothing',
   ';',
   ')']),
 (3,
  1,
  ['hey',
   'there',
   '!',
   'nice',
   'to',
   'see',
   'you',
   'minnesota/nd',
   'winter',
   'weather'])]

In [0]:
ironic_tweets = formatted_data.loc[formatted_data['Label'] == 1]
not_ironic_tweets = formatted_data.loc[formatted_data['Label'] == 0]

In [0]:
len(not_ironic_tweets)

1923

In [0]:
len(ironic_tweets)

1911

In [0]:
all_tweets = formatted_data['Tweet text'].str.cat(sep=' ')
tokenized_words = word_tokenize(all_tweets)
total_vocabulary = set(tokenized_words)
len(total_vocabulary)

13443

In [0]:
def bag_of_words():
  vectorizer = CountVectorizer()
  X = vectorizer.fit_transform(formatted_data['Tweet text'])
  return X

In [0]:
def bag_of_words_alternate():
  X = []
  for sentence in formatted_data['Tweet text']:
    x_sent = []
    sent_words = word_tokenize(sentence.lower())
    frequencies_words = Counter(sent_words)
    for voc in vocabulary:
      if voc in sent_words:
        x_sent.append(frequencies_words[voc])
      else:
        x_sent.append(0)
      X.append(x_sent)
  return X      

In [0]:
def bad_of_words_using_tf_df():
  tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True).tokenize
  vectorizer = TfidfVectorizer(strip_accents="unicode", analyzer="word", tokenizer=tokenizer)
  return vectorizer.fit_transform(formatted_data['Tweet text'])

##Naive Bayes Classifier
Naive Bayes is a classification technique based on Bayes’ Theorem with an assumption of independence among predictors. In simple terms, a Naive Bayes classifier assumes that the presence of a particular feature in a class is unrelated to the presence of any other feature

It is easy and fast to predict class of test data set. It also perform well in multi class prediction

When assumption of independence holds, a Naive Bayes classifier performs better compare to other models like logistic regression and you need less training data.


---


. 

In [0]:
class NaiveBayes(object):

    '''
    Naive Bayes classifier - 
    Reference used: Lecture notes CT475 machine learning and data mining
    '''
    def train(self, train_data):
        '''
        input: train_data (data frame containing column Index and Label)
        This method is used to calculate the prior probabilites and also voabulary count of tweets for each sentiment
        '''
        
        self.data = train_data
        ironic_train, non_ironic_train = self.separate_tweets()
        tweet_string = self.data['Tweet text'].str.cat(sep=' ')
        tokenized_words = word_tokenize(tweet_string.lower())
        self.vocabulary_length = len(set(tokenized_words))
        self.ironic_prior_probability = (len(ironic_train)+1)/(len(ironic_train)+len(non_ironic_train)+2)
        self.non_ironic_prior_probability = (len(non_ironic_train)+1)/(len(ironic_train)+len(non_ironic_train)+2)
        self.ironic_frequency = self.bag_of_words(ironic_train)
        self.non_ironic_frequency = self.bag_of_words(non_ironic_train)
        self.ironic_vocab_count = self.ironic_frequency.values.sum()
        self.non_ironic_vocab_count = self.non_ironic_frequency.values.sum()
                
    def separate_tweets(self):
        ironic_tweets = self.data.loc[self.data['Label'] == 1]
        not_ironic_tweets = self.data.loc[self.data['Label'] == 0]
        return ironic_tweets,not_ironic_tweets

    def bag_of_words(self, data):
        '''
        input: data [same format as the train data]
        this method is used to compute the frequency distribution of the tokens  in the tweets
        '''
        vectorizer = CountVectorizer()
        X_array = vectorizer.fit_transform(data['Tweet text']).toarray()
        frequency_matrix = pd.DataFrame(X_array,columns = vectorizer.get_feature_names())
        return frequency_matrix
    
    def predict(self, test_data):
        '''
        input: test data (list of tweets [not tokenized] which have to be classified)
        This method is used to calculate the labels based on the training data that was provided to the model
        '''
        y = []
        for sentence in test_data:
            ironic_prob = self.ironic_prior_probability
            non_ironic_prob = self.non_ironic_prior_probability
            for word in word_tokenize(sentence):
                word_ironic_freq = self.ironic_frequency[word].sum() +1 if word in self.ironic_frequency else 1            
                word_non_ironic_freq = self.non_ironic_frequency[word].sum()+ 1 if word in self.non_ironic_frequency else 1
                word_ironic_prob = word_ironic_freq/(self.ironic_vocab_count + self.vocabulary_length)
                word_non_ironic_prob = word_non_ironic_freq/(self.non_ironic_vocab_count + self.vocabulary_length)
                ironic_prob = ironic_prob * word_ironic_prob
                non_ironic_prob = non_ironic_prob * word_non_ironic_prob
                ironic_prob = ironic_prob/(ironic_prob+ non_ironic_prob)
                non_ironic_prob = 1- ironic_prob
                prediction = 1 if (ironic_prob > non_ironic_prob) else 0
            y.append(prediction)
        return y

## Test-Train Split

There are 3834 tweets in the input file provided. Using the first 2500 tweets which is close to 65% of the given data as training data and the remaining data to be used as the hold out set or the test data. If there is more training data that is provided to the model ,the better results can be expected .However more training data could out also lead to overfitting. Looking at the first 2500 tweets in the sample the distribution of the positive and the negative labelled tweets seems to be nearly equal. (Distribution is not skewed)

In [0]:
#check the length of ironic tweeets in the train dataset
ironic_tweets = formatted_data[0:2500].loc[formatted_data['Label'] == 1]
not_ironic_tweets = formatted_data[0:2500].loc[formatted_data['Label'] == 0]

In [0]:
len(ironic_tweets)

1253

In [0]:
def task3():
  return all_data[0:2500],all_data[2500:]

## Evaluation Metric

### F1 score 
F1 score - F1 Score is the weighted average of Precision and Recall. Therefore, this score takes both false positives and false negatives into account. If you want to know if your predictions are good, you need these two measures. You can have a precision of 1 (so when you say it's positive, it's actutally positive) but still have a very low recall (you predicted 3 good positives but forgot 15 others). Or you can have a good recall and a bad precision.

F1 Score = 2**(Recall) *** Precision) / (Recall + Precision)

In [0]:
model = NaiveBayes()
model.train(formatted_data[0:2500])

In [0]:
test_data, test_labels = formatted_data[2500:]['Tweet text'], formatted_data[2500:]['Label']

In [0]:
y_pred = model.predict(test_data)
print(len(y_pred))

1334


In [0]:
from sklearn.metrics import f1_score
f1_score = f1_score(test_labels, y_pred, average='micro')
print(f1_score)

0.6221889055472264


In [0]:



from sklearn.metrics import accuracy_score
accuracy = accuracy_score(test_labels,y_pred)
print(accuracy)


0.6221889055472264


In [0]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(test_labels, y_pred,labels=[0, 1])
print(cm)

[[340 336]
 [168 490]]


# Task 4 (20 Marks)

Run the following code to generate a model from your training set. The training set should be in a variable  called `train` and is assumed to be of the form:

```
[(1, 1, ['sweet', 'united', 'nations', 'video', '.', 'just', 'in', 'time', 'for', 'christmas', '.', '#', 'imagine', '#', 'noreligion', 'http', ':', '//t.co/fej2v3oubr']), 
 (2, 1, ['@', 'mrdahl87', 'we', 'are', 'rumored', 'to', 'have', 'talked', 'to', 'erv', "'s", 'agent', '...', 'and', 'the', 'angels', 'asked', 'about', 'ed', 'escobar', '...', 'that', "'s", 'hardly', 'nothing', ';', ')']), 
 (3, 1, ['hey', 'there', '!', 'nice', 'to', 'see', 'you', 'minnesota/nd', 'winter', 'weather']), 
 (4, 0, ['3', 'episodes', 'left', 'i', "'m", 'dying', 'over', 'here']), 
 ...
]
 ```



In [0]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Embedding, Dropout, TimeDistributed
from keras.layers import LSTM, Bidirectional
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.regularizers import L1L2
import numpy as np

## These values should be set from Task 3
train, test = task3()

def make_dictionary(train, test):
    dictionary = {}
    for d in train+test:
        for w in d[2]:
            if w not in dictionary:
                dictionary[w] = len(dictionary)
    return dictionary

class KerasBatchGenerator(object):
    def __init__(self, data, num_steps, batch_size, vocabulary, skip_step=5):
        self.data = data
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.vocabulary = vocabulary
        self.current_idx = 0
        self.current_sent = 0
        self.skip_step = skip_step

    def generate(self):
        x = np.zeros((self.batch_size, self.num_steps))
        y = np.zeros((self.batch_size, self.num_steps, 2))
        while True:
            for i in range(self.batch_size):
                # Choose a sentence and position with at lest num_steps more words
                while self.current_idx + self.num_steps >= len(self.data[self.current_sent][2]):
                    self.current_idx = self.current_idx % len(self.data[self.current_sent][2])
                    self.current_sent += 1
                    if self.current_sent >= len(self.data):
                        self.current_sent = 0
                # The rows of x are set to values like [1,2,3,4,5]
                x[i, :] = [self.vocabulary[w] for w in self.data[self.current_sent][2][self.current_idx:self.current_idx + self.num_steps]]
                # The rows of y are set to values like [[1,0],[1,0],[1,0],[1,0],[1,0]]
                
                y[i, :, :] = [[self.data[self.current_sent][1], 1-self.data[self.current_sent][1]]] * self.num_steps
                self.current_idx += self.skip_step
            yield x, y

            


# Hyperparameters for model
vocabulary = make_dictionary(train, test)
num_steps = 5
batch_size = 25
num_epochs = 75 # Reduce this if the model is taking too long to train (or increase for performance)
hidden_size = 125 # Increase this to improve perfomance (or increase for performance)
use_dropout=True

# Create batches for RNN
train_data_generator = KerasBatchGenerator(train, num_steps, batch_size, vocabulary,
                                           skip_step=num_steps)
valid_data_generator = KerasBatchGenerator(test, num_steps, batch_size, vocabulary,
                                           skip_step=num_steps)

# A double stacked LSTM with dropout and n hidden layers
model = Sequential()
model.add(Embedding(len(vocabulary), hidden_size, input_length=num_steps))
model.add(LSTM(hidden_size, return_sequences=True))
model.add (LSTM (hidden_size , bias_regularizer=L1L2(l1=0.01, l2=0.01), return_sequences = True ))
#model.add(LSTM(hidden_size, return_sequences=True))
if use_dropout:
    model.add(Dropout(0.3))
model.add(TimeDistributed(Dense(2)))
model.add(Activation('softmax'))

# Set optimizer and build model
optimizer = Adam()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

# Train the model
model.fit_generator(train_data_generator.generate(), len(train)//(batch_size*num_steps), num_epochs,
                        validation_data=valid_data_generator.generate(),
                        validation_steps=len(test)//(batch_size*num_steps))

# Save the model
model.save("final_model.hdf5")

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75
Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75


Now consider the following code:

In [0]:
model = load_model("final_model.hdf5")

x = np.zeros((1,num_steps))
x[0,:] = [vocabulary["this"],vocabulary["is"],vocabulary["an"],vocabulary["easy"],vocabulary["test"]]
print((model.predict_proba(x)))
print((model.predict_classes(x)))


[[[0.47148433 0.52851564]
  [0.47734493 0.522655  ]
  [0.57648987 0.42351013]
  [0.9694578  0.03054217]
  [0.9968754  0.00312455]]]
[[1 1 0 0 0]]


## Prediction Method Used:
Since the model used in above step takes in num_steps (5) tokens as inputs at a time and gives the ironic probability of each token at a time, Each tweet is broken down into a 5 grams [5 word sequences] at a time and is fed to the model to get the probability of each word being ironic or not . This allows the word to be considered in different contexts to check whether the given tweet is ironic or not.

   Though the model during training ignores batches which are less than num_steps [last 2 tokens in a 12 word sentence], the method that i have used considers all the tokens in a given tweet, so that there is no context information that is lost , and most of the hashtags are present in a tweet at the end.


---


For Example:
['Tripped', 'over', 'my', 'own', 'feet', 'three', 'times', 'in', 'the', 'hall']

For the above tweet: The strategy that i have used to check whether the tweet is ironic or not is by considering 5 word sequences at a time and multiply the probability of each set to check the ironic probability.


---


[['Tripped', 'over', 'my', 'own', 'feet',] x  [ 'over', 'my', 'own', 'feet', ''three'' ]x  [ 'my',  'own',  'feet', 'three',' times'] x  ['own', 'feet', 'three', 'times', 'in'] x [''feet', 'three', 'times', 'in', 'the'] x [ 'three', 'times', 'in', 'the', 'hall']


---



In [0]:
def predict_keras(test_data):
  '''
  input: test_data [dataframe with tweets in the second index just like the train data]
  output: classification outputs in a list
  '''
  current_idx = 0
  predictions = []
  for i in range(len(test_data)):
    current_idx = 0
    num_tokens = len(test_data[i][2])
    #identify the nunmber of batches of num_steps can be formed
    batch_size = num_tokens - num_steps +1 if (num_tokens - num_steps >0) else 1
    #inistialize each batch with zeroes
    x = np.zeros((batch_size,num_steps))
    while True:
      #identify the number of steps to be considered based on the number of tokens present in the tweet
      step_size = num_steps if (num_tokens> num_steps) else num_tokens
      if (num_tokens> num_steps):
        x[current_idx, :] = [vocabulary[w] for w in test_data[i][2][current_idx:current_idx + num_steps]]
      else:
        x[current_idx, :num_tokens] = [vocabulary[w] for w in test_data[i][2][current_idx:current_idx + num_tokens]]        
      current_idx +=1
      if (current_idx==batch_size):
        break
    predict_sentence = model.predict_proba(x)
    # multiply the ironic probabilites for each set
    ironic_prob = np.prod([s[0] for c in predict_sentence for s in c])
    # multiply the non ironic probabilites for each set
    non_ironic_prob = np.prod([s[1] for c in predict_sentence for s in c])
    predictions.append(1 if ironic_prob > non_ironic_prob else 0)
  return predictions
  

In [0]:
predictions = predict_keras(test[:])

In [0]:
actual = [y[1] for y in test[:]]


In [0]:
from sklearn.metrics import f1_score,accuracy_score
f1score = f1_score(actual, predictions, average='micro')
print(f1score)


0.5622188905547226


In [0]:
accuracy = accuracy_score(actual,predictions)
print(accuracy)

0.5622188905547226


In [0]:
cm = confusion_matrix(actual, predictions)
print(cm)

[[525 151]
 [433 225]]


## Improving the above model

## Preprocessing Steps


1.   All the URL's (image links and other website links present in the tweets) which dont help in anyway in detecting irony in a tweet are removed
2.   The users which are tagges in a tweet are all replaced with @user in a tweet as the username does not help in any way in identifying whether a tweet is ironic or not.
3. The hashtags which i consider to be the main features in identifying whether a tweet is ironic or not, have been added back again to the tweet if they are in camel case format like #NewYearNewMe is split into #New #Year #New #Me, so that each token can be identified separately.



In [0]:
#load all the tweets preserving the case this time
formatted_data = load_tweets(preserve_case=True)

In [0]:
import re

# preprocess the tweets according to the rules mentioned above
def preprocess_sentences(sentences):
  processed_sentences = []
  hashtagCamelCase = re.compile(r'[A-Z]{2,}(?![a-z])|[A-Z][a-z]+')
  for line in sentences:
    #Replace URLs by <website>:
    line = re.sub(r"https?://.+","",line)
    #Replace usernames by <user>:
    line = re.sub(r"@[^\s]*","@user",line)
    hashtags =re.findall(r'#\w+', line)
    for hashtag in hashtags:      
      words = hashtagCamelCase.findall(hashtag)  
      for var in words:
        line += " #"+var
    processed_sentences.append(str.lower(line))
  return processed_sentences

In [0]:
from sklearn.model_selection import train_test_split

sentences = formatted_data['Tweet text'].values

In [0]:
sentences = preprocess_sentences(sentences=sentences)

In [0]:
# get all the labels from the tweet
y = formatted_data['Label'].values
#get the test and train features and labels 
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)

#Approach 1
##Word Embeddings
Word embeddings are in fact a class of techniques where individual words are represented as real-valued vectors in a predefined vector space. Each word is mapped to one vector and the vector values are learned in a way that resembles a neural network.


The model used in Task 4 had an embedding layer learning embeddings based on the vocabulary of the dataset. Since the vector space can not be restricted just based on the limited voabulary of the given dataset,  to improve the  performance of the model, an alternative  precomputed embedding space GloVe  is used.
###GloVe
GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.

## Bidirectional LSTM
Bidirectional LSTMs are an extension of traditional LSTMs that can improve model performance on sequence classification problems.
In problems where all timesteps of the input sequence are available, Bidirectional LSTMs train two instead of one LSTMs on the input sequence. The first on the input sequence as-is and the second on a reversed copy of the input sequence. This can provide additional context to the network and result in faster and even fuller learning on the problem.

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

maxlen = 35
hidden_size =100
#To tokenize the data into a format that can be used by the word embeddings.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences_train)
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)
# since each tweet text is not of the same length the remaining length is padded
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)


embedding_dim = 100
vocab_size = len(tokenizer.word_index) + 1

In [0]:
# download the glove embedding file and extract the files
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

In [0]:
import numpy as  np
'''
Reference: https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py
Only those words which are present in the vocabulary are checked
Each word in the file is followed by its vector as a stream of floats
'''

def get_weight_matrix(filepath, word_index):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    weights = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:
            values = line.split()
            word = values[0]
            if word in word_index:
                idx = word_index[word] 
                weights[idx] = np.array(
                    values[1:], dtype=np.float32)[:embedding_dim]

    return weights

In [0]:
embedding_dim = 100

glove_path = 'glove.6B.100d.txt'
weight_matrix = get_weight_matrix(glove_path,tokenizer.word_index)

In [0]:

hidden_size = 100
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, input_length=35,trainable=True, weights=[weight_matrix]))
model.add((LSTM(hidden_size, return_sequences=True, recurrent_dropout=0.2)))
model.add(Dropout(0.1))
model.add(Bidirectional(LSTM(hidden_size, return_sequences=True, recurrent_dropout=0.3)))
model.add(Dropout(0.2))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 35, 100)           833000    
_________________________________________________________________
lstm_5 (LSTM)                (None, 35, 100)           80400     
_________________________________________________________________
dropout_6 (Dropout)          (None, 35, 100)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 35, 200)           160800    
_________________________________________________________________
dropout_7 (Dropout)          (None, 35, 200)           0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 7000)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 10)                70010     
__________

In [0]:
model.fit(X_train, y_train,epochs=60,verbose=True,validation_data=(X_test, y_test),batch_size=30)

Train on 2875 samples, validate on 959 samples
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


<keras.callbacks.History at 0x7f9a90fdb7f0>

In [0]:
predicted = model.predict_classes(X_test)
predicted = predicted.reshape(len(predicted))

In [0]:
accuracy = accuracy_score(y_test, predicted)
print("Accuracy of the model is " +str(accuracy))
cm = confusion_matrix(y_test, predicted,labels=[0, 1])
print("Confusion Matrix")
print(cm)

Accuracy of the model is 0.5891553701772679
Confusion Matrix
[[294 174]
 [220 271]]


In [0]:
f1score = f1_score(y_test, predicted, average='micro')
print("F1 score  of the model is " +str(f1score))

F1 score  of the model is 0.5891553701772679


# Approach 2

##Tf-Idf
30% of the tokens present in the dataset are not present in the GLove because the way in which the tweets are normally present. With a lot of misspelt words and slang languages. Another approach to vectorize each sentence is by using Tf-Idf approach. Where Tf represents the term frequency of each token, and Idf stands for Inverse document Frequency which is used to penalize words which appear across multiple tweets [Documents] and helps very little in distinguishing one document from another.

Both Unigrams and Bigrams of words in the tweet are considered in training the model

In [0]:
tweet_tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True).tokenize
vectorizer = TfidfVectorizer(strip_accents="unicode", analyzer="word", tokenizer=tweet_tokenizer,ngram_range=(1,2))
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)
input_dim = X_train.shape[1]  

model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(Dropout(0.2))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train,epochs=60,verbose=True,validation_data=(X_test, y_test),batch_size=30)

loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
predicted = model.predict_classes(X_test)
predicted = predicted.reshape(len(predicted))
f1score = f1_score(y_test, predicted, average='micro')
print("F1 score  of the model is " +str(f1score))

Train on 2875 samples, validate on 959 samples
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60
Testing Accuracy:  0.6257
F1 score  of the model is 0.6256517205422315
