# Sarcasm Detection Baseline with Naive Bayes Classification

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
vocab_file = 'glove_vocab.txt'

vocab = set()
with open(vocab_file) as f:
  for line in f:
    if line in vocab:
      print(line, 'is repeated')
    vocab.add(line.strip())

print(len(vocab))
## Add in UNK to the vocab
vocab.add('[UNK]')

## Check if contractions are in the vocab, if not add
contractions = ["n't", "'ll", "'re", "'s"]
for word in contractions:
  if word not in vocab:
    print('Adding', word)
    vocab.add(word)

400000


In [8]:
def normal_run(k, ngram, X_train, Y_train):
    '''
    Run a Multinomial NB classifier with Laplace smoothing, using normal frequency count.
    Parameters:
        - k (int): the k value for addk smoothing
        - ngram (tuple of int, int): length of n-gram word window
    '''
    token = RegexpTokenizer(r'[a-zA-Z0-9]+')
    cv = CountVectorizer(stop_words='english', ngram_range = ngram, tokenizer = token.tokenize, vocabulary=vocab)
    text_counts = cv.fit_transform(X_train)      
    X_train, X_test, Y_train, Y_test = train_test_split(text_counts, Y_train,test_size=0.1, random_state=5)
    MNB = MultinomialNB(alpha=k)
    MNB.fit(X_train, Y_train)
    score(MNB, X_test, Y_test)
    return MNB


In [9]:
def tfidf_run(k, ngram, X_train, Y_train):
    '''
    Run a Multinomial NB classifier with Laplace smoothing, using TF-IDF.
    Parameters:
        - k (int): the k value for addk smoothing
        - ngram (tuple of int, int): length of n-gram word window
    '''
    token = RegexpTokenizer(r'[a-zA-Z0-9]+')
    tfidf = TfidfVectorizer(stop_words='english', tokenizer = token.tokenize, vocabulary=vocab)
    text_counts = tfidf.fit_transform( X_train)     
    X_train, X_test, Y_train, Y_test = train_test_split(text_counts, Y_train,test_size=0.1, random_state=5)
    MNB = MultinomialNB(alpha=k)      #defining the model
    MNB.fit(X_train, Y_train)         #compilimg the model
    score(MNB, X_test, Y_test)                        # evaluate with accuracy and F-1 score
    return MNB

In [64]:
def score(model, X_test, Y_test):
    '''
    Report accuracy and f1-score.
    '''
    predicted = model.predict(X_test)
    print(predicted)
    accuracy_score = metrics.accuracy_score(predicted, Y_test)
    print(str('Accuracy: {:04.2f}'.format(accuracy_score)))
    precision_score = metrics.precision_score(predicted, Y_test)
    print(str('Precision: {:04.2f}'.format(precision_score)))
    recall_score = metrics.recall_score(predicted, Y_test)
    print(str('Recall: {:04.2f}'.format(recall_score)))
    f1_score = metrics.f1_score(predicted, Y_test)
    print(str('F-1 score: {:04.2f}'.format(f1_score)))
    

In [33]:
sarcasm_df = pd.read_json("sarcasm.json", lines=True)
sarcasm_df = sarcasm_df.drop(['article_link'], axis=1)
sarcasm_X = list(sarcasm_df.headline)
sarcasm_Y = (sarcasm_df.is_sarcastic)
sarcasm_test_sent = sarcasm_X[int(0.9*len(sarcasm_X)):]
sarcasm_X_train = sarcasm_X[:int(0.9*len(sarcasm_X))]
sarcasm_test_labels = sarcasm_Y[int(0.9*len(sarcasm_Y)):]
sarcasm_Y_train = sarcasm_Y[:int(0.9*len(sarcasm_X))]

In [42]:
# TF Baseline for Headlines: Hyperparameter tuning
m1 = normal_run(10, (3,5), sarcasm_X_train, sarcasm_Y_train)



Accuracy: 0.50
Precision: 0.00
Recall: 0.00
F-1 score: 0.00


  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
# Testing on Test set
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english', ngram_range = (5,5), tokenizer = token.tokenize, vocabulary=vocab)
text_counts = cv.fit_transform(sarcasm_test_sent)      
score(m1, text_counts, sarcasm_test_labels)

Accuracy: 0.53
Precision: 0.00
Recall: 0.00
F-1 score: 0.00


  _warn_prf(average, modifier, msg_start, len(result))


In [52]:
M2 = tfidf_run(10, (5,5),sarcasm_X_train, sarcasm_Y_train)



Accuracy: 0.76
Precision: 0.61
Recall: 0.87
F-1 score: 0.72


In [53]:
# Testing on Headline test set
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = TfidfVectorizer(stop_words='english', ngram_range = (5,5), tokenizer = token.tokenize, vocabulary=vocab)
text_counts = cv.fit_transform(sarcasm_test_sent)      
score(M2, text_counts, sarcasm_test_labels)



Accuracy: 0.76
Precision: 0.59
Recall: 0.86
F-1 score: 0.70


In [70]:
df = pd.read_csv('clean_reddit.csv')
reddit_sent = df.sent.values
reddit_label = df.label.values
df.info()
df.label.value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1331 entries, 0 to 1330
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   1331 non-null   int64 
 1   sent    1331 non-null   object
dtypes: int64(1), object(1)
memory usage: 20.9+ KB


1    680
0    651
Name: label, dtype: int64

In [66]:
# Testing TF on Reddit set
cv = CountVectorizer(stop_words='english', ngram_range = (1,5), tokenizer = token.tokenize, vocabulary=vocab, min_df=0.02)
text_counts = cv.fit_transform(reddit_sent)      
score(m1, text_counts, reddit_label)

# Testing IDF on Reddit set
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = TfidfVectorizer(stop_words='english', ngram_range = (5,5), tokenizer = token.tokenize, vocabulary=vocab)
text_counts = cv.fit_transform(reddit_sent)      
score(M2, text_counts, reddit_label)



[0 0 0 ... 0 0 0]
Accuracy: 0.49
Precision: 0.00
Recall: 0.00
F-1 score: 0.00


  _warn_prf(average, modifier, msg_start, len(result))


[0 0 0 ... 0 0 0]
Accuracy: 0.49
Precision: 0.00
Recall: 0.00
F-1 score: 0.00


  _warn_prf(average, modifier, msg_start, len(result))


In [69]:
df = pd.read_csv('clean_tweet.csv')
tweet_sent = df.sent.values
tweet_label = df.label.values
df.info()
df.label.value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1446 entries, 0 to 1445
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   1446 non-null   int64 
 1   sent    1446 non-null   object
dtypes: int64(1), object(1)
memory usage: 22.7+ KB


0    754
1    692
Name: label, dtype: int64

In [73]:
# Testing TF on twitter set
cv = CountVectorizer(stop_words='english', ngram_range = (2,5), tokenizer = token.tokenize, vocabulary=vocab)
text_counts = cv.fit_transform(tweet_sent)      
score(m1, text_counts, tweet_label)

# Testing IDF on twitter set
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = TfidfVectorizer(stop_words='english', ngram_range = (1,5), tokenizer = token.tokenize, vocabulary=vocab)
text_counts = cv.fit_transform(tweet_sent)      
score(M2, text_counts, tweet_label)

  _warn_prf(average, modifier, msg_start, len(result))


[0 0 0 ... 0 0 0]
Accuracy: 0.52
Precision: 0.00
Recall: 0.00
F-1 score: 0.00
[0 1 0 ... 1 0 0]
Accuracy: 0.55
Precision: 0.31
Recall: 0.55
F-1 score: 0.40




Now we have the training and testing data. We should start the analysis. Our analysis (as most of ML analysis) will be in 5 steps(a mneumonic to remember them is <b>DC-FEM</b> remember as DC Female or District of Columbia Fire and Emergency Medical service): 
<ol>
    <li>Defining the model</li>
    <li>Compiling the model</li>
    <li>Fitting the model</li>
    <li>Evaluating the model</li>
    <li>Making predictions with the model</li>
</ol>
 
### 1. Defining the model
We will use one of the __[Naive Bayes (NB)](https://scikit-learn.org/stable/modules/naive_bayes.html)__ classifier for defining the model. Specifically we will use __[MultinomialNB classifier](https://scikit-learn.org/stable/modules/naive_bayes.html)__. As a fresher to ML one can use cheat sheet given by sklearn __[here](https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html)__ to determine the best model to use for a particular problem. It tell us to use NB classifier. Let us take a detour to learn more about NB model. 
####  Naive Bayes Model
This model applies Bayes theorem with a Naive assumption of no relation between different features. According to Bayes theorem:<br>

Posterior = likelihood * proposition/evidedence or  P(A|B) = P(B|A) * P(A)/P(B)<br>
<b>For ex: In a deck of playing cards, a card is chosen. What is the probability of a card being queen given the card is a face card?</b><br>
This can be solved using Bayes theorem.<br>
P(Queen given Face card) = P(Queen|Face)<br> 
P(Face given Queen) = P(Face|Queen) = 1<br>
P(Queen) = 4/52 = 1/13
P(Face) = 3/13
From Bayes theore:<br>
P(Queen|Face) = P(Face|Queen) P(Queen)/P(Face) = 1/3<br>



For an input with several variables:<br>
P(y|x1, x2, ... xn) = P(x1, x2, ... xn|y)* P(y)/P(x1,x2, ...xn)<br>
with Naive Bayes we assume x1, x2 ... xn are independent of each other, i.e:<br>
P(x1, x2, ... xn|y) = P(x1|y) * P(x2|y) ... * P(xn|y)<br> 
The assumption in distribution of P(xi|y) give rise to different NBM. For example assuming Gaussian distribution will give rise to Gaussian Naive Bayes (GNB) or multinomial distribusion will give Multinomial Naive Bayes (MNB). 

Naive Bayes Model works particularly well with text classification and spam filtering. <b>Advantages</b> of working with NB algorithm are:
<ul>
    <li>Requires small amount of training data to learn the parameters</li>
    <li>Can be trained relatively fast compared to sophisticated models</li>
</ul>
Main <b>disadvantage</b> of NB Algorithm is:
<ul>
    <li>It's a decent classifier but a bad estimator</li>
    <li>It works well with discrete values but won't work with continuous values (can't be used in regression)</li>
</ul>

#### Dilemma of NB Algorithm
A challenging question which can be asked regarding NB algorithm is: although the condinal independence assumed in NB algorithm is hardly true in real life then howcome NB Algorithm work so well as classifier? 
I won't discuss the solution here, rather will direct you towards the resource which contains the solution (__[here](https://www.cs.unb.ca/~hzhang/publications/FLAIRS04ZhangH.pdf)__). In short the answer lies in distribution of dependencies rather than dependency, somehow due to distribution the effect of dependencies cancels out. 

#### Loss function for NB classification
NB classification uses a zero-one loss function. In this function error = number of incorrect classifications. Here accuracy of probability estimation is not taken into account by error function given that class with highest probability is predicted right. For example let's say there are two classes A and B, and different attributes (x1, x2, ... xn) are given. P(A|all atributes) = 0.95 and P(B|all atributes)=0.05 but NB might estimates P(A|all atributes) = 0.7 and P(B|all atributes) = 0.3. Here althogh estimates are far from accurate but classifiction is correct.

Let's move back to our analysis. The first two steps of defining and compiling the model are reduced to identifying and importing the model from sklearn (as sklearn gives as precompiled models).

### 2. Compiling the model
Since we are using sklearn's modules and classes we just need to import the precompiled classes. Sklearn gives the information of all the classes __[here](https://scikit-learn.org/stable/modules/classes.html)__.   


### 3. Fitting the model
In this step we generate our model fitting our dataset in the MultinomialNB. Inorder to look for the arguments which can be passed while fitting the model its advised to check the sklearn webpage of the module under use. For MNB it can be checked __[here](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB)__ 

### 4. Evaluating the model
Here we quantify the quality of our model. We use __[metrics](https://scikit-learn.org/stable/modules/model_evaluation.html#model-evaluation)__ module from sklearn library to evaluate the predictions. 

## Tweaking the model
We have observed the accuracy of our model is over 60%. We can now play with our model to increase its' accuracy.

#### Trying different n-grams

#### Trying different Naive Bayes Algorithms 

How about using several different algorithms all at once!

### Improving the accuracy
We have tried using different n-grams and different Naive Bayes models but maximum accuracy lingers arround 60%. In order to improve our model let's try to change the way the BOW is created. Currently we created BOW with CountVectorizer which counts the occurance of the word in the text. More number of time a word occurs it becomes more important for classification. 


### TF-IDF: Term Frequency-Inverse Document Frequency
Let's use TF-IDF here product of term frequency and inverse document frequency is used. Term frequency is how frequently a terms has appeared in a document. Let's say a term appears f times in a document with d words. <br>
Term Frequency = f/d <br>
IDF is inverse document frequency. If a corpus contains N documents and the term of our interest appears only in D documents then IDF is:<br>
IDF = log(N/D)
TF-IDF is product of Term Frequncy and Inverse Document Frequency. <b>TF-IDF shows the rarity of a word in the corpus.</b> If a word is rare then probably its a signature word for a particular sentiment/information.  
