In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
import re
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob

In [2]:
import nltk
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
df = pd.read_csv('Sarcasm__Dataset.csv')
df.describe()

Unnamed: 0,is_sarcastic
count,26709.0
mean,0.438953
std,0.496269
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [6]:
def preprocess_text(headline):
    # Tokenization
    tokens = nltk.word_tokenize(headline)


    # Noise removal
    #tokens = [re.sub('[^A-Za-z]+', '', token) for token in tokens]

    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Elongated and truncated word
    tokens = [re.sub(r'(.)\1+', r'\1\1', token) for token in tokens]

    # Contraction replace
    contractions = {
        "aren't": "are not",
        "can't": "cannot",
        "couldn't": "could not",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'll": "he will",
        "he's": "he is",
        "I'd": "I would",
        "I'll": "I will",
        "I'm": "I am",
        "I've": "I have",
        "isn't": "is not",
        "it's": "it is",
        "let's": "let us",
        "mustn't": "must not",
        "shan't": "shall not",
        "she'd": "she would",
        "she'll": "she will",
        "she's": "she is",
        "shouldn't": "should not",
        "that's": "that is",
        "there's": "there is",
        "they'd": "they would",
        "they'll": "they will",
        "they're": "they are",
        "they've": "they have",
        "we'd": "we would",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what'll": "what will",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "where's": "where is",
        "who'd": "who would",
        "who'll": "who will",
        "won't": "will not",
        "can't": "can not",
        "n't": " not",
        "'re": " are",
        "'s": " is",
        "'d": " would",
        "'ll": " will",
        "'t": " not",
        "'ve": " have",
        "'m": " am"
    }


    return ' '.join(tokens)

df['headline'] = df['headline'].apply(preprocess_text)

df['headline'].tail()
df['headline'].head()


0    former versac store clerk sue over secret 'bla...
1    the 'roseann ' reviv catch up to our thorni po...
2    mom start to fear son 's web seri closest thin...
3    boehner just want wife to listen , not come up...
4    j.k. rowl wish snape happi birthday in the mos...
Name: headline, dtype: object

In [7]:
!pip install afinn


Collecting afinn
  Downloading afinn-0.1.tar.gz (52 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/52.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m51.2/52.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.6/52.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: afinn
  Building wheel for afinn (setup.py) ... [?25l[?25hdone
  Created wheel for afinn: filename=afinn-0.1-py3-none-any.whl size=53429 sha256=2514b3c081bddd4ad8dd8d160d7f6a031375bc1e3a8ba2b24fb8b36981e85045
  Stored in directory: /root/.cache/pip/wheels/b0/05/90/43f79196199a138fb486902fceca30a2d1b5228e6d2db8eb90
Successfully built afinn
Installing collected packages: afinn
Successfully installed afinn-0.1


In [8]:
from afinn import Afinn

# Adding the positive and negative word count features
af = Afinn()

def count_pos_words(text):
    words = text.split()
    pos_count = sum([af.score(word) > 0 for word in words])
    return pos_count

def count_neg_words(text):
    words = text.split()
    neg_count = sum([af.score(word) < 0 for word in words])
    return neg_count

df['positive_word_count'] = df['headline'].apply(count_pos_words)
df['negative_word_count'] = df['headline'].apply(count_neg_words)

In [9]:
sid = SentimentIntensityAnalyzer()

text_column = df['headline']

def get_polarity_score(text):
    for text in text_column:
        tokens = nltk.word_tokenize(text)
        unigrams = list(nltk.ngrams(tokens, 1))
        bigrams = list(nltk.ngrams(tokens, 2))
        trigrams = list(nltk.ngrams(tokens, 3))
        ngrams = unigrams + bigrams + trigrams
        # Calculate the polarity score for each n-gram
        scores = []
        for ngram in ngrams:
            score = sid.polarity_scores(' '.join(ngram))
            scores.append(score['compound'])

        # Aggregate the polarity scores to obtain an overall score
        if len(scores) > 0:
            polarity_score = sum(scores) / len(scores)
        return polarity_score

df['polarity_score'] = df['headline'].apply(get_polarity_score)

In [21]:
# Compute the number of repeated ellipsis, question marks, and exclamation marks
df['ellipsis'] = df['headline'].apply(lambda x: len(re.findall('\.\.\.', x)))
df['question'] = df['headline'].apply(lambda x: len(re.findall('\?', x)))
df['exclamation'] = df['headline'].apply(lambda x: len(re.findall('!', x)))

# Compute the number of duplicated letters and vowel repeats
df['duplicated_letters'] = df['headline'].apply(lambda x: sum(1 for i in range(len(x)-1) if x[i]==x[i+1]))
df['vowel_repeats'] = df['headline'].apply(lambda x: sum(1 for i in range(len(x)-1) if x[i] in 'aeiou' and x[i]==x[i+1]))



In [22]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['headline'])
X = X.toarray() # convert sparse matrix to dense numpy array
X = np.hstack((X, df[['positive_word_count','negative_word_count', 'ellipsis' ,'polarity_score', 'duplicated_letters', 'vowel_repeats']].values)) # concatenate new columns

y = df['is_sarcastic']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [24]:
import numpy as np

# Check for negative values in the X_train_features array
if np.any(X_train < 0):
    print("There are negative values in the data.")

# Replace negative values with zero
X_train[X_train < 0] = 0


In [25]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

accuracy_percent = accuracy * 100
print("Accuracy: {:.2f}%".format(accuracy_percent))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 83.53%


In [29]:
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

precision_percent = precision * 100
print("Precision: {:.2f}%".format(precision_percent))

recall_percent = recall * 100
print("Recall: {:.2f}%".format(recall_percent))

Precision: 81.79%
Recall: 80.39%


In [27]:
from sklearn.metrics import confusion_matrix

confusion = confusion_matrix(y_test, y_pred)

TP = confusion[1, 1]  # True Positives
TN = confusion[0, 0]  # True Negatives
FP = confusion[0, 1]  # False Positives
FN = confusion[1, 0]  # False Negatives

print("True Positives: {}".format(TP))
print("True Negatives: {}".format(TN))
print("False Positives: {}".format(FP))
print("False Negatives: {}".format(FN))


True Positives: 1886
True Negatives: 2576
False Positives: 420
False Negatives: 460
