In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import json

def parse_data(file):
    """Lazily parse a JSON Lines file, yielding one decoded object per line.

    Parameters
    ----------
    file : str or path-like
        Path to a text file holding one JSON document per line.

    Yields
    ------
    object
        The result of ``json.loads`` for each non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle once the generator is exhausted
    # (or closed early), instead of leaking it until garbage collection.
    with open(file, 'r', encoding='utf-8') as handle:
        for line in handle:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record and would make json.loads raise, so they are skipped.
            if line.strip():
                yield json.loads(line)

# Materialise every record from the JSON Lines dataset into a list so it can
# be sliced and indexed, then preview the first five entries.
data = [*parse_data('/content/Sarcasm_Headlines_Dataset_v2.json')]
data[:5]

[{'is_sarcastic': 1,
  'headline': 'thirtysomething scientists unveil doomsday clock of hair loss',
  'article_link': 'https://www.theonion.com/thirtysomething-scientists-unveil-doomsday-clock-of-hai-1819586205'},
 {'is_sarcastic': 0,
  'headline': 'dem rep. totally nails why congress is falling short on gender, racial equality',
  'article_link': 'https://www.huffingtonpost.com/entry/donna-edwards-inequality_us_57455f7fe4b055bb1170b207'},
 {'is_sarcastic': 0,
  'headline': 'eat your veggies: 9 deliciously different recipes',
  'article_link': 'https://www.huffingtonpost.com/entry/eat-your-veggies-9-delici_b_8899742.html'},
 {'is_sarcastic': 1,
  'headline': 'inclement weather prevents liar from getting to work',
  'article_link': 'https://local.theonion.com/inclement-weather-prevents-liar-from-getting-to-work-1819576031'},
 {'is_sarcastic': 1,
  'headline': "mother comes pretty close to using word 'streaming' correctly",
  'article_link': 'https://www.theonion.com/mother-comes-pretty-

In [3]:
# Only the first MAX_ARTICLES records are used: the scraping step issues one
# HTTP request per record, so fetching every linked article takes too long.
# Raise this limit for a fuller (slower) run.
MAX_ARTICLES = 2000
data = data[:MAX_ARTICLES]
len(data)

2000

In [4]:
import csv
import requests
from bs4 import BeautifulSoup

# Seconds to wait on a single server before giving up; without a timeout
# requests.get may block indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10

# article_texts[i] holds the body text for data[i], or the headline when the
# body could not be retrieved. failed_fetches records (url, error) pairs so
# the amount of fallback can be inspected after the run.
article_texts = []
failed_fetches = []
for row in data:
  headline = row['headline']
  url = row['article_link']

  # Default to the headline so exactly one entry is appended per record and
  # article_texts stays aligned with data whatever happens below.
  article_text = headline
  try:
    # Fetch the page; HTTP error statuses (4xx/5xx) are treated as failures
    # so that error pages are not mistaken for article content.
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    # Parse the HTML and look for the <article> tag holding the body text.
    soup = BeautifulSoup(response.content, 'html.parser')
    article = soup.find('article')
    if article:
      article_text = article.get_text()
  except Exception as exc:
    # Best-effort scrape: any fetch/parse problem falls back to the headline.
    # Catching Exception (rather than a bare except) keeps KeyboardInterrupt
    # and SystemExit working, so a long run can still be interrupted.
    failed_fetches.append((url, repr(exc)))
  article_texts.append(article_text)

In [5]:
# Spot-check the first five collected texts (article body or headline fallback).
article_texts[:5]

['thirtysomething scientists unveil doomsday clock of hair loss',
 'Rep. Donna Edwards (D-Md.) specifically criticized Democrats for not being inclusive of the voters who have traditionally been loyal to them.The Washington Post via Getty Images"We are neither post-racial nor post-gender," Rep. Donna Edwards (D-Md.) proclaims in a new essay taking on Congress for not addressing inequality. One of the major barriers is that America\'s elected officials don\'t represent the most disenfranchised populations, Edwards argues in the Cosmopolitan magazine piece, published on Tuesday.Advertisement\n\n"Can we pass equal pay laws and give women control of their own health-care decisions when women represent just 20 percent of Congress?" she says. Even when women get elected to office, men still hold the large majority of leadership positions, The New York Times pointed out last year.Edwards, who lost a Senate primary race to Rep. Chris Van Hollen (D-Md.), specifically criticized Democrats for no

In [6]:
# Store each collected text back on its record.
# NOTE(review): this overwrites the URL under "article_link" with the article
# text, so the original link is no longer available on `data` afterwards and
# re-running the scraping cell after this one would request text, not URLs.
for index, text in enumerate(article_texts):
  data[index]["article_link"] = text
data[:5]

[{'is_sarcastic': 1,
  'headline': 'thirtysomething scientists unveil doomsday clock of hair loss',
  'article_link': 'thirtysomething scientists unveil doomsday clock of hair loss'},
 {'is_sarcastic': 0,
  'headline': 'dem rep. totally nails why congress is falling short on gender, racial equality',
  'article_link': 'Rep. Donna Edwards (D-Md.) specifically criticized Democrats for not being inclusive of the voters who have traditionally been loyal to them.The Washington Post via Getty Images"We are neither post-racial nor post-gender," Rep. Donna Edwards (D-Md.) proclaims in a new essay taking on Congress for not addressing inequality. One of the major barriers is that America\'s elected officials don\'t represent the most disenfranchised populations, Edwards argues in the Cosmopolitan magazine piece, published on Tuesday.Advertisement\n\n"Can we pass equal pay laws and give women control of their own health-care decisions when women represent just 20 percent of Congress?" she says. 

In [7]:
# Pull the headline and label out of every record. article_texts already
# exists as a list with one entry per record (in the same order as `data`),
# so it is used directly.
headlines = [record["headline"] for record in data]
is_sarcastic = [record["is_sarcastic"] for record in data]

# One row per record: raw headline, article text (or headline fallback), label.
all_data = pd.DataFrame({"headlines": headlines,
                         "article_texts": article_texts,
                         "is_sarcastic": is_sarcastic})


In [8]:
# Preview the first rows of the assembled frame.
all_data.head()

Unnamed: 0,headlines,article_texts,is_sarcastic
0,thirtysomething scientists unveil doomsday clo...,thirtysomething scientists unveil doomsday clo...,1
1,dem rep. totally nails why congress is falling...,Rep. Donna Edwards (D-Md.) specifically critic...,0
2,eat your veggies: 9 deliciously different recipes,Vegetables don't have to be boring or relegate...,0
3,inclement weather prevents liar from getting t...,inclement weather prevents liar from getting t...,1
4,mother comes pretty close to using word 'strea...,mother comes pretty close to using word 'strea...,1


In [9]:
# Separate the text feature columns (X) from the target label (y).
X = all_data.drop(columns=["is_sarcastic"])
y = all_data["is_sarcastic"]

X.head()

Unnamed: 0,headlines,article_texts
0,thirtysomething scientists unveil doomsday clo...,thirtysomething scientists unveil doomsday clo...
1,dem rep. totally nails why congress is falling...,Rep. Donna Edwards (D-Md.) specifically critic...
2,eat your veggies: 9 deliciously different recipes,Vegetables don't have to be boring or relegate...
3,inclement weather prevents liar from getting t...,inclement weather prevents liar from getting t...
4,mother comes pretty close to using word 'strea...,mother comes pretty close to using word 'strea...


In [10]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Sizes of the four partitions, in the order they were returned.
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

(1600, 400, 1600, 400)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def _combined_text(frame):
    """Join each row's headline and article text into one document string."""
    return frame['headlines'] + ' ' + frame['article_texts']


# NOTE(review): in the all_data.head() preview the sarcastic rows carry only
# the headline as article text while the non-sarcastic rows carry a scraped
# body. If that pattern holds across the dataset, the models may be learning
# "scrape succeeded" rather than sarcasm — confirm before trusting the scores.

# Fit the TF-IDF vocabulary on the training documents only, then reuse it to
# transform the held-out documents.
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
X_train_vec = vectorizer.fit_transform(_combined_text(X_train))
X_test_vec = vectorizer.transform(_combined_text(X_test))

# Logistic regression baseline: fit on the training matrix.
model = LogisticRegression().fit(X_train_vec, y_train)

# Predict labels for the held-out documents.
y_pred = model.predict(X_test_vec)

# Report held-out accuracy as a percentage.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 95.50%


In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Multinomial Naive Bayes on the same TF-IDF features (fit returns the
# fitted estimator, so construction and training are chained).
nb_model = MultinomialNB().fit(X_train_vec, y_train)

# Predict labels for the held-out documents.
y_pred = nb_model.predict(X_test_vec)

# Held-out accuracy as a percentage.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

# Per-class precision / recall / F1 breakdown.
print(classification_report(y_test, y_pred))

Accuracy: 82.50%
              precision    recall  f1-score   support

           0       0.76      0.99      0.86       215
           1       0.98      0.63      0.77       185

    accuracy                           0.82       400
   macro avg       0.87      0.81      0.81       400
weighted avg       0.86      0.82      0.82       400



In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Random forest of 100 trees on the same TF-IDF features; the fixed seed
# keeps the fitted forest reproducible between runs.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_vec, y_train)

# Predict labels for the held-out documents.
y_pred = rf_model.predict(X_test_vec)

# Held-out accuracy as a percentage.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

# Per-class precision / recall / F1 breakdown.
print(classification_report(y_test, y_pred))

Accuracy: 96.25%
              precision    recall  f1-score   support

           0       1.00      0.93      0.96       215
           1       0.93      1.00      0.96       185

    accuracy                           0.96       400
   macro avg       0.96      0.97      0.96       400
weighted avg       0.97      0.96      0.96       400



In [14]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Linear-kernel support vector classifier (C=1) on the same TF-IDF features.
svm_model = SVC(kernel='linear', C=1, random_state=42).fit(X_train_vec, y_train)

# Predict labels for the held-out documents.
y_pred = svm_model.predict(X_test_vec)

# Held-out accuracy as a percentage.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

# Per-class precision / recall / F1 breakdown.
print(classification_report(y_test, y_pred))

Accuracy: 95.75%
              precision    recall  f1-score   support

           0       1.00      0.92      0.96       215
           1       0.92      1.00      0.96       185

    accuracy                           0.96       400
   macro avg       0.96      0.96      0.96       400
weighted avg       0.96      0.96      0.96       400

