**SARCASM DETECTION WITH 8 MACHINE LEARNING ALGORITHMS**

**What do we have to do?** \
    **We have a dataset of sarcastic headlines from two news websites.** \
    **Each record in the dataset consists of three attributes:** \
        - is_sarcastic: 1 if the headline is sarcastic, 0 otherwise\
        - headline: the headline of the news article\
        - article_link: link to the original news article
        
  **We need to find out which news items are sarcastic and which are not.**\
  **For this task we need to work with the following plan:**
  
    >>> Read and clean the data
    >>> Find and visualise most common words
    >>> Identify and visualise stop words
    >>> Preprocess text
    >>> Define classification models
    >>> Retrieve and save the best model

   

In [None]:
# Mount Google Drive (Colab only) so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

**Import the necessary libraries**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import nltk
nltk.download('stopwords')

**Reading and Cleaning the Data**

In [None]:
# Read the dataset from the mounted Drive; lines=True parses the file as one JSON record per line.
data = pd.read_json("/content/drive/MyDrive/Sarcasm_Headlines_Dataset.json", lines=True)

In [None]:
# Preview the first 100 rows of the loaded frame.
data.head(100)

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Total number of whitespace-separated tokens over all raw headlines.
# split() without an argument collapses repeated whitespace and returns [] for
# an empty string, so empty headlines contribute 0 words instead of 1.
data_len = data['headline'].apply(lambda x: len(x.split())).sum()
print(f'We have {data_len} words in the headline')

In [None]:
# List the column names of the loaded frame.
data.columns

In [None]:
# Show the dtype of every column.
data.dtypes

In [None]:
# Distinct values present in the 'is_sarcastic' target column.
data.is_sarcastic.unique()

In [None]:
# Silence FutureWarning messages for the rest of the notebook.
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

In [None]:
# Class balance of the 'is_sarcastic' target.
# The counts are printed explicitly because only the last expression of a cell
# is displayed, and the plot draws one bar per class from the column itself
# (passing value_counts() to countplot would count the counts, not the classes).
print(data['is_sarcastic'].value_counts())
sns.countplot(x='is_sarcastic', data=data)
plt.show()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Drop the 'article_link' column.
# errors='ignore' keeps the cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
data = data.drop(columns='article_link', errors='ignore')

In [None]:
# Check the first 10 rows after dropping the column.
data.head(10)

In [None]:
#import necessary library
import re
from nltk.corpus import stopwords

set_stopwords = set(stopwords.words("english"))


def clean_txt(text, stop_words=None):
    """Normalise one headline for bag-of-words modelling.

    Steps, in order:
      1. remove bracketed fragments such as "[video]";
      2. remove the contraction suffixes 's, 't, 're, 'd and 'll;
      3. replace every character that is not a letter, '!' or '?' with a space
         (this also removes digits, commas, parentheses and stray apostrophes);
      4. lower-case the text;
      5. drop the stand-alone tokens "aa" and "zz";
      6. drop stop words and collapse whitespace.

    The bracket and contraction rules run BEFORE the generic character filter;
    run after it they can never match, because the filter has already removed
    the apostrophes and brackets they look for. "aa"/"zz" are removed only as
    whole tokens so that words such as "pizza" or "jazz" are left intact.

    Parameters
    ----------
    text : str
        Raw headline.
    stop_words : collection of str, optional
        Words to discard. Defaults to the module-level ``set_stopwords``.

    Returns
    -------
    str
        Cleaned, lower-cased headline with single spaces between tokens
        (an empty string if nothing is left).
    """
    if stop_words is None:
        stop_words = set_stopwords

    text = re.sub(r"\[[^]]*\]", " ", text)           # bracketed fragments
    text = re.sub(r"'(?:s|t|re|d|ll)\b", "", text)   # contraction suffixes
    text = re.sub(r"[^A-Za-z!?]", " ", text)         # keep letters, '!' and '?'
    text = text.lower()
    text = re.sub(r"\b(?:aa|zz)\b", " ", text)       # stand-alone noise tokens only
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean every headline; this overwrites the raw 'headline' column in place.
data['headline'] = data['headline'].apply(clean_txt)

In [None]:
# Total number of tokens left after cleaning.
# split() without an argument returns [] for an empty string, so headlines that
# were cleaned down to nothing contribute 0 words instead of 1.
data_clean_len = data['headline'].apply(lambda x: len(x.split())).sum()
print(f'After text cleaning we have only {data_clean_len} words to work with')

*Most common words*

In [None]:
from collections import Counter #import Counter for finding most common words
import seaborn as sns #import seaborn for visualizing the result

text = data['headline']
# One row per token. explode() flattens the per-headline token lists, and
# dropna() removes the placeholder produced by headlines with no tokens, so
# there is no padding entry to strip from the counts afterwards.
words = text.str.split().explode().dropna()
result_count = Counter(words).most_common()
result_df = pd.DataFrame(result_count, columns=[0, 1])  # column 0 = word, column 1 = count
top_words = result_df.head(30)

# Horizontal bar chart of the 30 most frequent words.
sns.set_theme(style="whitegrid")
f, ax = plt.subplots(figsize=(15, 15))
sns.barplot(y=top_words[0], x=top_words[1], ax=ax)
plt.ylabel('Words', color="blue")  # y-axis: the words
plt.xlabel('Count', color="blue")  # x-axis: number of occurrences
plt.title("Frequent Occuring words in Headlines", color="blue")
plt.xticks(rotation=50);
ax.tick_params(axis='x', colors='black')
plt.show()

**Finding the most common words for each 'is_sarcastic' class**

In [None]:
# Word frequencies over the sarcastic headlines (is_sarcastic == 1).
# rename_axis/reset_index(name=0) pin the column names to 'index' (word) and
# 0 (count): pandas 2.x otherwise names the count column 'count', which breaks
# later cells that read sarcastic[0].
sarcastic = (
    data[data['is_sarcastic'] == 1]['headline']
    .str.split(expand=True)
    .unstack()
    .value_counts()
    .rename_axis('index')
    .reset_index(name=0)
)

In [None]:
# Word frequencies over the non-sarcastic headlines (is_sarcastic == 0).
# rename_axis/reset_index(name=0) pin the column names to 'index' (word) and
# 0 (count): pandas 2.x otherwise names the count column 'count', which breaks
# later cells that read non_sarcastic[0].
non_sarcastic = (
    data[data['is_sarcastic'] == 0]['headline']
    .str.split(expand=True)
    .unstack()
    .value_counts()
    .rename_axis('index')
    .reset_index(name=0)
)

In [None]:
# Horizontal bar chart of the 30 most frequent words in sarcastic headlines.
# Columns are read by position (0 = word, 1 = count) so the cell does not
# depend on how reset_index() named them, and no unrelated frame is passed
# as `data`.
top_sarcastic = sarcastic.head(30)
sns.set_theme(style="whitegrid")
f, ax = plt.subplots(figsize=(15, 10))
sns.barplot(y=top_sarcastic.iloc[:, 0], x=top_sarcastic.iloc[:, 1], ax=ax)
plt.ylabel('Words', color="blue")  # y-axis: the words
plt.xlabel('Count', color="blue")  # x-axis: number of occurrences
plt.title("Frequent Occuring Sarcastic Words in Headlines", color="blue")
plt.xticks(rotation=70);
plt.show()

In [None]:
# Horizontal bar chart of the 30 most frequent words in non-sarcastic headlines.
# Columns are read by position (0 = word, 1 = count) so the cell does not
# depend on how reset_index() named them, and no unrelated frame is passed
# as `data`.
top_non_sarcastic = non_sarcastic.head(30)
sns.set_theme(style="whitegrid")
f, ax = plt.subplots(figsize=(15, 10))
sns.barplot(y=top_non_sarcastic.iloc[:, 0], x=top_non_sarcastic.iloc[:, 1], ax=ax)
plt.ylabel('Words', color="blue")  # y-axis: the words
plt.xlabel('Count', color="blue")  # x-axis: number of occurrences
plt.title("Frequent Occuring Non_Sarcastic Words in Headlines", color="blue")
plt.xticks(rotation=70);
plt.show()

**WordCloud visualization with stop words removed**

In [None]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')  # note: rebinds `stopwords` from the corpus reader to a list of words

# Lower-case the distinct words of each class, then keep those that are not
# English stop words.
sarcasctic_2 = list(map(lambda every_word: every_word.lower(), sarcastic['index']))
sarc_nonstop = list(filter(lambda word: word not in stopwords, sarcasctic_2))

non_sarcasctic_2 = list(map(lambda every_word: every_word.lower(), non_sarcastic['index']))
non_sarc_nonstop = list(filter(lambda word: word not in stopwords, non_sarcasctic_2))


In [None]:
!pip install wordcloud

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Build word -> count for the sarcastic vocabulary, restricted to the
# non-stop-words selected earlier. Joining the distinct words into one string
# would give every word a frequency of 1, so the cloud could not reflect how
# often a word occurs; generate_from_frequencies() uses the real counts.
# Columns are read by position (0 = word, 1 = count).
sarc_keep = set(sarc_nonstop)
sarc_freq = {
    word.lower(): int(count)
    for word, count in zip(sarcastic.iloc[:, 0], sarcastic.iloc[:, 1])
    if word.lower() in sarc_keep
}

plt.figure(figsize=(15,10))
# No `stopwords` argument: stop words were already filtered out of sarc_freq.
wordcloud = WordCloud(width=1000, height=500,
                      max_words=300, min_font_size = 10,
                      background_color="black",
                      ).generate_from_frequencies(sarc_freq)

plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Wordcloud of Sarcactic Words', color="black")
plt.axis("off")
plt.show()

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Build word -> count for the non-sarcastic vocabulary, restricted to the
# non-stop-words selected earlier. Joining the distinct words into one string
# would give every word a frequency of 1, so the cloud could not reflect how
# often a word occurs; generate_from_frequencies() uses the real counts.
# Columns are read by position (0 = word, 1 = count).
non_sarc_keep = set(non_sarc_nonstop)
non_sarc_freq = {
    word.lower(): int(count)
    for word, count in zip(non_sarcastic.iloc[:, 0], non_sarcastic.iloc[:, 1])
    if word.lower() in non_sarc_keep
}

plt.figure(figsize=(15,10))
# No `stopwords` argument: stop words were already filtered out of non_sarc_freq.
wordcloud = WordCloud(width=1000, height=500,
                      max_words=300, min_font_size = 10,
                      background_color="black",
                      ).generate_from_frequencies(non_sarc_freq)

plt.imshow(wordcloud, interpolation='spline36')
plt.title('Wordcloud of Non_Sarcactic Words', color="black")
plt.axis("off")
plt.show()

***Text pre-processing***

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
# Keep the document-term matrix sparse: one column per vocabulary word makes
# the dense form (.toarray()) several gigabytes for a corpus of this size.
X_cv = cv.fit_transform(data['headline'])
# Select the target by name; a positional iloc[:, -1] depends on column order.
y = data['is_sarcastic'].values
# Sparse-backed frame, only to inspect the bag-of-words representation.
df = pd.DataFrame.sparse.from_spmatrix(X_cv, columns=cv.get_feature_names_out())
df.head()

**Split the text into train and test sets**

In [None]:
from sklearn.model_selection import train_test_split # import library for train_test_split
# Take the features straight from the cleaned frame rather than from `text`,
# a variable that only exists if the word-frequency plotting cell has been run.
X = data['headline']
y = data.is_sarcastic
# Hold out 10% of the headlines for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state = 0)

**Multinomial Naive Bayes Classifier**

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import f1_score, accuracy_score, classification_report


pipe1 = Pipeline([('vectr', CountVectorizer(analyzer='word', preprocessor=None, min_df=1,)),
                   ('MNB', MultinomialNB()),])

parameters1 = {'vectr__ngram_range': [(1,1),(1,2),(1,3)],
                    'MNB__alpha': [0.05,0.1,0.5,1.0,2.0,4.0],
                'MNB__fit_prior': [True]}
gridMNB = GridSearchCV(pipe1, parameters1 ,cv=7,n_jobs=-1, verbose=3)
gridMNB.fit(X_train, y_train)

%time y_pred1 = gridMNB.predict(X_test)

#getting the best accuracy and parameters
print('MNB_Train Accuracy : %.3f'%gridMNB.best_estimator_.score(X_train, y_train))
print('MNB_Test Accuracy : %.3f'%gridMNB.best_estimator_.score(X_test, y_test))
print('MNB_Best Accuracy Through Grid Search : %.3f'%gridMNB.best_score_)
print('MNB_Best Parameters : ',gridMNB.best_params_)
print(15*'--->--->')
print('classification_report: \n', classification_report(y_test, y_pred1))

**Stochastic Gradient Descent Classifier**

In [None]:
from sklearn.linear_model import SGDClassifier

pipe2 = Pipeline([('vectr', CountVectorizer(analyzer='word', preprocessor=None, min_df=1)),
                  ('tfidf', TfidfTransformer()),
                  ('SGD', SGDClassifier(loss='modified_huber', penalty='l2',alpha=0.0001,shuffle=True,
                                      learning_rate='optimal',random_state=None, max_iter=100,)),
                   ])
parameters2 = {'vectr__ngram_range': [(1,1),(1,2)],
                    'SGD__alpha': [0.0001,0.01,1,10],
                'SGD__max_iter': [1,10,100,1000],
                  'SGD__loss': ['modified_huber'],
                  'SGD__penalty': ['l2']}


gridSGD = GridSearchCV(pipe2, parameters2 ,cv=8, n_jobs=-1, verbose=3)
gridSGD.fit(X_train, y_train)

%time y_pred2 = gridSGD.predict(X_test)

print('SGD_Train Accuracy : %.3f'%gridSGD.best_estimator_.score(X_train, y_train))
print('SGD_Test Accuracy : %.3f'%gridSGD.best_estimator_.score(X_test, y_test))
print('SGD_Best Accuracy Through Grid Search : %.3f'%gridSGD.best_score_)
print('SGD_Best Parameters : ',gridSGD.best_params_)
print(15*'--->--->')
print('classification_report: \n',  classification_report(y_test, y_pred2))

**KNeighbors Classifier**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

pipe3 = Pipeline([('vectr', CountVectorizer(analyzer='word', preprocessor=None, min_df=1)),
                  ('tfidf', TfidfTransformer()),
                  ('KN', KNeighborsClassifier(n_neighbors=5, algorithm='brute', weights='uniform', metric='minkowski')),
                   ])


parameters3 = {'vectr__ngram_range': [(1,1),(1,2)],
                    'KN__n_neighbors': [15,30,45,60,75,90],
                    'KN__p': [2],
                    'KN__leaf_size': [10,20]
              }


gridKN = GridSearchCV(pipe3, parameters3, scoring='accuracy',cv=6, n_jobs=-1, verbose=3)
gridKN.fit(X_train, y_train)

%time y_pred3 = gridKN.predict(X_test)
print('KN_Train Accuracy : %.3f'%gridKN.best_estimator_.score(X_train, y_train))
print('KN_Test Accuracy : %.3f'%gridKN.best_estimator_.score(X_test, y_test))
print('KN_Best Accuracy Through Grid Search : %.3f'%gridKN.best_score_)
print('KN_Best Parameters : ',gridKN.best_params_)
print(15*'--->--->')
print('classification_report: \n',  classification_report(y_test, y_pred3))


**Logistic Regression Classifier**

In [None]:
from sklearn.linear_model import LogisticRegression

pipe4 = Pipeline([('vectr', CountVectorizer(analyzer='word', preprocessor=None, min_df=1)),
                ('tfidf', TfidfTransformer()),
                ('LR', LogisticRegression(penalty='l2',C=1.0,random_state=None,
                                          solver='liblinear',intercept_scaling=1, max_iter=100,)),
               ])

parameters4 = {'vectr__ngram_range': [(1,1),(1,2)],
                    'LR__C': [3,4,5,6,7],
                    'LR__tol': [0.0001,0.01,0.1],
                  'LR__max_iter': [50,75,100]
              }

gridLR = GridSearchCV(pipe4, parameters4 ,cv=4, verbose=1, n_jobs=-1)
gridLR.fit(X_train, y_train)

%time y_pred4 = gridLR.predict(X_test)
print('LR_Train Accuracy : %.3f'%gridLR.best_estimator_.score(X_train, y_train))
print('LR_Test Accuracy : %.3f'%gridLR.best_estimator_.score(X_test, y_test))
print('LR_Best Accuracy Through Grid Search : %.3f'%gridLR.best_score_)
print('LR_Best Parameters : ',gridLR.best_params_)
print(15*'--->--->')
print('classification_report: \n',  classification_report(y_test, y_pred4))

**Decision Tree Classifier**

In [None]:
from sklearn.tree import DecisionTreeClassifier

pipe5 = Pipeline([('vectr', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('DTree', DecisionTreeClassifier(criterion='gini', splitter='best',random_state=0 ,min_samples_leaf=1)),
               ])

parameters5 = {'vectr__ngram_range': [(1,1),(1,2)],
               'DTree__criterion' : ['gini'],
               'DTree__max_features': ['auto'],
               'DTree__max_depth': [2, 3, 5, 10, 15],
               'DTree__min_samples_split': [2, 3, 5, 7, 9],
               'DTree__min_samples_leaf': [1,5,8,11],
              }

gridDtree = GridSearchCV(pipe5, parameters5 ,cv=5)
gridDtree.fit(X_train, y_train)

%time y_pred5 = gridDtree.predict(X_test)
print('Dtree_Train Accuracy : %.3f'%gridDtree.best_estimator_.score(X_train, y_train))
print('Dtree_Test Accuracy : %.3f'%gridDtree.best_estimator_.score(X_test, y_test))
print('Dtree_Best Accuracy Through Grid Search : %.3f'%gridDtree.best_score_)
print('Dtree_Best Parameters : ',gridDtree.best_params_)
print(15*'--->--->')
print('classification_report: \n',  classification_report(y_test, y_pred5))

**Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

pipe6 = Pipeline([('vectr', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('RF', RandomForestClassifier(n_estimators=100,criterion='gini', max_features='auto',
                                               random_state=None ,min_samples_leaf=1)),
               ])

parameters6 = {'vectr__ngram_range': [(1,1),(1,2)],
               'RF__n_estimators': [100,200],
                'RF__criterion': ['gini'],
               'RF__min_samples_split': [2],
               'RF__min_samples_leaf': [1],
              }

gridRF = GridSearchCV(pipe6, parameters6 ,cv=8, verbose=1, n_jobs=-1)
gridRF.fit(X_train, y_train)

%time y_pred6 = gridRF.predict(X_test)
print('RF_Train Accuracy : %.3f'%gridRF.best_estimator_.score(X_train, y_train))
print('RF_Test Accuracy : %.3f'%gridRF.best_estimator_.score(X_test, y_test))
print('RF_Best Accuracy Through Grid Search : %.3f'%gridRF.best_score_)
print('RF_Best Parameters : ',gridRF.best_params_)
print(15*'--->--->')
print('classification_report: \n',  classification_report(y_test, y_pred6))

**Support Vector Classification**

In [None]:
from sklearn.svm import SVC

pipe7 = Pipeline([('vectr', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('SVC', SVC(C=1.0, kernel='rbf', max_iter=-1,
                                random_state=None ,cache_size=200)),
               ])

parameters7 = {'vectr__ngram_range': [(1,1),(1,2)],
               'SVC__C': [0.1,1,10],
                'SVC__kernel': ['rbf'],
              'SVC__degree': [1,2,3],
               'SVC__cache_size': [50,100,200]
              }

gridSVC = GridSearchCV(pipe7, parameters7 ,cv=3, verbose=3, n_jobs=-1)
gridSVC.fit(X_train, y_train)

%time y_pred7 = gridSVC.predict(X_test)
print('SVC_Train Accuracy : %.3f'%gridSVC.best_estimator_.score(X_train, y_train))
print('SVC_Test Accuracy : %.3f'%gridSVC.best_estimator_.score(X_test, y_test))
print('SVC_Best Accuracy Through Grid Search : %.3f'%gridSVC.best_score_)
print('SVC_Best Parameters : ',gridSVC.best_params_)
print(15*'--->--->')
print('classification_report: \n',  classification_report(y_test, y_pred7))

**Gradient Boosting Classifier**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier


pipe8 = Pipeline([('vectr', CountVectorizer(analyzer='word', preprocessor=None, min_df=1)),
                  ('tfidf', TfidfTransformer()),
                  ('BST', GradientBoostingClassifier(n_estimators=50, learning_rate=1.0, random_state=0)),
                   ])


parameters8 = {'vectr__ngram_range': [(1,1),(1,2)],
                    'BST__n_estimators': [50,100,200],
                    'BST__max_depth': [3,4,5],
                    'BST__learning_rate': [0.05,0.01,0.5,1.0]
              }


gridBoost = GridSearchCV(pipe8, parameters8, cv=3, verbose=3, n_jobs=-1)
gridBoost.fit(X_train, y_train)

%time y_pred8 = gridBoost.predict(X_test)
print('Boost_Train Accuracy : %.3f'%gridBoost.best_estimator_.score(X_train, y_train))
print('Boost_Test Accuracy : %.3f'%gridBoost.best_estimator_.score(X_test, y_test))
print('Boost_Best Accuracy Through Grid Search : %.3f'%gridBoost.best_score_)
print('Boost_Best Parameters : ',gridBoost.best_params_)
print(15*'--->--->')
print('classification_report: \n',  classification_report(y_test, y_pred7))

In [None]:
from sklearn import  metrics # import library for getting scores

# Best refitted pipeline of every grid search, in a fixed order that the
# score tables built afterwards rely on.
models = [
    ('MNB', gridMNB.best_estimator_),
    ('SGD', gridSGD.best_estimator_),
    ('KN', gridKN.best_estimator_),
    ('LR', gridLR.best_estimator_),
    ('Dtree', gridDtree.best_estimator_),
    ('RF', gridRF.best_estimator_),
    ('SVC', gridSVC.best_estimator_),
    ('BST', gridBoost.best_estimator_),
]

# Per-model test-set scores, one entry per model in the order above.
precision_score = []
recall_score = []
f1_score = []
accuracy_score = []
roc_auc_score = []  # kept for compatibility; never filled in this cell

for name, model in models:
    # Predict once per model and reuse the result for every metric; repeating
    # predict() for each metric is expensive for the KNN and SVC pipelines.
    y_hat = model.predict(X_test)
    precision = metrics.precision_score(y_test, y_hat, average='weighted')
    recall = metrics.recall_score(y_test, y_hat, average='weighted')
    f1 = metrics.f1_score(y_test, y_hat, average='weighted')
    accuracy = metrics.accuracy_score(y_test, y_hat)

    print(name)
    print("precision_score: {}".format(precision))
    print("recall_score: {}".format(recall))
    print("f1_score: {}".format(f1))
    print("accuracy_score: {}".format(accuracy))

    precision_score.append(precision)
    recall_score.append(recall)
    f1_score.append(f1)
    accuracy_score.append(accuracy)


In [None]:
#create the DataFrame of scores for all models
import pandas as pd
scores = {'precision_score': precision_score,
     'recall_score': recall_score,
     'f1_score': f1_score,
     'accuracy_score' : accuracy_score
    }
df = pd.DataFrame(data=scores)
# The labels must follow the order in which the scores were collected
# (MNB, SGD, KN, LR, Dtree, RF, SVC, BST): the decision tree comes before the
# random forest, otherwise their scores are shown under each other's name.
df.insert(loc=0, column='Model', value=['MNB','SGD','KNN','LogReg','DTree','RandFor','SVC', 'BST'])
df.sort_values('accuracy_score',ascending=False)

In [None]:
#create the DataFrame of accuracy scores
acc = {
     'accuracy_score' : accuracy_score
    }
df = pd.DataFrame(data=acc)
# The labels must follow the order in which the scores were collected
# (MNB, SGD, KN, LR, Dtree, RF, SVC, BST): the decision tree comes before the
# random forest, otherwise their scores are shown under each other's name.
df.insert(loc=0, column='Model', value=['MNB','SGD','KNN','LogReg','DTree','RandFor','SVC','BST'])
#df

In [None]:
# Bar chart of test accuracy, one bar per model.
plt.figure(figsize=(16, 10))
sns.barplot(data=df, x='Model', y='accuracy_score')
plt.show()

In [None]:
# Confusion matrix for the MultinomialNB predictions (y_pred1).
# NOTE(review): MultinomialNB is hard-coded here as the best model - confirm
# against the score table above before relying on this plot.
from sklearn.metrics import confusion_matrix,classification_report
# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. Swapping the arguments transposes the matrix.
conf_mtx=confusion_matrix(y_test,y_pred1)

plt.figure(figsize=(8,6))
sns.heatmap(conf_mtx,annot=True,fmt='d',cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('Actual label')
plt.title("0 - Non_sarcastic     1 - Sarcastic")
plt.show()