**Machine learning method**

Import all necessary modules

In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,precision_score,recall_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import string as s
import re
import matplotlib.pyplot as plt
import os

Read data and get visual analysis of first few data

In [2]:
#Read the data
df=pd.read_csv('news.csv')
#Display first 10 data
df[0:10]

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
5,6903,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",FAKE
6,7341,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",FAKE
7,95,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,REAL
8,4869,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,REAL
9,2909,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,REAL


Get train and test sets

In [3]:
#Set x as the text of news and y as the label
x=df.text
y=df.label
x_train,x_test,y_train,y_test=train_test_split(x, y, test_size=0.2, random_state=7)

TfidfVectorizer

In [4]:
#Initialize TfidfVectorizer
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
#Fit then transform train set, transform test set to document-term matrix
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

Initialise PassiveAggressiveClassifier and evaluate 

In [5]:
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)

In [6]:
#Get various metric values
y_pred=pac.predict(tfidf_test)
accuracy=accuracy_score(y_test,y_pred)
precision=precision_score(y_test,y_pred,pos_label='REAL')
recall=recall_score(y_test, y_pred,pos_label='REAL')
print(f'Accuracy: {round(accuracy*100,2)}%')
print(f'Precision: {round(precision*100,2)}%')
print(f'recall: {round(recall*100,2)}%')

Accuracy: 92.98%
Precision: 92.45%
recall: 93.48%


In [7]:
#Confusion matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

array([[590,  48],
       [ 41, 588]], dtype=int64)

In [8]:
inp = "Nasa is installing internet on the moon"
inp = tfidf_vectorizer.transform([inp])
print(pac.predict(inp))

['FAKE']


In [9]:
inp = "Trump says an indictment would not end US presidential campaign"
inp = tfidf_vectorizer.transform([inp])
print(pac.predict(inp))

['REAL']


Use other classification algorithms (MultinomialNB)

In [10]:
mnb = MultinomialNB()
mnb.fit(tfidf_train,y_train)

In [11]:
#Get various metric values
y_pred=mnb.predict(tfidf_test)
accuracy=accuracy_score(y_test,y_pred)
precision=precision_score(y_test,y_pred,pos_label='REAL')
recall=recall_score(y_test, y_pred,pos_label='REAL')
print(f'Accuracy: {round(accuracy*100,2)}%')
print(f'Precision: {round(precision*100,2)}%')
print(f'recall: {round(recall*100,2)}%')

Accuracy: 84.06%
Precision: 76.59%
recall: 97.77%


In [12]:
#Confusion matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

array([[450, 188],
       [ 14, 615]], dtype=int64)

**Clickbait detection**

Read data and get visual analysis of first few data

In [13]:
cb=pd.read_csv('clickbait.csv')
cb[0:10]

Unnamed: 0,title,label
0,"15 Highly Important Questions About Adulthood,...",1
1,250 Nuns Just Cycled All The Way From Kathmand...,1
2,"Australian comedians ""could have been shot"" du...",0
3,Lycos launches screensaver to increase spammer...,0
4,Fußball-Bundesliga 2008–09: Goalkeeper Butt si...,0
5,"In Afghanistan, Soldiers Bridge 2 Stages of War",0
6,"After Fleeing North Korea, an Artist Parodies ...",0
7,Lessons (or Not) When a Start-Up Misses the Mark,0
8,Court Issues Order Against 3 Car-Warranty Call...,0
9,How Much Would Chris Traeger Like You Based On...,1


In [14]:
#Set x as the text of news and y as the label
x=cb.title
y=cb.label
x_train,x_test,y_train,y_test=train_test_split(x, y, test_size=0.2, random_state=7)

Count Vectorizer

In [15]:
cov=CountVectorizer(analyzer='word', ngram_range=(1,2),max_features=22500)
cov_train=cov.fit_transform(x_train)
cov_test=cov.transform(x_test)
train_arr=cov_train.toarray()
test_arr=cov_test.toarray()

Multinomial Naïve Bayes

In [16]:
mnb = MultinomialNB()
mnb.fit(train_arr,y_train)

In [17]:
y_pred=mnb.predict(test_arr)
accuracy=accuracy_score(y_test,y_pred)
precision=precision_score(y_test,y_pred,pos_label=1)
recall=recall_score(y_test, y_pred,pos_label=1)
print(f'Accuracy: {round(accuracy*100,2)}%')
print(f'Precision: {round(precision*100,2)}%')
print(f'recall: {round(recall*100,2)}%')

Accuracy: 97.33%
Precision: 96.61%
recall: 98.04%


In [18]:
#Confusion matrix
confusion_matrix(y_test,y_pred, labels=[0,1])

array([[3121,  109],
       [  62, 3106]], dtype=int64)

Passive Aggressive Classifier

In [19]:
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(train_arr,y_train)

In [20]:
#Get various metric values
y_pred=pac.predict(test_arr)
accuracy=accuracy_score(y_test,y_pred)
precision=precision_score(y_test,y_pred,pos_label=1)
recall=recall_score(y_test, y_pred,pos_label=1)
print(f'Accuracy: {round(accuracy*100,2)}%')
print(f'Precision: {round(precision*100,2)}%')
print(f'recall: {round(recall*100,2)}%')

Accuracy: 97.19%
Precision: 97.79%
recall: 96.5%


In [21]:
#Confusion matrix
confusion_matrix(y_test,y_pred, labels=[0,1])

array([[3161,   69],
       [ 111, 3057]], dtype=int64)

In [22]:
inp = "Top 15 things you need to know to be rich"
inp = cov.transform([inp])
print(mnb.predict(inp))

[1]


In [23]:
inp = "Asia China increases military spending in face of 'escalating' threats"
inp = cov.transform([inp])
print(mnb.predict(inp))

[0]


**Hate speech detection**

Read data and get visual analysis of first few data

In [24]:
hs_data= pd.read_csv('hatespeech.csv')
hs_data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [25]:
hs_data["labels"] = hs_data["class"]. map({0: "Hate Speech", 1: "Offensive Speech", 2: "No Hate and Offensive Speech"})
hs_data = hs_data[["tweet", "labels"]]
print(hs_data. head())

                                               tweet  \
0  !!! RT @mayasolovely: As a woman you shouldn't...   
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...   
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...   
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...   
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...   

                         labels  
0  No Hate and Offensive Speech  
1              Offensive Speech  
2              Offensive Speech  
3              Offensive Speech  
4              Offensive Speech  


In [26]:
x=hs_data.tweet
y=hs_data.labels
x_train,x_test,y_train,y_test=train_test_split(x, y, test_size=0.2, random_state=7)

Data processing steps

In [27]:
def tokenization(text):
    tokens=text.split()
    return tokens
x_train=x_train.apply(tokenization)
x_test=x_test.apply(tokenization)

In [28]:
def lowercasing(tokens):
    new_list=[]
    for i in tokens:
        i=i.lower()
        new_list.append(i)
    return new_list
x_train=x_train.apply(lowercasing)
x_test=x_test.apply(lowercasing)  

In [29]:
def remove_stopwords(words):
    stop_words=stopwords.words('english')
    new_list=[]
    for i in words:
        if i not in stop_words:
            new_list.append(i)
    return new_list

x_train=x_train.apply(remove_stopwords)
x_test=x_test.apply(remove_stopwords)

In [30]:
def remove_punctuations(words):
    new_list=[]
    for i in words:
        for j in s.punctuation:
            i=i.replace(j,'')
        new_list.append(i)
    return new_list
x_train=x_train.apply(remove_punctuations)
x_test=x_test.apply(remove_punctuations)  

In [31]:
def remove_numbers(words):
    no_numbers=[]
    new_list=[]
    for i in words:
        for j in s.digits:    
            i=i.replace(j,'')
        no_numbers.append(i)
    for i in no_numbers:
        if i!='':
            new_list.append(i)
    return new_list
x_train=x_train.apply(remove_numbers)
x_test=x_test.apply(remove_numbers)

In [32]:
def remove_spaces(words):
    new_list=[]
    for i in words:
        i=i.strip()
        new_list.append(i)
    return new_list
x_train=x_train.apply(remove_spaces)
x_test=x_test.apply(remove_spaces)

In [33]:
lemmatizer=nltk.stem.WordNetLemmatizer()
def lemmatization(words):
    new_list=[]
    for i in words:
        i=lemmatizer.lemmatize(i)
        new_list.append(i)
    return new_list
x_train=x_train.apply(lemmatization)
x_test=x_test.apply(lemmatization)

In [34]:
x_train=x_train.apply(lambda x: ''.join(i+' ' for i in x))
x_test=x_test.apply(lambda x: ''.join(i+' ' for i in x))


TfidfVectorizer

In [35]:
#Initialize TfidfVectorizer
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
#Fit then transform train set, transform test set to document-term matrix
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

Passive Aggressive Classifier

In [36]:
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)

In [37]:
y_pred=pac.predict(tfidf_test)
accuracy=accuracy_score(y_test,y_pred)
precision=precision_score(y_test,y_pred,average='weighted')
recall=recall_score(y_test, y_pred,average='weighted')
print(f'Accuracy: {round(accuracy*100,2)}%')
print(f'Precision: {round(precision*100,2)}%')
print(f'recall: {round(recall*100,2)}%')

Accuracy: 87.37%
Precision: 87.17%
recall: 87.37%


Multinomial Naïve Bayes

In [38]:
mnb = MultinomialNB()
mnb.fit(tfidf_train,y_train)

In [39]:
y_pred=mnb.predict(tfidf_test)
accuracy=accuracy_score(y_test,y_pred)
precision=precision_score(y_test,y_pred,average='weighted')
recall=recall_score(y_test, y_pred,average='weighted')
print(f'Accuracy: {round(accuracy*100,2)}%')
print(f'Precision: {round(precision*100,2)}%')
print(f'recall: {round(recall*100,2)}%')

Accuracy: 80.05%
Precision: 77.5%
recall: 80.05%


  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
inp = "You are a nice person"
inp = tfidf_vectorizer.transform([inp])
print(pac.predict(inp))

['No Hate and Offensive Speech']


In [41]:
inp = "You are a fat person"
inp = tfidf_vectorizer.transform([inp])
print(pac.predict(inp))

['Hate Speech']


In [42]:
inp = "You have a bad attitude"
inp = tfidf_vectorizer.transform([inp])
print(pac.predict(inp))

['Offensive Speech']
