In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

### Reading the dataset

In [2]:
news_df = pd.read_csv("news.csv")

In [3]:
news_df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
news_df.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

### since no null values we can continue building our model

In [5]:
target = news_df['label']


In [6]:
target.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [7]:
news_df['label'].value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

## PreProcessing

In [9]:
news_df['text'] = news_df.title + " " + news_df.text
news_df.drop(columns=['Unnamed: 0', 'title'], axis=1, inplace=True)

In [10]:
news_df.head()

Unnamed: 0,text,label
0,You Can Smell Hillary’s Fear Daniel Greenfield...,FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,FAKE
2,Kerry to go to Paris in gesture of sympathy U....,REAL
3,Bernie supporters on Twitter erupt in anger ag...,FAKE
4,The Battle of New York: Why This Primary Matte...,REAL


In [None]:
import string
import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from tqdm import tqdm
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words("english"))


def process_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()

    # Normaliser les liens en les remplaçant par  'link'.
    text = re.sub('http\S+', 'link', text)

    # Normaliser les nombres en les remplaçant par la chaîne «nombre».
    text = re.sub('\d+', 'number', text)

    # Normaliser les e-mails en les remplaçant par le str 'email'.
    text = re.sub('\S+@\S+', 'email', text, flags=re.MULTILINE)
    
    # Remove punctuation.    
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove whitespaces.
    text = text.strip()
    
    # Convertissez toutes les lettres en minuscules.
    text = text.lower()
    
    stemmer = SnowballStemmer('english')
    
    # Divisez le texte en mots.
    words = text.split()
    
    # Remove stopwords.
    words = [w for w in words if w not in stopwords.words('english')]
    
    
    return ' '.join(words)

# Clean Comments

news_df["comment_clean"] = news_df["text"].map(
    lambda x: process_text(x) if isinstance(x, str) else x

)

Splitting the dataset into test train and using cross validation for train set so that the model will see the train set for 5 different times and learn more appropriately 

In [8]:
x_train,x_test,y_train,y_test=train_test_split(news_df['text'], target, test_size=0.3, random_state=143)

In [87]:
x_train.shape

(4434,)

Initializing models from sklearn

In [88]:
rf_model = RandomForestClassifier()
svc_model = SVC()

Initializinf tf-idf with stop words from the English language and a maximum document frequency of 0.75. And a TfidfVectorizer turns a collection of raw documents into a matrix of TF-IDF features.

In [89]:
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.75)
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

In [90]:
y_train[:5]

1742    REAL
604     REAL
4031    REAL
1733    REAL
5852    REAL
Name: label, dtype: object

Splitting the train set for cross validation

In [91]:
kf = KFold(n_splits= 7 , shuffle= True , random_state = 143)

In [92]:
cross_val = cross_val_score(rf_model,tfidf_train,y_train,cv=7 )

In [93]:
cross_val

array([0.90063091, 0.89116719, 0.90378549, 0.89415482, 0.89415482,
       0.90363349, 0.89099526])

In [94]:
cross_val_svm  = cross_val_score(svc_model,tfidf_train,y_train,cv=7 )

In [95]:
cross_val_svm

array([0.92586751, 0.91640379, 0.92586751, 0.91943128, 0.92733017,
       0.92417062, 0.92890995])

SVM seems to perform better than random forest, so lets fit the data with svm model

In [96]:
for train_index, test_index in kf.split(tfidf_train):
    #print('TRAIN:', train_index, 'TEST:', test_index)
    x_tra = tfidf_train[train_index]
    y_tra = y_train.iloc[train_index]
    x_tes = tfidf_train[test_index]
    y_tes = y_train.iloc[test_index]
    svc_model.fit(x_tra, y_tra)
    y_pre = svc_model.predict(x_tes)
    print(accuracy_score(y_pre,y_tes))

0.9148264984227129
0.916403785488959
0.9132492113564669
0.9368088467614534
0.9131121642969984
0.919431279620853
0.9178515007898894


In [97]:
y_pred = svc_model.predict(tfidf_test)

In [98]:
accuracy_score(y_pred, y_test)

0.9221462388216728

In [101]:
c_m = confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])
c_m

array([[902,  46],
       [102, 851]], dtype=int64)

here we can see the confusion matrix with 902 TP, 851 TN, 46 FP and 102 TN

In [100]:
# precision = percentage of postives predicted are actually positive
# recall = percentage of actual postivites predicted
precision = c_m[0,0] / (c_m[0,0] + c_m[1,0])
recall = c_m[0,0] / (c_m[0,0] + c_m[0,1] )
print("precision " , precision*100)
print("recall ", recall*100)

precision  89.8406374501992
recall  95.14767932489451


So by using this model we could able to predict which news is real with an accuracy of 92%  