## 1. Importing necessary packages

In [1]:
import numpy as np 
import pandas as pd
import itertools    # memory-efficient tools for working with iterators

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

import pickle

## 2. Load the data into a DataFrame

In [3]:
# Read the news dataset from `news.csv` (resolved relative to the working directory) into a DataFrame
df = pd.read_csv('news.csv')
# Preview the first rows to check that the columns were parsed as expected
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(6335, 4)

In [7]:
# Summarize the column dtypes and non-null counts to check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6335 non-null   int64 
 1   title       6335 non-null   object
 2   text        6335 non-null   object
 3   label       6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [62]:
# Show the `text` value of the row labelled 10 (a single article, not the whole column).
# NOTE(review): the recorded execution count is later than the stemming cell and the
# saved output looks already stemmed, so this cell was presumably last run out of
# order; on a fresh top-to-bottom run it would display the raw article text.
df.text[10]

'cedar rapid iowa one wonder ralli entir career right bill clinton said way open crowd saturday night two day iowa caucus cedar rapid tri deliv old feel wife hillari clinton crowd one woman held sign said year men turn carri sign book other travel far missouri wait hour even fire marshal told room insid high school gymnasium restiv crowd chant slogan buzz anticip final bill hillari chelsea clinton appear stage hand hand hour behind schedul roar hillari clinton beam long slog iowa clinton campaign struggl mightili shake label support muster enthusiasm rival backer caucus near help former presid energi level event notabl dial charismat speaker said cigi ross general say bigger draw peopl monday night put campaign month work test campaign organ bring support candid energ voter clinton seem draw higher usual energi stood center deliv confid close statement need plan commit clinton said top voic yes thank clinton finish eight year later clinton iowa face could nail bite conclus hard fought 

## Applying a Stemmer to text data

In [12]:
# Initialize the Snowball stemmer for English.
# This module-level `stemmer` is reused by the `stemming` function defined later.
stemmer = SnowballStemmer("english")

# One-sentence sample used only to demonstrate the stemmer
text = ['Fake news is a type of yellow journalism or propaganda that consists of deliberate misinformation or hoaxes spread via traditional news media or online social media.']

# Tokenize on whitespace only; punctuation stays attached to the tokens
# (the last token is stemmed as 'media.' rather than 'media')
words = text[0].split()
stems = [stemmer.stem(word) for word in words]
print(stems)

['fake', 'news', 'is', 'a', 'type', 'of', 'yellow', 'journal', 'or', 'propaganda', 'that', 'consist', 'of', 'deliber', 'misinform', 'or', 'hoax', 'spread', 'via', 'tradit', 'news', 'media', 'or', 'onlin', 'social', 'media.']


In [14]:
# Text-cleaning helper used on the dataset and on new inputs at prediction time
def stemming(content):
    """Clean, tokenize, and stem a piece of text.

    Every character that is not an ASCII letter is replaced with a space, the
    text is lower-cased and split on whitespace, English stop words from the
    NLTK list are dropped, and each remaining token is reduced to its stem by
    the module-level ``stemmer``.

    Args:
        content: Raw text to normalize.

    Returns:
        The stemmed tokens joined by single spaces (an empty string when no
        token survives the filtering).
    """
    # Build the stop-word lookup once per call. Evaluating
    # stopwords.words('english') inside the comprehension would rebuild the
    # list and scan it linearly for every single token.
    stop_words = set(stopwords.words('english'))
    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)

In [16]:
# Sanity-check the cleaning function on a single raw sentence
stemming('Daniel Greenfield, a Shillman Journalism Fellow at the Freedom Center, is a New York writer focusing on radical Islam.')

'daniel greenfield shillman journal fellow freedom center new york writer focus radic islam'

In [18]:
# Apply the stemmer to every article in the `text` column.
# NOTE: this overwrites the raw `text` column in place, so after this cell the
# frame no longer holds the original article text; re-read the CSV to get it back.
df['text'] = df['text'].apply(stemming)

In [19]:
# Inspect the stemmed `text` column
df['text']

0       daniel greenfield shillman journal fellow free...
1       googl pinterest digg linkedin reddit stumbleup...
2       u secretari state john f kerri said monday sto...
3       kayde king kaydeek novemb lesson tonight dem l...
4       primari day new york front runner hillari clin...
                              ...                        
6330    state depart told republican nation committe c...
6331    p pbs stand plutocrat pentagon post oct wikime...
6332    anti trump protest tool oligarchi reform alway...
6333    addi ababa ethiopia presid obama conven meet l...
6334    jeb bush sudden attack trump matter jeb bush p...
Name: text, Length: 6335, dtype: object

### Get the labels from the DataFrame

In [22]:
# Get the target labels (the `label` column) as a Series
labels = df.label
# Preview the first few labels
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [230]:
# Optional numeric encoding of the labels, left disabled: the later cells work
# with the string labels 'FAKE' / 'REAL' directly.
# NOTE(review): any int64 `labels.info()` output recorded for this cell presumably
# comes from an earlier run with these lines enabled and is stale.
#labels = labels.map({'FAKE': 0, 'REAL': 1}) 
#labels.info()

<class 'pandas.core.series.Series'>
RangeIndex: 6335 entries, 0 to 6334
Series name: label
Non-Null Count  Dtype
--------------  -----
6335 non-null   int64
dtypes: int64(1)
memory usage: 49.6 KB


## 3. Split the dataset into training and testing sets

In [24]:
# Split the dataset: 20% of the rows are held out for testing, and the fixed
# random_state makes the split identical on every run
X_train, X_test, y_train, y_test = train_test_split(df['text'], labels, test_size=0.2, random_state=7)

In [234]:
# Join the tokens back into single strings -- left disabled on purpose:
# `stemming` already returns each document as one space-joined string, and
# ' '.join() applied to a string would insert a space between every character.
#X_train = [' '.join(doc) for doc in X_train]
#X_test = [' '.join(doc) for doc in X_test]

In [26]:
# Preview the training documents (the index holds the original row labels from `df`)
X_train

6237    head lead survivalist group made sever shock a...
3722    arnaldo rodger train educ psychologist work co...
5774    patti sanchez use eat calori day weigh kilogra...
336     benjamin netanyahu reelect regard apathi mani ...
3622    john kasich kill iowa voter banter larri coral...
                              ...                        
5699                                                     
2550    american elect wealthi presid polit expert say...
537     anyon write sentenc like nevertheless fuel per...
1220    cathol congress ever posit power howev pope fr...
4271    host cnn present bombast stupid almost condesc...
Name: text, Length: 5068, dtype: object

## 4. Feature Engineering on text data

In [28]:
# Initialize a TfidfVectorizer.
# stop_words='english' applies scikit-learn's built-in English stop-word list and
# max_df=0.7 ignores terms that occur in more than 70% of the documents.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

# Learn the vocabulary and IDF weights from the training set only, then reuse
# them to transform the test set so no test information leaks into the features
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

## 5. Model Training

In [30]:
# Initialize a PassiveAggressiveClassifier.
# The classifier shuffles the training data between epochs, so without a seed the
# fitted model (and every metric computed from it) changes from run to run.
# random_state pins that shuffling; the value matches the seed used for the split.
# Previously recorded scores came from an unseeded run and may differ slightly.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=7)
pac.fit(tfidf_train, y_train)

# Predict on the test set
y_pred = pac.predict(tfidf_test)

In [32]:
# Peek at the predicted labels for the test set
y_pred

array(['REAL', 'FAKE', 'REAL', ..., 'REAL', 'FAKE', 'REAL'], dtype='<U4')

## 6. Evaluate the Model

In [34]:
# Fraction of test documents whose predicted label matches the true label
score = accuracy_score(y_test, y_pred)
# Report the accuracy as a percentage rounded to two decimals
print(f"Accuracy score: {round(score*100, 2)}%")

Accuracy score: 92.34%


In [40]:
# Build a confusion matrix to evaluate the model.
# Rows are the true labels and columns the predicted labels, both in the
# order given by `labels` (FAKE first, then REAL).
# The result gets its own name so the imported `confusion_matrix` function is
# not overwritten by an array, which would make this cell raise a TypeError
# the second time it is run.
conf_matrix = confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL'])
conf_matrix

array([[585,  53],
       [ 44, 585]])

## Save the Models

In [42]:
# Persist the fitted vectorizer and classifier. The context managers guarantee
# that each file is flushed and closed as soon as it has been written, so the
# pickles are complete on disk before any later cell reads them back.
with open('vector.pkl', 'wb') as vector_file:
    pickle.dump(tfidf_vectorizer, vector_file)
with open('pac_model.pkl', 'wb') as model_file:
    pickle.dump(pac, model_file)

In [44]:
# Reload the saved classifier and vectorizer to check that the pickles work.
# Files are opened with context managers so the handles are closed right away.
# NOTE: pickle.load can execute arbitrary code -- only load files you created
# yourself or otherwise trust.
with open('pac_model.pkl', 'rb') as model_file:
    load_model = pickle.load(model_file)
with open('vector.pkl', 'rb') as vector_file:
    vector_load = pickle.load(vector_file)

In [46]:
def fake_news_detect(news):
    """Classify one news text with the reloaded vectorizer and model.

    The text goes through the same `stemming` clean-up as the training data,
    is turned into a one-row feature matrix by `vector_load`, and is then
    passed to `load_model`.

    Args:
        news: The news text to classify.

    Returns:
        The result of `load_model.predict` for the single input document.
    """
    cleaned = stemming(news)
    features = vector_load.transform([cleaned])
    return load_model.predict(features)

In [58]:
# Test the reloaded models on one sample document.
# NOTE(review): the sample below looks like already-stemmed text (it resembles an
# entry of the processed `text` column), so `fake_news_detect` applies `stemming`
# to it a second time -- confirm this is intended, or pass raw article text instead.
value = fake_news_detect("""kayde king kaydeek novemb lesson tonight dem loss time democrat start listen voter stop run establish candid peopl berni peopl berni novemb dem want tight race work berni walker bragman walkerbragman novemb new york time columnist paul krugman one hillari clinton outspoken surrog contenti democrat primari blame clinton poor perform green parti candid jill stein far receiv neglig number vote nation say stein ralph nader prevent clinton victori account berniesteach threw krugman analysi back face candid issu take respons https co khyouusrf teacher berni berniesteach novemb ana navarro republican recent endors hillari clinton sum preposter natur presidenti elect tweet gop nomin damn candid could lose hillari clinton democrat nomin damn candid could lose trump ana navarro ananavarro novemb popular left wing facebook page pro sander primari respond trump surg simpli post meme sander face text could avoid thank noth dnc meme share almost time less hour post tuesday novemb berni sander endors hillari clinton democrat nation convent juli mani support remain adam refus support dnc anoint candid point wikileak revel top offici dnc work behind scene tip scale clinton favor coordin media figur circul anti sander narrat rather attribut potenti trump presid gop nomine perceiv popular among voter close elect could credit hillari clinton unfavor rate accord realclearpolit anywher percent voter negat opinion democrat nomine pm eastern florida michigan pennsylvania wisconsin remain close call clinton elector vote trump zach cartwright activist author richmond virginia enjoy write polit govern media send email email protect""")
# Display the returned prediction array
value

array(['FAKE'], dtype='<U4')

In [60]:
# `value` is an array holding the prediction for the single input document.
# Compare its one entry explicitly: `value == 'FAKE'` produces a boolean array,
# and using an array as an `if` condition only works by accident for length 1.
if value[0] == 'FAKE':
    print('Fake news')
else:
    print('Real news')

Fake news
