In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import re #stands for "regular expression", useful for searching for text in a document
from nltk.corpus import stopwords # nltk= Natural Language(text we are going to deal with) ToolKit, corpus= body(important content) of the particular text.
# "stopwords" (imported on the line above) are very common words that add little meaning to the text (e.g. "the", "or", and grammatical articles in general)
from nltk.stem.porter import PorterStemmer #"stems" our word. Gives the root word for our word
from sklearn.feature_extraction.text import TfidfVectorizer #used to convert text to feature vectors


In [7]:
import nltk
# Fetch the NLTK 'stopwords' package so that stopwords.words(...) can be used in the cells below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Show NLTK's English stopword list; the stemming step later filters these words out to simplify the input.
englishStopwords = stopwords.words('english')
print(englishStopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
# Data collection and pre-processing: read the training set into a DataFrame.
trainingCsvPath = 'trainingData.csv'
newsData = pd.read_csv(trainingCsvPath)

In [12]:
# Number of missing values in each column (isna is the same check as isnull).
newsData.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [15]:
# Replace every missing value with an empty string ('').
newsData = newsData.fillna(value='')

         # important!!
# We will combine the title and author columns and feed that combined column to the model. We won't do this with the text column because it contains a LOT of data and would be computationally expensive, although it is possible.

In [17]:
# Build the new 'content' column: author and title joined by a single space.
authorColumn = newsData['author']
titleColumn = newsData['title']
newsData['content'] = authorColumn + ' ' + titleColumn

In [19]:
# Stemming:
# Stemming is the process of reducing a word to its root word.
# This is done to shrink the vocabulary as much as possible so that the model performs better.
# Example:
#     the root word for actor, actress and acting is "act"; the suffixes and prefixes are removed to get the root word

In [21]:
# Single PorterStemmer instance, reused by stemming() for every word.
portStem = PorterStemmer()

In [23]:
# creating a function for stemming
_stopWordCache = None  # English stopword set; built on the first stemming() call


def _englishStopWords():
    """Return NLTK's English stopwords as a frozenset, loading the list only once."""
    global _stopWordCache
    if _stopWordCache is None:
        _stopWordCache = frozenset(stopwords.words('english'))
    return _stopWordCache


def stemming(content):
    """Normalise a piece of text and reduce its words to their stems.

    Steps: every character that is not an ASCII letter is replaced by a space,
    the text is lower-cased and split on whitespace, English stopwords are
    dropped, and each remaining word is stemmed with the shared PorterStemmer.

    Parameters
    ----------
    content : str
        Raw text to process.

    Returns
    -------
    str
        The stemmed words joined by single spaces ('' when no word survives).
    """
    # Look the stopwords up once per call (cached across calls) instead of
    # re-loading the whole list for every single word; a set also makes the
    # membership test constant-time.
    stopWordSet = _englishStopWords()
    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(portStem.stem(word) for word in words if word not in stopWordSet)

In [25]:
 # Stem the content column. Time consuming to run, as stemming() processes every word of every row.
 newsData['content'] = newsData['content'].map(stemming)

In [26]:
# Preview the 'content' column after stemming.
stemmedPreview = newsData['content']
print(stemmedPreview)

0        darrel lucu hous dem aid even see comey letter...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exercis ...
20799                            david swanson keep f aliv
Name: content, Length: 20800, dtype: object


In [27]:
# Separating the data (features) from the label; .values exposes each column's underlying array.
X, y = newsData['content'].values, newsData['label'].values

In [28]:
# Converting textual data to numerical data.
# TF-IDF gives each word an "importance value" based on how often it occurs in a document,
# scaled down for words that occur in many documents, so a frequently repeated word with
# little meaning ends up with a low weight.
# NOTE(review): the vectorizer is fitted on the whole dataset before the train/test split
# below, so its vocabulary and IDF weights also see the test rows; fit on the training
# split only if a strictly leak-free evaluation is required.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X) # learns the vocabulary/IDF weights and builds the feature vectors in one pass (same result as fit() followed by transform())

# We don't need to do this with y as it is already numeric.

In [29]:
# Hold out 20% of the rows for testing; stratify=y keeps the label proportions the same in both splits.
# random_state pins the shuffle so the split (and every score computed from it) is reproducible across runs.
Xtrain, Xtest, yTrain, yTest = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [30]:
# Training via Logistic Regression; the estimator is created with scikit-learn's default settings.
model = LogisticRegression()


In [31]:
# Fit the classifier on the training split only.
model.fit(X=Xtrain, y=yTrain)

In [32]:
# Evaluation: accuracy on the data the model was trained on.
trainPred = model.predict(Xtrain)
trainScore = accuracy_score(yTrain, trainPred) # accuracy_score expects (y_true, y_pred), in that order
print(trainScore)

0.9865985576923076


In [33]:
# Accuracy on the held-out test split, which the model was not fitted on.
testPred = model.predict(Xtest)
testScore = accuracy_score(yTest, testPred) # accuracy_score expects (y_true, y_pred), in that order
print(testScore)

0.9776442307692308


In [37]:
# Building a predictive system
X_new = Xtest[0] # first row of the test split; test data is used because the model was not trained on it
prediction = model.predict(X_new)
print(prediction)

# Label 0 is reported as real news; any other label is reported as fake news.
verdict = "Real news" if prediction[0] == 0 else "Fake news"
print(verdict)

[0]
Real news
