In [1]:
import pandas as pd

df = pd.read_csv('Fake_Real_Data.csv')

#Taking smaller dataset for this prob
df1 = df.sample(2000)
df1.head(6)

Unnamed: 0,Text,label
2972,"BREAKING: Trump Goes Full Racist AGAIN, Pardo...",Fake
5382,WATCH: Joe Scarborough Fears Trump Will Murde...,Fake
3799,Senate Republican bill would slash Medicaid by...,Real
3178,REPORT: Trump Illegally Used Charity Money To...,Fake
1915,GOP Billionaire Issues HARSH Message To Congr...,Fake
3833,Trump Has Extremely Inappropriate Reaction To...,Fake


In [2]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df1['label_new'] = le.fit_transform(df1['label'])


In [3]:
df1.head()

Unnamed: 0,Text,label,label_new
2972,"BREAKING: Trump Goes Full Racist AGAIN, Pardo...",Fake,0
5382,WATCH: Joe Scarborough Fears Trump Will Murde...,Fake,0
3799,Senate Republican bill would slash Medicaid by...,Real,1
3178,REPORT: Trump Illegally Used Charity Money To...,Fake,0
1915,GOP Billionaire Issues HARSH Message To Congr...,Fake,0


In [12]:
df1['Text'][:2]

2972     BREAKING: Trump Goes Full Racist AGAIN, Pardo...
5382     WATCH: Joe Scarborough Fears Trump Will Murde...
Name: Text, dtype: object

In [4]:
!pip install -U spacy
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Convert text to preprocessed text

In [17]:
import spacy

# Load SpaCy model for preprocessing
nlp = spacy.load("en_core_web_sm")

#Function to tokenize text
def preprocess(text):
  doc = nlp(str(text))
  sent_vec = []
  for token in doc:
    if token.is_punct or token.like_num:
     continue
    sent_vec.append(token.lemma_)
  return sent_vec

In [18]:
df1['preprocessed_text'] = df1['Text'].apply(preprocess)
df1.head()

Unnamed: 0,Text,label,label_new,preprocessed_text
2972,"BREAKING: Trump Goes Full Racist AGAIN, Pardo...",Fake,0,"[ , breaking, trump, go, full, Racist, again, ..."
5382,WATCH: Joe Scarborough Fears Trump Will Murde...,Fake,0,"[ , WATCH, Joe, Scarborough, Fears, Trump, wil..."
3799,Senate Republican bill would slash Medicaid by...,Real,1,"[Senate, republican, bill, would, slash, Medic..."
3178,REPORT: Trump Illegally Used Charity Money To...,Fake,0,"[ , REPORT, Trump, illegally, use, Charity, mo..."
1915,GOP Billionaire Issues HARSH Message To Congr...,Fake,0,"[ , GOP, Billionaire, Issues, HARSH, Message, ..."


Train Word2vec model

In [19]:
# Train Word2Vec model on the processed text
import gensim
from gensim.models import Word2Vec

word2vec_model = Word2Vec(df1['preprocessed_text'], vector_size=100, window=5, min_count=1, workers=4)

Vectorize text using newly trained Word2Vec model

In [37]:
def convertToVector(token_list):
  vector = []
  for token in token_list:
    if token in word2vec_model.wv:
      vector.append(word2vec_model.wv[token])
  return sum(vector)/len(vector)

In [21]:
df1['text_vector'] = df1['preprocessed_text'].apply(convertToVector)
df1.head()

Unnamed: 0,Text,label,label_new,preprocessed_text,text_vector
2972,"BREAKING: Trump Goes Full Racist AGAIN, Pardo...",Fake,0,"[ , breaking, trump, go, full, Racist, again, ...","[-0.20526181, 0.05755636, 0.14729354, 0.292341..."
5382,WATCH: Joe Scarborough Fears Trump Will Murde...,Fake,0,"[ , WATCH, Joe, Scarborough, Fears, Trump, wil...","[-0.12167472, -0.0052469666, 0.14047386, 0.367..."
3799,Senate Republican bill would slash Medicaid by...,Real,1,"[Senate, republican, bill, would, slash, Medic...","[-0.422389, -0.049048804, 0.3707013, 0.2494568..."
3178,REPORT: Trump Illegally Used Charity Money To...,Fake,0,"[ , REPORT, Trump, illegally, use, Charity, mo...","[-0.3489501, 0.030130167, 0.25365508, 0.277469..."
1915,GOP Billionaire Issues HARSH Message To Congr...,Fake,0,"[ , GOP, Billionaire, Issues, HARSH, Message, ...","[-0.09433733, -0.013796017, 0.12235078, 0.3992..."


Model Training

In [24]:
from sklearn.model_selection import train_test_split


#Do the 'train-test' splitting with test size of 20% with random state of 2022 and stratify sampling too
X_train, X_test, y_train, y_test = train_test_split(
    df1.text_vector.values,
    df1.label_new,
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=df1.label_new
)

In [25]:
import numpy as np

#Reshape X Train and y train
print("Shape of X_train before reshaping: ", X_train.shape)
print("Shape of X_test before reshaping: ", X_test.shape)


X_train_2d = np.stack(X_train)
X_test_2d =  np.stack(X_test)

print("Shape of X_train after reshaping: ", X_train_2d.shape)
print("Shape of X_test after reshaping: ", X_test_2d.shape)

Shape of X_train before reshaping:  (1600,)
Shape of X_test before reshaping:  (400,)
Shape of X_train after reshaping:  (1600, 100)
Shape of X_test after reshaping:  (400, 100)


In [26]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

#1. creating a GradientBoosting model object
clf = GradientBoostingClassifier()

#2. fit with all_train_embeddings and y_train
clf.fit(X_train_2d, y_train)


#3. get the predictions for all_test_embeddings and store it in y_pred
y_pred = clf.predict(X_test_2d)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       200
           1       0.99      0.99      0.99       188
           2       1.00      1.00      1.00        12

    accuracy                           0.99       400
   macro avg       0.99      0.99      0.99       400
weighted avg       0.99      0.99      0.99       400



Test the model

In [38]:
test_news = [
    "Michigan governor denies misleading U.S. House on Flint water (Reuters) - Michigan Governor Rick Snyder denied Thursday that he had misled a U.S. House of Representatives committee last year over testimony on Flintâ€™s water crisis after lawmakers asked if his testimony had been contradicted by a witness in a court hearing. The House Oversight and Government Reform Committee wrote Snyder earlier Thursday asking him about published reports that one of his aides, Harvey Hollins, testified in a court hearing last week in Michigan that he had notified Snyder of an outbreak of Legionnairesâ€™ disease linked to the Flint water crisis in December 2015, rather than 2016 as Snyder had testified. â€œMy testimony was truthful and I stand by it,â€ Snyder told the committee in a letter, adding that his office has provided tens of thousands of pages of records to the committee and would continue to cooperate fully.  Last week, prosecutors in Michigan said Dr. Eden Wells, the stateâ€™s chief medical executive who already faced lesser charges, would become the sixth current or former official to face involuntary manslaughter charges in connection with the crisis. The charges stem from more than 80 cases of Legionnairesâ€™ disease and at least 12 deaths that were believed to be linked to the water in Flint after the city switched its source from Lake Huron to the Flint River in April 2014. Wells was among six current and former Michigan and Flint officials charged in June. The other five, including Michigan Health and Human Services Director Nick Lyon, were charged at the time with involuntary manslaughter",
    " WATCH: Fox News Host Loses Her Sh*t, Says Investigating Russia For Hacking Our Election Is Unpatriotic This woman is insane.In an incredibly disrespectful rant against President Obama and anyone else who supports investigating Russian interference in our election, Fox News host Jeanine Pirro said that anybody who is against Donald Trump is anti-American. Look, it s time to take sides,  she began.",
    " Sarah Palin Celebrates After White Man Who Pulled Gun On Black Protesters Goes Unpunished (VIDEO) Sarah Palin, one of the nigh-innumerable  deplorables  in Donald Trump s  basket,  almost outdid herself in terms of horribleness on Friday."
]
vec_test = []
for text in test_news:
  preprocessed_test = preprocess(text)
  print(preprocessed_test)
  vec_test.append(convertToVector(preprocessed_test))


clf.predict(vec_test)

['Michigan', 'governor', 'deny', 'mislead', 'U.S.', 'House', 'on', 'Flint', 'water', 'Reuters', 'Michigan', 'Governor', 'Rick', 'Snyder', 'deny', 'Thursday', 'that', 'he', 'have', 'mislead', 'a', 'U.S.', 'House', 'of', 'Representatives', 'committee', 'last', 'year', 'over', 'testimony', 'on', 'Flintâ€', '™', 's', 'water', 'crisis', 'after', 'lawmaker', 'ask', 'if', 'his', 'testimony', 'have', 'be', 'contradict', 'by', 'a', 'witness', 'in', 'a', 'court', 'hearing', 'the', 'House', 'Oversight', 'and', 'Government', 'Reform', 'Committee', 'write', 'Snyder', 'early', 'Thursday', 'ask', 'he', 'about', 'publish', 'report', 'that', 'of', 'his', 'aide', 'Harvey', 'Hollins', 'testify', 'in', 'a', 'court', 'hearing', 'last', 'week', 'in', 'Michigan', 'that', 'he', 'have', 'notify', 'Snyder', 'of', 'an', 'outbreak', 'of', 'Legionnairesâ€', '™', 'disease', 'link', 'to', 'the', 'Flint', 'water', 'crisis', 'in', 'December', 'rather', 'than', 'as', 'Snyder', 'have', 'testify', 'â€œMy', 'testimony', '

array([1, 0, 1])