In [1]:
import pandas as pd
import numpy as np
import spacy
import nltk
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from sklearn.linear_model import LogisticRegression
from sklearn.metrics._classification import classification_report
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer 
import re
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Load the labelled news articles (columns: Text, label) from the local CSV.
# The location is an absolute, machine-specific Windows path.
csv_path = "D:\\NLP projects\\fake_and_real_news.csv"
df = pd.read_csv(csv_path)

In [3]:
df

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real
...,...,...
9895,Wikileaks Admits To Screwing Up IMMENSELY Wit...,Fake
9896,Trump consults Republican senators on Fed chie...,Real
9897,Trump lawyers say judge lacks jurisdiction for...,Real
9898,WATCH: Right-Wing Pastor Falsely Credits Trum...,Fake


In [4]:
df.isnull().sum()

Text     0
label    0
dtype: int64

In [5]:
# Binary target: 1 where the label is exactly 'Fake', 0 for every other value.
df['numeric_label'] = (df['label'] == 'Fake').astype('int64')

In [6]:
df

Unnamed: 0,Text,label,numeric_label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,1
1,U.S. conservative leader optimistic of common ...,Real,0
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,0
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,1
4,Democrats say Trump agrees to work on immigrat...,Real,0
...,...,...,...
9895,Wikileaks Admits To Screwing Up IMMENSELY Wit...,Fake,1
9896,Trump consults Republican senators on Fed chie...,Real,0
9897,Trump lawyers say judge lacks jurisdiction for...,Real,0
9898,WATCH: Right-Wing Pastor Falsely Credits Trum...,Fake,1


In [7]:
df['Text'][0]

' Top Trump Surrogate BRUTALLY Stabs Him In The Back: ‘He’s Pathetic’ (VIDEO) It s looking as though Republican presidential candidate Donald Trump is losing support even from within his own ranks. You know things are getting bad when even your top surrogates start turning against you, which is exactly what just happened on Fox News when Newt Gingrich called Trump  pathetic. Gingrich knows that Trump needs to keep his focus on Hillary Clinton if he even remotely wants to have a chance at defeating her. However, Trump has hurt feelings because many Republicans don t support his sexual assault against women have turned against him, including House Speaker Paul Ryan (R-WI). So, that has made Trump lash out as his own party.Gingrich said on Fox News: Look, first of all, let me just say about Trump, who I admire and I ve tried to help as much as I can. There s a big Trump and a little Trump. The little Trump is frankly pathetic. I mean, he s mad over not getting a phone call? Trump s referr

In [8]:
df['label'].value_counts()

label
Fake    5000
Real    4900
Name: count, dtype: int64

In [9]:
# Load spaCy's large English pipeline; preprocessing() uses it to tokenise text
# and to read the is_stop / is_punct flags of each token.
# NOTE(review): only those token-level flags are read in this notebook, so the
# heavier pipeline components (e.g. parser, NER) are presumably unnecessary and
# could be disabled to speed up the full-corpus pass - TODO confirm before changing.
nlp = spacy.load('en_core_web_lg')

In [10]:
def preprocessing(text):
    """Normalise one news article into lower-case, letters-only text.

    The text is tokenised with the module-level spaCy pipeline ``nlp``.
    Stop-word and punctuation tokens are dropped, the remaining tokens are
    joined with single spaces and lower-cased, and every run of characters
    other than ASCII letters is collapsed into a single space.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The cleaned text, with no leading or trailing whitespace.
    """
    doc = nlp(text)
    kept_tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]
    lowered = ' '.join(kept_tokens).lower()
    # Strip only after the substitution: a kept token made purely of non-letters
    # (e.g. the number "50") at either end of the text is turned into a space by
    # the regex, so stripping beforehand would leave stray whitespace behind.
    return re.sub(r'[^A-Za-z]+', ' ', lowered).strip()

In [11]:
text = preprocessing(' Top Trump Surrogate BRUTALLY Stabs Him In The Back: ‘He’s Pathetic’ (VIDEO) It s looking as though Republican presidential candidate 50 Donald Trump is losing support even from within his own ranks. You know things are getting bad when even your top surrogates start turning against you, which is exactly what just happened on Fox News when Newt Gingrich called Trump  pathetic. Gingrich knows that Trump needs to keep his focus on Hillary Clinton if he even remotely wants to have a chance at defeating her. However, Trump has hurt feelings because many Republicans don t support his sexual assault against women have turned against him, including House Speaker Paul Ryan (R-WI). So, that has made Trump lash out as his own party.Gingrich said on Fox News: Look, first of all, let me just say about Trump, who I admire and I ve tried to help as much as I can. There s a big Trump and a little Trump. The little Trump is frankly pathetic. I mean, he s mad over not getting a phone call? Trump s referring to the fact that Paul Ryan didn t call to congratulate him after the debate. Probably because he didn t win despite what Trump s ego tells him.Gingrich also added: Donald Trump has one opponent. Her name is Hillary Clinton. Her name is not Paul Ryan. It s not anybody else. Trump doesn t seem to realize that the person he should be mad at is himself because he truly is his own worst enemy. This will ultimately lead to his defeat and he will have no one to blame but himself.Watch here via Politico:Featured Photo by Joe Raedle/Getty Images')

In [12]:
text

'trump surrogate brutally stabs pathetic video s looking republican presidential candidate donald trump losing support ranks know things getting bad surrogates start turning exactly happened fox news newt gingrich called trump pathetic gingrich knows trump needs focus hillary clinton remotely wants chance defeating trump hurt feelings republicans don t support sexual assault women turned including house speaker paul ryan r wi trump lash party gingrich said fox news look let trump admire ve tried help s big trump little trump little trump frankly pathetic mean s mad getting phone trump s referring fact paul ryan didn t congratulate debate probably didn t win despite trump s ego tells gingrich added donald trump opponent hillary clinton paul ryan s anybody trump doesn t realize person mad truly worst enemy ultimately lead defeat blame watch politico featured photo joe raedle getty images'

In [13]:
# Clean every article with preprocessing(); this runs the spaCy pipeline once
# per row, so it is the slow step of the notebook.
df['updated_news_text'] = df['Text'].map(preprocessing)

In [15]:
df

Unnamed: 0,Text,label,numeric_label,updated_news_text
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,1,trump surrogate brutally stabs pathetic video ...
1,U.S. conservative leader optimistic of common ...,Real,0,u s conservative leader optimistic common grou...
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,0,trump proposes u s tax overhaul stirs concerns...
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,1,court forces ohio allow millions illegally pur...
4,Democrats say Trump agrees to work on immigrat...,Real,0,democrats trump agrees work immigration bill w...
...,...,...,...,...
9895,Wikileaks Admits To Screwing Up IMMENSELY Wit...,Fake,1,wikileaks admits screwing immensely twitter po...
9896,Trump consults Republican senators on Fed chie...,Real,0,trump consults republican senators fed chief c...
9897,Trump lawyers say judge lacks jurisdiction for...,Real,0,trump lawyers judge lacks jurisdiction defamat...
9898,WATCH: Right-Wing Pastor Falsely Credits Trum...,Fake,1,watch right wing pastor falsely credits trump ...


In [16]:
df['updated_news_text'][0]

'trump surrogate brutally stabs pathetic video s looking republican presidential candidate donald trump losing support ranks know things getting bad surrogates start turning exactly happened fox news newt gingrich called trump pathetic gingrich knows trump needs focus hillary clinton remotely wants chance defeating trump hurt feelings republicans don t support sexual assault women turned including house speaker paul ryan r wi trump lash party gingrich said fox news look let trump admire ve tried help s big trump little trump little trump frankly pathetic mean s mad getting phone trump s referring fact paul ryan didn t congratulate debate probably didn t win despite trump s ego tells gingrich added donald trump opponent hillary clinton paul ryan s anybody trump doesn t realize person mad truly worst enemy ultimately lead defeat blame watch politico featured photo joe raedle getty images'

In [17]:
# TF-IDF encoder; max_features=5000 caps the learned vocabulary at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)

In [18]:
# Learn the vocabulary and IDF weights from the cleaned text and encode every
# row as a TF-IDF matrix.
# NOTE(review): this fit sees all rows, including those that later land in the
# test split (train_test_split runs afterwards), so test documents influence the
# vocabulary and IDF weights. Fitting on the training rows only and calling
# transform() on the test rows would avoid that leakage.
vector = vectorizer.fit_transform(df['updated_news_text'])

In [19]:
# Store each document's TF-IDF row on the frame.
# Assigning a plain list of rows is positional, so it cannot be mis-aligned (or
# filled with NaN) the way an index-aligned pd.Series would be if df ever has a
# non-default index. Each row is kept as a NumPy array rather than a Python list
# of floats, which avoids creating one float object per matrix entry.
# NOTE: this still densifies the matrix (n_documents x n_features floats).
df['vector'] = list(vector.toarray())

In [20]:
df.columns

Index(['Text', 'label', 'numeric_label', 'updated_news_text', 'vector'], dtype='object')

In [21]:
df

Unnamed: 0,Text,label,numeric_label,updated_news_text,vector
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,1,trump surrogate brutally stabs pathetic video ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,U.S. conservative leader optimistic of common ...,Real,0,u s conservative leader optimistic common grou...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0897695320776..."
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,0,trump proposes u s tax overhaul stirs concerns...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,1,court forces ohio allow millions illegally pur...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,Democrats say Trump agrees to work on immigrat...,Real,0,democrats trump agrees work immigration bill w...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0328982604648..."
...,...,...,...,...,...
9895,Wikileaks Admits To Screwing Up IMMENSELY Wit...,Fake,1,wikileaks admits screwing immensely twitter po...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9896,Trump consults Republican senators on Fed chie...,Real,0,trump consults republican senators fed chief c...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9897,Trump lawyers say judge lacks jurisdiction for...,Real,0,trump lawyers judge lacks jurisdiction defamat...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9898,WATCH: Right-Wing Pastor Falsely Credits Trum...,Fake,1,watch right wing pastor falsely credits trump ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [23]:
df.columns

Index(['Text', 'label', 'numeric_label', 'updated_news_text', 'vector'], dtype='object')

In [24]:
# Hold out 20% of the rows for evaluation. The split is seeded and stratified
# on the numeric label so both partitions keep the same class balance.
x_train, x_test, y_train, y_test = train_test_split(
    df['vector'],
    df['numeric_label'],
    test_size=0.2,
    random_state=2022,
    stratify=df['numeric_label'],
)

In [25]:
x_train.shape,y_train.shape

((7920,), (7920,))

In [26]:
x_train

4073    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
202     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3748    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3876    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1732    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
                              ...                        
2705    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
967     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
6476    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1357    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1195687885866...
7087    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: vector, Length: 7920, dtype: object

In [27]:
x_test

9223    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
5499    [0.0, 0.0, 0.0, 0.0, 0.0, 0.04214768347196643,...
5540    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
7586    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0463087655883...
7410    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
                              ...                        
9282    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1200    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2942    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0417396215311...
3791    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
6087    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: vector, Length: 1980, dtype: object

In [28]:
y_train,y_test

(4073    0
 202     0
 3748    1
 3876    1
 1732    0
        ..
 2705    0
 967     1
 6476    1
 1357    1
 7087    1
 Name: numeric_label, Length: 7920, dtype: int64,
 9223    0
 5499    1
 5540    1
 7586    0
 7410    0
        ..
 9282    1
 1200    1
 2942    1
 3791    1
 6087    0
 Name: numeric_label, Length: 1980, dtype: int64)

In [29]:
x_train.ndim

1

In [30]:
# x_train is a 1-D Series whose elements are per-document vectors; turn it into
# a single 2-D array (rows = documents) that the estimator can consume.
x_train_2d = np.array(x_train.tolist())

In [31]:
# Same conversion for the held-out rows: Series of vectors -> one 2-D array.
x_test_2d = np.array(x_test.tolist())

In [32]:
x_train_2d.ndim

2

In [33]:
x_test_2d.ndim

2

In [34]:
x_train_2d

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [35]:
# Gradient-boosted trees on the dense TF-IDF features.
# random_state is pinned (same seed as the train/test split) so that a
# Restart & Run All reproduces the same fitted model and the same report;
# without it the estimator draws its own seed on every run.
clf = GradientBoostingClassifier(random_state=2022)
clf.fit(x_train_2d, y_train)

In [36]:
# Predict the numeric label (1 = Fake, 0 = Real) for every held-out row.
y_pred = clf.predict(x_test_2d)

In [37]:
# Per-class precision / recall / F1 on the held-out 20% split.
# NOTE(review): the report below shows 1.00 on every metric. That is unusually
# high and worth checking: the TF-IDF vocabulary/IDF was fitted on all rows
# before the split, and presumably some source-specific tokens give the label
# away - TODO confirm.
# NOTE(review): classification_report is imported from the private module
# sklearn.metrics._classification at the top of the file; the public import
# path is sklearn.metrics.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       980
           1       1.00      1.00      1.00      1000

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980



In [38]:
test_news = [
    "Michigan governor denies misleading U.S. House on Flint water (Reuters) - Michigan Governor Rick Snyder denied Thursday that he had misled a U.S. House of Representatives committee last year over testimony on Flintâ€™s water crisis after lawmakers asked if his testimony had been contradicted by a witness in a court hearing. The House Oversight and Government Reform Committee wrote Snyder earlier Thursday asking him about published reports that one of his aides, Harvey Hollins, testified in a court hearing last week in Michigan that he had notified Snyder of an outbreak of Legionnairesâ€™ disease linked to the Flint water crisis in December 2015, rather than 2016 as Snyder had testified. â€œMy testimony was truthful and I stand by it,â€ Snyder told the committee in a letter, adding that his office has provided tens of thousands of pages of records to the committee and would continue to cooperate fully.  Last week, prosecutors in Michigan said Dr. Eden Wells, the stateâ€™s chief medical executive who already faced lesser charges, would become the sixth current or former official to face involuntary manslaughter charges in connection with the crisis. The charges stem from more than 80 cases of Legionnairesâ€™ disease and at least 12 deaths that were believed to be linked to the water in Flint after the city switched its source from Lake Huron to the Flint River in April 2014. Wells was among six current and former Michigan and Flint officials charged in June. The other five, including Michigan Health and Human Services Director Nick Lyon, were charged at the time with involuntary manslaughter",
    " WATCH: Fox News Host Loses Her Sh*t, Says Investigating Russia For Hacking Our Election Is Unpatriotic This woman is insane.In an incredibly disrespectful rant against President Obama and anyone else who supports investigating Russian interference in our election, Fox News host Jeanine Pirro said that anybody who is against Donald Trump is anti-American. Look, it s time to take sides,  she began.",
    " Sarah Palin Celebrates After White Man Who Pulled Gun On Black Protesters Goes Unpunished (VIDEO) Sarah Palin, one of the nigh-innumerable  deplorables  in Donald Trump s  basket,  almost outdid herself in terms of horribleness on Friday."
]

In [39]:
# Vectorise the raw sample articles directly.
# NOTE(review): this skips preprocessing(), unlike the text the vectorizer was
# fitted on, and the result is not used by any later cell visible here (the
# prediction uses convert_into_vector instead). Looks like a leftover - confirm
# before removing.
test_news_vecorts = vectorizer.transform(test_news)

In [48]:
# Run the sample articles through the same cleaning step as the training text.
cleaning_area = list(map(preprocessing, test_news))

In [51]:
# Encode the cleaned samples with the already-fitted vectorizer (transform only,
# so the vocabulary learned from the dataset is reused unchanged).
convert_into_vector  = vectorizer.transform(cleaning_area)

In [52]:
# Predicted labels for the three sample articles, in order.
# Per the numeric_label encoding, 1 means 'Fake' and 0 means 'Real'.
clf.predict(convert_into_vector)

array([0, 1, 1], dtype=int64)

In [53]:
import joblib

In [54]:
# Persist the fitted classifier. The destination is an absolute,
# machine-specific path. The vectorizer is saved separately, and the
# preprocessing() step is not persisted at all, so any consumer of this file
# has to reproduce that cleaning itself.
model_path = 'D:/NLP projects/model.pkl'
joblib.dump(clf, model_path)

['D:/NLP projects/model.pkl']

In [55]:
df['vector']

0       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0897695320776...
2       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0328982604648...
                              ...                        
9895    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
9896    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
9897    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
9898    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
9899    [0.028934469882832443, 0.0, 0.0, 0.0, 0.0, 0.0...
Name: vector, Length: 9900, dtype: object

In [57]:
# Persist the fitted TF-IDF vectorizer next to the model so new text can be
# encoded with the same vocabulary and IDF weights at inference time.
vectorizer_path = "D:/NLP projects/vector.pkl"
joblib.dump(vectorizer, vectorizer_path)

['D:/NLP projects/vector.pkl']