In [2]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [3]:
# Load the fake/real news dataset. The CSV's 'Unnamed: 0' column becomes the
# row index, and rows pandas cannot parse are skipped instead of raising.
df = pd.read_csv(
    '/content/fake_or_real_news.csv',
    index_col='Unnamed: 0',
    on_bad_lines='skip',
)
# Show five random rows as a quick sanity check of the load.
df.sample(5)

Unnamed: 0,title,text,label
7742,Trick-Or-Treaters Get Their Socks Rocked By BA...,Trick-Or-Treaters Get Their Socks Rocked By BA...,FAKE
8240,Democrats Playing Class Card To Split the Whit...,,FAKE
8004,"World Proud Of Its Calm, Measured Response To ...",0 Add Comment \nIN THE immediate aftermath of ...,FAKE
5574,Is America On The Brink Of Civil War?,Is America On The Brink Of Civil War? 11/07/20...,FAKE
5604,Benny Morris’s Untenable Denial of the Ethnic ...,References The Debate \nIt started when Daniel...,FAKE


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(6335, 3)

In [5]:
# Number of missing values in each column.
df.isna().sum()

Unnamed: 0,0
title,0
text,0
label,0


In [6]:
# Fetch the NLTK resources needed by the text preprocessing defined later in
# this notebook (tokenising, stop-word removal, lemmatising).
# Tokenizer data — presumably what word_tokenize loads on recent NLTK; confirm.
nltk.download('punkt_tab')
# Stop-word lists, read via stopwords.words('english').
nltk.download('stopwords')
# Older tokenizer package — presumably kept for older NLTK releases; confirm.
nltk.download('punkt')
# WordNet corpus used by WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [7]:
# Merge headline and body into a single text field, separated by one space.
df['content'] = df['title'].str.cat(df['text'], sep=' ')

In [8]:
# The raw columns are now folded into 'content', so drop them.
# Reassigning (rather than inplace=True) plus errors='ignore' keeps this cell
# safe to re-run: a second execution no longer raises KeyError because the
# columns are already gone. The former axis=1 was redundant with `columns=`.
df = df.drop(columns=['title', 'text'], errors='ignore')

In [9]:
import re
def preprocess(content):
    """Normalise raw article text into space-joined, lemmatised tokens.

    Each document is lower-cased, stripped of every character that is not an
    ASCII letter, tokenised, filtered against NLTK's English stop-word list,
    lemmatised with WordNet and finally joined back into a single string.

    Parameters
    ----------
    content : pandas.Series, list, array-like of str, or str
        Documents to clean. Anything that is not already a Series is wrapped
        in one, so plain lists (or one bare string) work as well. Missing
        values are treated as empty documents.

    Returns
    -------
    pandas.Series
        One cleaned string per input document; the index of a Series input
        is preserved.
    """
    # Accept lists/arrays/single strings too: the Series-only ``.str``
    # accessor used previously failed for any other container.
    if not isinstance(content, pd.Series):
        content = pd.Series(content, dtype=object)

    # NaN/None would make ``re.sub`` raise TypeError, so map them to '' and
    # coerce any remaining non-string values to text.
    content = content.fillna('').astype(str)

    # Built once per call and shared by every document.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def _clean(document):
        # Lower-case, then replace digits, punctuation and any non-ASCII
        # character with a space so only letters and whitespace remain.
        letters_only = re.sub(r'[^a-zA-Z]', ' ', document.lower())
        # Tokenise, drop stop words, lemmatise, and join back into one string.
        return ' '.join(
            lemmatizer.lemmatize(token)
            for token in word_tokenize(letters_only)
            if token not in stop_words
        )

    # One pass per document instead of five separate ``apply`` sweeps.
    return content.apply(_clean)

# Label-encode the outcome

In [10]:
# Turn the string labels into integer codes. The fitted encoder stays in
# `label` so the original class names can be looked up afterwards.
label = LabelEncoder().fit(df['label'])
df['label'] = label.transform(df['label'])

In [25]:
# Class names learned by the encoder; a name's position in this array is the
# integer code it was assigned.
label.classes_

array(['FAKE', 'REAL'], dtype=object)

# model training

In [11]:
# Hold out 20% of the articles for evaluation; the fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['content'],
    df['label'],
    test_size=0.2,
    random_state=42,
)
print(X_train.shape, X_test.shape)

(5068,) (1267,)


In [12]:
# Shape of the training labels, for comparison with X_train's shape.
y_train.shape

(5068,)

In [13]:
# Text cleaning -> TF-IDF (vocabulary capped at 109 terms) -> class-balanced
# logistic regression. Cleaning lives inside the pipeline so the same
# transformation is applied at fit and at predict time.
model = LogisticRegression(class_weight='balanced')
pipe = Pipeline(steps=[
    ('preprocess', FunctionTransformer(func=preprocess)),
    ('vect', TfidfVectorizer(max_features=109)),
    ('model', model),
])
pipe.fit(X_train, y_train)

In [14]:
# Predict the held-out articles and report plain accuracy.
y_pred = pipe.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8382004735595896

In [15]:
# Precision, recall and F1 on the held-out set, printed in that order.
for metric in (precision_score, recall_score, f1_score):
    print(metric(y_test, y_pred))

0.8338461538461538
0.8482003129890454
0.8409619860356866


In [16]:
# Confusion matrix on the held-out set (rows: true class, columns: predicted).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[520, 108],
       [ 97, 542]])

In [17]:
from sklearn.model_selection import cross_val_score
# Mean accuracy over 3-fold cross-validation on the training split; the full
# pipeline (cleaning, TF-IDF, classifier) is passed in as the estimator.
cv_scores = cross_val_score(pipe, X_train, y_train, cv=3, scoring='accuracy')
cv_scores.mean()

np.float64(0.8218236576618869)

In [37]:
# Smoke test: classify a single article with the fitted pipeline.
# To classify custom input instead, build the text the same way the
# 'content' column was built (title + ' ' + text) and swap in these lines:
# title = input('title : ')
# text = input('text : ')
# model_input = title + ' ' + text
# model_prediction = pipe.predict(pd.Series([model_input]))
# NOTE(review): row 3608 may belong to the training split, in which case this
# is not an out-of-sample check — confirm before reading much into the result.
# The article is wrapped in a list so the Series is explicitly one document.
model_prediction = pipe.predict(pd.Series([df['content'].iloc[3608]]))
# `predict` returns an array. Test its single element rather than the truth
# value of the whole array, which raises once more than one document is passed.
if model_prediction[0] == 0:
    print('Fake news')
else:
    print('Real news')

Real news
