In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string

In [2]:
train = pd.read_csv('/kaggle/input/fake-news-classification/train (2).csv', sep=';')
test = pd.read_csv('/kaggle/input/fake-news-classification/test (1).csv', sep=';')
eval = pd.read_csv('/kaggle/input/fake-news-classification/evaluation.csv', sep=';')

In [3]:
train.head(1)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,Palestinians switch off Christmas lights in Be...,"RAMALLAH, West Bank (Reuters) - Palestinians s...",1


In [4]:
test.head(1)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,"Live from New York, it's a Trump-Clinton remat...",NEW YORK (Reuters) - Veteran actor and frequen...,1


In [5]:
eval.head(1)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,"Sanders back in U.S. Senate, blasts 'coloniali...",WASHINGTON (Reuters) - Democratic U.S. preside...,1


In [6]:
train.shape, test.shape, eval.shape

((24353, 4), (8117, 4), (8117, 4))

In [7]:
data = pd.concat([train, test, eval], axis=0)

In [8]:
data.shape

(40587, 4)

In [9]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,Palestinians switch off Christmas lights in Be...,"RAMALLAH, West Bank (Reuters) - Palestinians s...",1
1,1,China says Trump call with Taiwan president wo...,BEIJING (Reuters) - U.S. President-elect Donal...,1
2,2,FAIL! The Trump Organization’s Credit Score W...,While the controversy over Trump s personal ta...,0
3,3,Zimbabwe military chief's China trip was norma...,BEIJING (Reuters) - A trip to Beijing last wee...,1
4,4,THE MOST UNCOURAGEOUS PRESIDENT EVER Receives ...,There has never been a more UNCOURAGEOUS perso...,0


In [10]:
data.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [11]:
data.pop('Unnamed: 0')

0          0
1          1
2          2
3          3
4          4
        ... 
8112    8112
8113    8113
8114    8114
8115    8115
8116    8116
Name: Unnamed: 0, Length: 40587, dtype: int64

In [12]:
data.columns

Index(['title', 'text', 'label'], dtype='object')

In [13]:
data.head(1)

Unnamed: 0,title,text,label
0,Palestinians switch off Christmas lights in Be...,"RAMALLAH, West Bank (Reuters) - Palestinians s...",1


In [14]:
def processText(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

data['title'] = data['title'].apply(processText)
data['text'] = data['text'].apply(processText)

In [15]:
data.head(5)

Unnamed: 0,title,text,label
0,palestinians switch off christmas lights in be...,ramallah west bank reuters palestinians s...,1
1,china says trump call with taiwan president wo...,beijing reuters u s president elect donal...,1
2,fail the trump organization s credit score w...,while the controversy over trump s personal ta...,0
3,zimbabwe military chief s china trip was norma...,beijing reuters a trip to beijing last wee...,1
4,the most uncourageous president ever receives ...,there has never been a more uncourageous perso...,0


In [16]:
y = data.pop('label')
X = data.copy()

In [17]:
X.head(2)

Unnamed: 0,title,text
0,palestinians switch off christmas lights in be...,ramallah west bank reuters palestinians s...
1,china says trump call with taiwan president wo...,beijing reuters u s president elect donal...


In [18]:
y.head(2)

0    1
1    1
Name: label, dtype: int64

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [20]:
X_test.shape

(8118, 2)

In [21]:
X_train.shape

(32469, 2)

In [22]:
vectorizer = TfidfVectorizer()

x1 = data['title']
x2 = data['text']

x1 = vectorizer.fit_transform(x1)
x2 = vectorizer.fit_transform(x2)

In [23]:
print(x1)
print(x2)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 462777 stored elements and shape (40587, 21957)>
  Coords	Values
  (0, 15134)	0.28048603804808064
  (0, 20109)	0.0947399050053756
  (0, 796)	0.2296976525426388
  (0, 1856)	0.452850125517141
  (0, 9768)	0.23423121676979106
  (0, 11319)	0.3862363283425796
  (0, 3382)	0.32017270978871587
  (0, 13416)	0.23936559392075724
  (0, 19123)	0.40419389524091154
  (0, 13857)	0.35678531956577586
  (1, 18539)	0.4216083059177137
  (1, 10277)	0.42301839619671605
  (1, 3169)	0.3259489800781733
  (1, 21630)	0.30027763207752456
  (1, 14858)	0.23450032476667315
  (1, 19197)	0.3778105737844253
  (1, 21590)	0.18550428684481954
  (1, 2786)	0.3156293330893364
  (1, 16977)	0.18388110740091607
  (1, 3329)	0.2617670019370966
  (1, 20109)	0.11310749592722615
  (2, 11014)	0.3917276089634742
  (2, 21834)	0.23410664639395265
  (2, 11733)	0.2726642206124024
  (2, 21522)	0.21104879383573125
  :	:
  (40584, 20938)	0.1731570600742336
  (40584, 13499)	0.1422005