# Background
The purpose of this to determine if a given a set of news title, whether we could determine if it
is a false new or not

# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
df = pd.read_csv('train.csv', index_col = False)
df = df.drop('id', axis = 1)
df.head()

Unnamed: 0,title,author,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


# Data Cleaning (Removing empty text rows)

In [3]:
df.info()  # Text column has 20761 non-nulls which means 39 null columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   20242 non-null  object
 1   author  18843 non-null  object
 2   text    20761 non-null  object
 3   label   20800 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 650.1+ KB


In [4]:
# Removing the null values in text column
df.dropna(axis = 0, subset=['text'], inplace = True) 

In [5]:
# Now the text column has no null values
df.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20761 entries, 0 to 20799
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   20203 non-null  object
 1   author  18843 non-null  object
 2   text    20761 non-null  object
 3   label   20761 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 811.0+ KB


# Train test split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size = 0.25)

# Initializing Tfidvectorizer and performing fitting/transform on training set and transforming on test set

In [7]:
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train=tfidf_vectorizer.fit_transform(X_train) 
tfidf_test=tfidf_vectorizer.transform(X_test)

# Initializing PassiveAgressiveClassifier

In [8]:
pac=PassiveAggressiveClassifier(max_iter=57)
pac.fit(tfidf_train,y_train)
#DataFlair - Predict on the test set and calculate accuracy
y_pred=pac.predict(tfidf_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 96.42%


# Creating confusion Matrix

In [9]:
confusion_matrix(y_test,y_pred, labels=[0,1])

array([[2485,  108],
       [  78, 2520]], dtype=int64)

# Predicting texts from other sources

In [10]:
def predict_this(texts):
    texts = [texts] # Converting it to a text
    texts_formatted=tfidf_vectorizer.transform(texts)
    new_pred=pac.predict(texts_formatted)[0]
    if new_pred == 1:
        print('Real News')
    else:
        print('Fake News')

In [11]:
phrase = "JAKARTA: When archaeologist Candrian Attahiyat heard that the Jakarta city government was planning to widen the flood-prone Ciliwung River last year, he was immediately alarmed. Part of the river cuts through Jakarta’s heritage area and the normalisation project would see the capital’s main waterway broadened by up to 15m, threatening the few remaining sections of the 400-year-old perimeter walls built by the Dutch East India Company"
predict_this(phrase)

Real News


# Using the unseen dataset for prediction

In [12]:
df_unseen = pd.read_csv('test.csv')
df_unseen_label = pd.read_csv('unseen_labels.csv')
test_df = pd.concat([df_unseen,df_unseen_label], axis=1)
test_df.head()

Unnamed: 0,id,title,author,text,id.1,label
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",20800,0
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...,20801,1
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,20802,0
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...",20803,1
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,20804,1


In [13]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5200 non-null   int64 
 1   title   5078 non-null   object
 2   author  4697 non-null   object
 3   text    5193 non-null   object
 4   id      5200 non-null   int64 
 5   label   5200 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 243.9+ KB


In [14]:
test_df.dropna(axis = 0, subset=['text'], inplace = True)

In [15]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5193 entries, 0 to 5199
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5193 non-null   int64 
 1   title   5071 non-null   object
 2   author  4697 non-null   object
 3   text    5193 non-null   object
 4   id      5193 non-null   int64 
 5   label   5193 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 284.0+ KB


In [16]:
X = test_df['text']
y = test_df['label']

# Defining a function to predict the unseen dataset

In [17]:
def dataset_pred(X_test, y_test):
    dataset_test=tfidf_vectorizer.transform(X_test)
    dataset_pred=pac.predict(dataset_test)
    score=accuracy_score(y_test,dataset_pred)
    print(f'Accuracy: {round(score*100,2)}%')

# Predicting off the Unseen Dataset

In [18]:
dataset_pred(X,y)

Accuracy: 63.47%
