### Load Dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the training CSV and discard its 'id' column in one step.
train_data = pd.read_csv("train.csv").drop(columns=['id'])
train_data.head()

Unnamed: 0,title,author,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
# Show each column's dtype and non-null count to spot missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   20242 non-null  object
 1   author  18843 non-null  object
 2   text    20761 non-null  object
 3   label   20800 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 650.1+ KB


### Data Preprocessing

#### Imputing Null Values

In [4]:
def replace_null(data):
    """Return ``data`` with every missing value replaced by the empty string.

    Parameters
    ----------
    data : object exposing ``fillna`` (e.g. a pandas DataFrame)
        The input is not modified; ``fillna`` returns a new object.

    Returns
    -------
    The result of ``data.fillna('')``.
    """
    return data.fillna('')

In [5]:
# Fill the missing title/author/text entries with '' so the columns can be
# concatenated as plain strings later on.
train_data = replace_null(train_data)

In [6]:
# Re-check the non-null counts after imputation.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   20800 non-null  object
 1   author  20800 non-null  object
 2   text    20800 non-null  object
 3   label   20800 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 650.1+ KB


#### Clean and normalize text (lowercase, strip punctuation, remove stopwords, stem)

In [7]:
# Download Stopword
import nltk, string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK resources (a no-op when they are already up to date).
# NOTE(review): "punkt" is not used by any cell visible here - confirm it is needed.
nltk.download('stopwords')
nltk.download("punkt")

# The corpus file id is lowercase "english"; "English" only resolves on
# case-insensitive file systems (e.g. Windows) and is not portable to Linux.
# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to C:\Users\Vedro
[nltk_data]     Suwandi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Vedro
[nltk_data]     Suwandi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
def preprocess_sentence(sentence):
    """Normalize a sentence: lowercase, strip punctuation, drop stopwords, stem.

    Parameters
    ----------
    sentence : str
        Raw text to normalize.

    Returns
    -------
    str
        The stemmed, non-stopword tokens joined by single spaces.

    Notes
    -----
    Uses the notebook-level ``stop_words`` collection and ``stemmer`` object.
    """
    # Translation table that deletes every ASCII punctuation character.
    remove_punkt = str.maketrans("", "", string.punctuation)
    # split() without an argument splits on any run of whitespace, so repeated
    # or leading spaces never produce empty tokens and tabs/newlines separate
    # words as well (split(" ") handled neither case).
    words = sentence.lower().translate(remove_punkt).split()
    # NOTE(review): punctuation is removed before the stopword check, so
    # stopword entries containing an apostrophe (e.g. "don't") will not match
    # their stripped form ("dont") - confirm whether that is intended.
    compact_text = [stemmer.stem(word) for word in words if word not in stop_words]
    return " ".join(compact_text)

In [9]:
# Combine author and title into one text column, separated by a single space.
author_col = train_data['author']
title_col = train_data['title']
train_data['news_title'] = author_col + ' ' + title_col

In [10]:
# Feature: the combined author + title text. Target: the label column.
x, y = train_data['news_title'], train_data['label']

In [11]:
# Normalize every combined text, then move features and labels to NumPy arrays.
cleaned_text = x.apply(preprocess_sentence)
x, y = cleaned_text.values, np.array(y)

In [12]:
# Sanity check: features and labels should have the same number of rows.
x.shape, y.shape

((20800,), (20800,))

#### Convert sentences to numeric features (TF-IDF)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# fit_transform learns the vocabulary and IDF weights and vectorizes the text
# in one pass, instead of processing the whole corpus twice (fit, then transform).
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split further down, so held-out rows contribute to the vocabulary and IDF
# weights; fit on the training rows only for a leak-free accuracy estimate.
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(x)

In [14]:
# Printing the sparse matrix lists (row, column) positions with their TF-IDF weights.
print(x)

  (0, 18962)	0.274622582696215
  (0, 16424)	0.24736985662147531
  (0, 11118)	0.3474865261398189
  (0, 10793)	0.27918196056451583
  (0, 9768)	0.23687061504480295
  (0, 8945)	0.21145389557856892
  (0, 6520)	0.2229793986940987
  (0, 5395)	0.28591279407841685
  (0, 5152)	0.2593852460013794
  (0, 4933)	0.3439481092033905
  (0, 4179)	0.23590805789133468
  (0, 3643)	0.3513624233627056
  (0, 1039)	0.2602414069110504
  (1, 20264)	0.30016009356422996
  (1, 8727)	0.19531304773923222
  (1, 7217)	0.7112902110369504
  (1, 4899)	0.26261539165209385
  (1, 4002)	0.19126591879712415
  (1, 3351)	0.3869733920652198
  (1, 2979)	0.15457882185857713
  (1, 2529)	0.2943305171704031
  (2, 18877)	0.44040959009493647
  (2, 11908)	0.5180322171017542
  (2, 7771)	0.3647228419925426
  (2, 7077)	0.4115495283082121
  :	:
  (20797, 15153)	0.2726902050028717
  (20797, 14930)	0.24783313481055852
  (20797, 12698)	0.0803971909051498
  (20797, 11871)	0.1745890961540486
  (20797, 11796)	0.2954806779318824
  (20797, 11223)	0.3

#### Split Dataset

In [15]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,    # hold out 20% of the rows for evaluation
    stratify=y,       # stratify the split on the labels
    random_state=1,   # fixed seed so the split is reproducible
)

### Model Training

#### Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier with the library's default settings.
log_regression = LogisticRegression()
log_regression.fit(x_train, y_train)

In [17]:
from sklearn.metrics import accuracy_score
# get the score/accuracy
def get_accuracy(model, x, y):
    """Print the accuracy of ``model`` on features ``x`` against labels ``y``.

    The score is printed as a percentage with two decimals; nothing is returned.
    """
    predictions = model.predict(x)
    percent = accuracy_score(y, predictions) * 100
    print("Accuracy : {:.2f}%".format(percent))

In [18]:
# Evaluate logistic regression on the held-out split.
get_accuracy(log_regression, x_test, y_test)

Accuracy : 97.72%


#### SVM

In [19]:
from sklearn.svm import SVC

# Train a support-vector classifier with the library's default settings.
svm = SVC()
svm.fit(x_train, y_train)

In [20]:
# Evaluate the SVM on the same held-out split for comparison.
get_accuracy(svm, x_test, y_test)

Accuracy : 98.80%


#### Making predictions on unseen data

In [21]:
# Load the test set and preview it.
test_data = pd.read_csv('test.csv')
test_data

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
...,...,...,...,...
5195,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...
5196,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...
5197,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...
5198,25998,300 US Marines To Be Deployed To Russian Borde...,,« Previous - Next » 300 US Marines To Be Deplo...


In [22]:
# Drop the 'id' column, as was done for the training data.
test_data = test_data.drop(columns=['id'])

In [23]:
# Replace NaN values with the empty string '' (same imputation as the training data)
# so author and title can be concatenated as strings.
test_data = replace_null(test_data)

In [24]:
# Build the same "author title" text feature that was used for training.
test_authors, test_titles = test_data['author'], test_data['title']
x_real_test = test_authors + " " + test_titles

In [25]:
# Apply the same text normalization that was applied to the training text.
x_real_test = x_real_test.apply(preprocess_sentence).values

In [26]:
# Transform only (no refit): reuse the vectorizer fitted earlier on the training text.
x_real_test = vectorizer.transform(x_real_test)

In [27]:
# Inspect the shape of the vectorized test set: (rows, features).
x_real_test.shape

(5200, 21660)

In [28]:
# Predict labels for the test set with both trained models.
y_real_test = log_regression.predict(x_real_test)
y_real_test_svm = svm.predict(x_real_test)

#### Make a single prediction

In [30]:
def single_prediction(model, title):
    """Classify one sample and describe the result in words.

    Parameters
    ----------
    model : object with a ``predict`` method
        Called as ``model.predict(title)``; only the first returned label is used.
    title : input accepted by ``model.predict``
        The notebook passes one row of the vectorized feature matrix, not raw text.

    Returns
    -------
    str
        "This is Real News" when the first predicted label equals 0,
        otherwise "This is Fake News".
    """
    predicted_label = model.predict(title)[0]
    return "This is Real News" if predicted_label == 0 else "This is Fake News"

In [31]:
# Classify the first test row with logistic regression.
print(single_prediction(log_regression, x_real_test[0]))

This is Real News


In [32]:
# Classify the same first test row with the SVM for comparison.
print(single_prediction(svm, x_real_test[0]))

This is Real News


#### 