In [None]:
# Third-party libraries used throughout the notebook.
import numpy as np
import pandas as pd
from google.colab import drive

# Mount Google Drive so that files stored there can be read from Colab.
drive.mount('/content/drive')

# Read train.csv from the mounted drive into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/train.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#Analysis and Enhancement of corpus

In [None]:
# Replace missing values in the three text columns with a single space so that
# the columns can be concatenated as strings further down.
for column in ('author', 'text', 'title'):
    df[column] = df[column].fillna(" ")
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [None]:
# Number of articles per author. Missing authors were filled with " " earlier,
# so they appear as one blank entry in the result.
df['author'].value_counts()

                                     1957
Pam Key                               243
admin                                 193
Jerome Hudson                         166
Charlie Spiering                      141
                                     ... 
Louise Story                            1
Maximus Decimus Meridius                1
Marc Santora and Samantha Schmidt       1
Marianne Rohrlich                       1
A Jew (UID 73270427)                    1
Name: author, Length: 4202, dtype: int64

In [None]:
# Summary statistics per label; the `count` column gives the size of each class.
df.groupby('label').describe()
# The two classes have almost the same number of rows, so no rebalancing is needed.

Unnamed: 0_level_0,id,id,id,id,id,id,id,id
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,10387.0,10392.644171,5982.025154,1.0,5225.5,10396.0,15565.5,20797.0
1,10413.0,10406.338711,6027.288133,0.0,5159.0,10403.0,15629.0,20799.0


In [None]:
# Concatenate title, author and body into one text field per article.
# The label column deliberately stays in `df`: it is read later as the target.
# (A former `df.drop('label', axis=1)` call here discarded its result and had
# no effect, so it has been removed.)
df['news_data'] = df['title'] + " " + df['author'] + " " + df['text']

In [None]:
# Word-count statistics of the concatenated text, gathered as a guide for the
# sequence length of the LSTM input.
txt = list(df.news_data)
txt_len = [len(document.split()) for document in txt]
max_len_txt = max(txt_len, default=0)

print('Max length of the fake_news_text:', max_len_txt)
print('Mean length of the fake_news_text:', np.mean(txt_len))

Max length of the fake_news_text: 24245
Mean length of the fake_news_text: 773.1286057692307


In [None]:
# Target vector: the `label` column of the loaded data.
y=df['label']

#preprocessing

In [None]:
import nltk
import re
from nltk.corpus import stopwords
# Download the NLTK stop-word corpus if it is not already available locally.
nltk.download('stopwords')
from nltk.stem import PorterStemmer
# NOTE(review): `porter` is instantiated but none of the visible cells call it,
# so the text is not actually stemmed — confirm whether stemming was intended.
porter=PorterStemmer()
# English stop words that are removed from every document.
stop_word=stopwords.words('english')
# Per-document token lists; filled as a side effect of preprocessing().
tokenized_sents=[]
def preprocessing(text):
  """Clean an iterable of raw documents.

  Each item is converted to ``str``, every non-letter character is replaced by
  a space, the text is lower-cased and split on whitespace, and tokens found
  in the module-level ``stop_word`` collection are dropped.

  Side effect: the surviving token list of every document is appended to the
  module-level ``tokenized_sents`` list.

  Args:
    text: iterable of documents (any objects; ``str()`` is applied to each).

  Returns:
    list[str]: one cleaned, space-joined string per input document.
  """
  # Build the lookup set once; testing membership in the original list costs
  # a linear scan for every single token.
  stop_set = set(stop_word)
  data = []
  for raw in text:
    tokens = re.sub(r'[^a-zA-Z]', ' ', str(raw)).lower().split()
    kept = [token for token in tokens if token not in stop_set]
    tokenized_sents.append(kept)
    data.append(' '.join(kept))
  return data

data=preprocessing(df['news_data'])
print("clean data")
# Show only a short, truncated sample. Printing thousands of full documents
# floods the notebook output and trips the IOPub data rate limit.
for sample in data[:3]:
  print(sample[:300])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



#TF-IDF vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vocabulary on the cleaned documents and convert them into a
# document-term matrix.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so the IDF weights also see the held-out documents —
# consider fitting on the training portion only when measuring accuracy.
v = TfidfVectorizer()
x = v.fit_transform(data)

#Dataset division

In [None]:
# Hold out 30% of the samples as the test set; random_state is fixed so the split is repeatable.

from sklearn.model_selection import train_test_split
x_train,x_test, y_train, y_test=train_test_split(x,y,test_size=0.3, random_state=0)

In [None]:
# Registry of fitted models, keyed by model name.
dict_models={}
# Maps accuracy score -> model name. NOTE(review): because the score is the key,
# two models with the same accuracy would overwrite each other.
dict_accuracy={}

#Model-1, MultinomialNB

In [None]:
# Model 1: multinomial Naive Bayes with its default smoothing.
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# fit() returns the estimator itself, so construction and training can be chained.
classifier_mnb = MultinomialNB().fit(x_train, y_train)
pred = classifier_mnb.predict(x_test)
score = accuracy_score(y_test, pred)

# Report accuracy, the confusion matrix and the per-class metrics.
print("accuracy:   %0.3f" % score)
print()
print("accuracy_matrix")
print(confusion_matrix(y_test,pred))
print()
print(f"Classification Report : \n\n{classification_report(y_test, pred)}")

# Register the model and its score for the final best-model selection.
dict_models["MultinomialNB"] = classifier_mnb
dict_accuracy[score] = "MultinomialNB"

accuracy:   0.871

accuracy_matrix
[[3055   29]
 [ 775 2381]]

Classification Report : 

              precision    recall  f1-score   support

           0       0.80      0.99      0.88      3084
           1       0.99      0.75      0.86      3156

    accuracy                           0.87      6240
   macro avg       0.89      0.87      0.87      6240
weighted avg       0.89      0.87      0.87      6240



#Tuning of MultinomialNB

In [None]:
## Tune the additive ("add-k") smoothing parameter `alpha` of MultinomialNB.
import numpy as np

# Candidates are 0.1, 0.2, ..., 1.9. alpha=0 is left out on purpose: it only
# triggered scikit-learn's "alpha too small" warning and is not a usable value.
# NOTE(review): alpha is chosen by accuracy on the test set, so the reported
# score is optimistic — a validation split or cross-validation would be cleaner.
pre_score = -1.0  # below any possible accuracy, so the first candidate is always kept
best_k = 0
mnb_tuned_classifier = None
for a in np.arange(1, 20) / 10:
  classifier = MultinomialNB(alpha=a)
  classifier.fit(x_train, y_train)
  score = accuracy_score(y_test, classifier.predict(x_test))
  if score > pre_score:
    mnb_tuned_classifier = classifier
    pre_score = score
    best_k = a

# The best classifier was already fitted inside the loop; no refit is required.
pred = mnb_tuned_classifier.predict(x_test)
score = accuracy_score(y_test, pred)
print("accuracy: %0.3f" %score)
print("best_k: %0.3f" % best_k)
print()
print("accuracy_matrix")
print(confusion_matrix(y_test,pred))
print()
print(f"Classification Report : \n\n{classification_report(y_test, pred)}")
# Register the tuned model (previously the untuned `classifier_mnb` was stored here).
dict_models["Tuned_MultinomialNB"] = mnb_tuned_classifier
dict_accuracy[score] = "Tuned_MultinomialNB"

  % _ALPHA_MIN


accuracy: 0.921
best_k: 0.100

accuracy_matrix
[[2978  106]
 [ 387 2769]]

Classification Report : 

              precision    recall  f1-score   support

           0       0.88      0.97      0.92      3084
           1       0.96      0.88      0.92      3156

    accuracy                           0.92      6240
   macro avg       0.92      0.92      0.92      6240
weighted avg       0.92      0.92      0.92      6240



#Model-2, PassiveAggressive

In [None]:
# Passive-Aggressive linear classifier with C=0.6 and a fixed seed.

from sklearn.linear_model import PassiveAggressiveClassifier

# fit() returns the estimator itself, so construction and training can be chained.
model_pa = PassiveAggressiveClassifier(C = 0.6, random_state = 5).fit(x_train, y_train)
test_pred = model_pa.predict(x_test)
score = accuracy_score(y_test, test_pred)

# Report accuracy (as a percentage), the confusion matrix and per-class metrics.
print(f"Tuned Test Set Accuracy : {accuracy_score(y_test, test_pred) * 100} %\n\n")
print()
print("accuracy_matrix")
print(confusion_matrix(y_test,test_pred))
print()
print(f"Classification Report : \n\n{classification_report(y_test, test_pred)}")

# Register the model and its score for the final best-model selection.
dict_models["passive_aggresive"] = model_pa
dict_accuracy[score] = "passive_aggresive"


Test Set Accuracy : 97.25961538461539 %



accuracy_matrix
[[2977  107]
 [  64 3092]]

Classification Report : 

              precision    recall  f1-score   support

           0       0.98      0.97      0.97      3084
           1       0.97      0.98      0.97      3156

    accuracy                           0.97      6240
   macro avg       0.97      0.97      0.97      6240
weighted avg       0.97      0.97      0.97      6240



# Model-3, Decision Tree


In [None]:
# Decision tree classifier.
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the fitted tree, and therefore the reported
# metrics, reproducible between runs.
dt_classifier = DecisionTreeClassifier(random_state=0)
dt_classifier.fit(x_train, y_train)
test_pred = dt_classifier.predict(x_test)
score = accuracy_score(y_test, test_pred)
print(f"Test Set Accuracy : {score * 100} %\n\n")
print()
print("accuracy_matrix")
print(confusion_matrix(y_test,test_pred))
print()
print(f"Classification Report : \n\n{classification_report(y_test, test_pred)}")

# Registration is disabled, so the decision tree does not take part in the
# best-model selection at the end of the notebook.
#dict_models["DecisionTree"]=dt_classifier
#dict_accuracy[score]="DecisionTree"

#Model-4, RANDOM FOREST


In [None]:

# Random forest classifier.
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the bootstrap samples and feature choices, and
# therefore the reported metrics, reproducible between runs.
clf_random = RandomForestClassifier(random_state=0)
clf_random.fit(x_train, y_train)
pred = clf_random.predict(x_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
print()
print("accuracy_matrix")
print(confusion_matrix(y_test,pred))
print()
print(f"Classification Report : \n\n{classification_report(y_test, pred)}")

# Register the model and its score for the final best-model selection.
dict_models["RandomForestClassifier"] = clf_random
dict_accuracy[score] = "RandomForestClassifier"

accuracy:   0.941

accuracy_matrix
[[2963  121]
 [ 247 2909]]

Classification Report : 

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      3084
           1       0.96      0.92      0.94      3156

    accuracy                           0.94      6240
   macro avg       0.94      0.94      0.94      6240
weighted avg       0.94      0.94      0.94      6240



#Model-5, LSTM


In [None]:
# Encode every cleaned document as a list of integer word indices using Keras'
# one_hot helper with a vocabulary size of 5000. These index sequences are the
# input of the Embedding layer of the sequential (LSTM) model, which lets the
# model learn from word order and context.
# NOTE(review): despite its name, one_hot hashes words to indices (it is not a
# one-hot or positional encoding), so different words may share an index —
# confirm this is acceptable.
from tensorflow.keras.preprocessing.text import one_hot
vocab_size = 5000
onehot_rep = [one_hot(document, vocab_size) for document in data]


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense
# NOTE(review): Bidirectional is imported but not used in the visible cells.
from tensorflow.keras.layers import LSTM,Bidirectional, Dropout
# Bring every encoded document to a fixed length of 650 indices, padding
# shorter ones at the front (padding='pre').
embd=pad_sequences(onehot_rep,padding='pre', maxlen=650)
# Display the resulting array shape as a sanity check.
embd.shape

(20800, 650)

In [None]:
# Build the LSTM classifier: a 40-dimensional embedding over `vocab_size`
# indices, one LSTM layer with 50 units, a 64-unit ReLU dense layer and a
# sigmoid output for the binary label; dropout of 0.2 follows each stage.
model = Sequential()
model.add(Embedding(vocab_size, 40, input_length=650))
model.add(Dropout(0.2))
model.add(LSTM(50))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 650, 40)           200000    
                                                                 
 dropout_5 (Dropout)         (None, 650, 40)           0         
                                                                 
 lstm_2 (LSTM)               (None, 50)                18200     
                                                                 
 dropout_6 (Dropout)         (None, 50)                0         
                                                                 
 dense_2 (Dense)             (None, 64)                3264      
                                                                 
 dropout_7 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                

In [None]:
# Split the padded index sequences 80/20 and train the LSTM for 5 epochs.
# NOTE: this overwrites the x_train/x_test/y_train/y_test variables that held
# the TF-IDF split used by the scikit-learn models.
x_final=np.array(embd)
y_final=np.array(y)
# random_state is fixed so the split (and the test set used for evaluation)
# is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x_final, y_final, test_size=0.2, random_state=0)
model.fit(x_train,y_train,epochs=5,batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fc45e3fe310>

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain class predictions.
y_pred=(model.predict(x_test)>=0.5).astype(int)
# Compute the LSTM's own accuracy. Previously `score` was not reassigned here,
# so the registry entry below reused the previous model's score as its key and
# overwrote that model's entry.
score = accuracy_score(y_test, y_pred)
print(score)
print()
print("accuracy_matrix")
print(confusion_matrix(y_test,y_pred))
print()
print(f"Classification Report : \n\n{classification_report(y_test, y_pred)}")
# NOTE(review): if the LSTM ever becomes the best model, the save step would
# pickle it next to the TF-IDF vectorizer, which this model does not use.
dict_models["lstm"]=model
dict_accuracy[score]="lstm"

0.9447115384615384

accuracy_matrix
[[1974  129]
 [ 101 1956]]

Classification Report : 

              precision    recall  f1-score   support

           0       0.95      0.94      0.94      2103
           1       0.94      0.95      0.94      2057

    accuracy                           0.94      4160
   macro avg       0.94      0.94      0.94      4160
weighted avg       0.94      0.94      0.94      4160



#finding model with highest accuracy

In [None]:
# Pick the registered model with the highest accuracy: the largest key of
# dict_accuracy names the winner, which is then looked up in dict_models.
keys = dict_accuracy.keys()
max_key = max(dict_accuracy)
print(keys)
model_name = dict_accuracy[max_key]
print(model_name)
# Note: this rebinds `model`, which previously referred to the Keras LSTM.
model = dict_models[model_name]

dict_keys([0.8711538461538462, 0.9209935897435897, 0.9725961538461538, 0.9625, 0.941025641025641])
passive_aggresive


In [None]:
print(model)

PassiveAggressiveClassifier(C=0.6, random_state=5)


#Saving the model and TF-IDF vectorizer for deployment

In [None]:
 import pickle
# Persist the selected model and the fitted TF-IDF vectorizer for deployment.
# The context managers guarantee each file is flushed and closed, which the
# former bare open() calls did not.
with open('/content/drive/MyDrive/models/model.pickle', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('/content/drive/MyDrive/models/tfidf.pickle', 'wb') as tfidf_file:
    pickle.dump(v, tfidf_file)
