# Import Libraries

In [26]:
import nltk
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Read the dataset

In [27]:
df = pd.read_csv('train.csv')

# EDA

In [28]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [30]:
df.nunique()

id        20800
title     19803
author     4201
text      20386
label         2
dtype: int64

In [31]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [32]:
df['label'].value_counts()

1    10413
0    10387
Name: label, dtype: int64

In [33]:
df.fillna(' ', inplace=True)

In [34]:
df['content'] = df['title'] + ' ' + df['text']

In [35]:
df.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired Why the Trut...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...


# Splitting data into train and test

# Data cleaning and preprocessing

In [36]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ziadz\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [37]:
from sklearn.base import BaseEstimator, TransformerMixin
sett = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
port_stem = nltk.PorterStemmer()

class Preprocessing(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
    
    def clean_text(sekf, text):
        text = text.lower()
        text = re.sub(r'\d+', '', text)
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = text.strip()
        text = word_tokenize(text)
        text = [word for word in text if word not in sett]
        text = [lemmatizer.lemmatize(word) for word in text]
        text = ' '.join(text)
    
        return text
    
    
    def stemming(self, content):
        review = re.sub('[^a-zA-Z]',' ',content)
        review = review.lower()
        review = review.split()
        review = [port_stem.stem(word) for word in review if not word in sett]
        review = ' '.join(review)
        return review

    def transform(self, X, y=None):
        return X.apply(lambda text: self.stemming(self.clean_text(text)))

In [38]:
from sklearn.pipeline import Pipeline

content_pipeline = Pipeline([
        ('preprocess', Preprocessing()),
        ('tfidf', TfidfVectorizer())
])

In [39]:
prepapred_content = content_pipeline.fit_transform(df['content'])

X_train, X_test, y_train, y_test = train_test_split(prepapred_content, df['label'], test_size=0.2, random_state=42)

In [44]:
# save prepared data   
df.to_csv('prepared_data.csv', index=False)

In [1]:
# most common words

In [2]:
# word cloud

In [3]:
# lable frequency

# Machine learning models

**Logestic regression**

In [41]:
from sklearn.metrics import accuracy_score , ConfusionMatrixDisplay , classification_report , roc_curve

from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
LR.fit(X_train, y_train)

# evaluation
train_pred = LR.predict(X_train)
test_pred = LR.predict(X_test)

print(f'train score: {classification_report(y_train, train_pred)}')
print(f'test score: {classification_report(y_test, test_pred)}')

train score:               precision    recall  f1-score   support

           0       0.98      0.97      0.97      8255
           1       0.97      0.98      0.97      8385

    accuracy                           0.97     16640
   macro avg       0.97      0.97      0.97     16640
weighted avg       0.97      0.97      0.97     16640

test score:               precision    recall  f1-score   support

           0       0.95      0.94      0.95      2132
           1       0.94      0.95      0.95      2028

    accuracy                           0.95      4160
   macro avg       0.95      0.95      0.95      4160
weighted avg       0.95      0.95      0.95      4160



**Naive Bayes**

In [42]:
# NB model
from sklearn.naive_bayes import MultinomialNB
NB = MultinomialNB()
NB.fit(X_train, y_train)
train_pred = NB.predict(X_train)
test_pred = NB.predict(X_test)
print(f'train score: {classification_report(y_train, train_pred)}')
print(f'test score: {classification_report(y_test, test_pred)}')

train score:               precision    recall  f1-score   support

           0       0.84      0.99      0.91      8255
           1       0.99      0.81      0.89      8385

    accuracy                           0.90     16640
   macro avg       0.91      0.90      0.90     16640
weighted avg       0.91      0.90      0.90     16640

test score:               precision    recall  f1-score   support

           0       0.80      0.99      0.88      2132
           1       0.98      0.73      0.84      2028

    accuracy                           0.86      4160
   macro avg       0.89      0.86      0.86      4160
weighted avg       0.89      0.86      0.86      4160



**Decision Tree**

In [43]:
# dt model
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()
DT.fit(X_train, y_train)
train_pred = DT.predict(X_train)
test_pred = DT.predict(X_test)
print(f'train score: {classification_report(y_train, train_pred)}')
print(f'test score: {classification_report(y_test, test_pred)}')

train score:               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8255
           1       1.00      1.00      1.00      8385

    accuracy                           1.00     16640
   macro avg       1.00      1.00      1.00     16640
weighted avg       1.00      1.00      1.00     16640

test score:               precision    recall  f1-score   support

           0       0.94      0.94      0.94      2132
           1       0.93      0.93      0.93      2028

    accuracy                           0.93      4160
   macro avg       0.93      0.93      0.93      4160
weighted avg       0.93      0.93      0.93      4160



**Random forest**

In [20]:
# rf model
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier()
RF.fit(X_train, y_train)
train_pred = RF.predict(X_train)
test_pred = RF.predict(X_test)
print(f'train score: {classification_report(y_train, train_pred)}')
print(f'test score: {classification_report(y_test, test_pred)}')

train score:               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8255
           1       1.00      1.00      1.00      8385

    accuracy                           1.00     16640
   macro avg       1.00      1.00      1.00     16640
weighted avg       1.00      1.00      1.00     16640

test score:               precision    recall  f1-score   support

           0       0.90      0.95      0.93      2132
           1       0.95      0.89      0.92      2028

    accuracy                           0.92      4160
   macro avg       0.92      0.92      0.92      4160
weighted avg       0.92      0.92      0.92      4160



**Voting System**

In [24]:
from sklearn.ensemble import VotingClassifier


vot = VotingClassifier(
    estimators=[('Logestic Regression', LR), ('Naive Bayes', NB), ('Descision Tree', DT), ('Random Forest', RF)],
    voting='hard'
)

vot.fit(X_train, y_train)

In [89]:
train_prediction = vot.predict(X_train)
test_prediction = vot.predict(X_test)
print(f'train score: {classification_report(y_train, train_pred)}')
print(f'test score: {classification_report(y_test, test_pred)}')

train score:               precision    recall  f1-score   support

           0       0.98      0.24      0.39      8255
           1       0.57      0.99      0.73      8385

    accuracy                           0.62     16640
   macro avg       0.77      0.62      0.56     16640
weighted avg       0.77      0.62      0.56     16640

test score:               precision    recall  f1-score   support

           0       0.97      0.20      0.33      2132
           1       0.54      0.99      0.70      2028

    accuracy                           0.59      4160
   macro avg       0.75      0.60      0.52      4160
weighted avg       0.76      0.59      0.51      4160



In [90]:
# save the model
import pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(vot, f)

# save the tfidf
with open('tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# save the pipeline

with open('pipeline.pkl', 'wb') as f:
    pickle.dump(content_pipeline, f)
    
# save cleaned data
prepared_df.to_csv('prepared_data.csv', index=False)

In [103]:
# make a fake news and predict it
fake_news = 'The mystery surrounding The Third Reich and Nazi Germany is still a subject of debate between many observers. Some believe that Nazi Germany, under the control of Adolf Hitler, possessed supernatural powers, and largely employed pseudo-science during the 1933-1945 period. However, some also hold that the above belief is just a mere speculation without any proven fact. Over the years, researchers have searched extensively for answers to some of the more mysterious activities associated with Nazi Germany.Nazi Germany invaded Russia (formerly the USSR) during the Second World War on June 22, 1941. At the time, the German army progressed deep into Russian territory, gaining ground close to the capital Moscow, before the Russians could counter-attack, eventually driving the Nazis back.During the Nazi occupation in Russia, in 1942, the Nazis built a secret military base around the Arctic, code-named â€œSchatzgraberâ€ or â€œTreasure Hunter,â€ which was reportedly very instrumental in the war against Russia. The base was primarily used as a tactical weather station for planning the strategic movements of Nazi troops, warships and submarines. The base also housed eminent Nazi scientists, whom conducted many experiments to help progress a German win of the war. It was widely speculated at the time that the Nazis used the base to contact aliens or extraterrestrial beings. The controversial Ahnenerbe was even linked to the base. The Ahnenerbe was an institute in Nazi Germany. Responsible for researching archaeological and cultural history of the Aryan race, it is rumored to have had heavy occult influences. Founded on July 1, 1935, by Heinrich Himmler, Herman Wirth and Richard Walther DarrÃ©, the Ahnenerbe later conducted experiments and launched expeditions in attempts to prove that mythological Nordic populations had once ruled the world.However, the Nazis abandoned the base in 1944 â€“ a time when the Russian army began its offensive, pushing the Germans out of the country. According to a war-time story, supplies had dwindled to dangerously low levels, and the Nazi officers stationed at the base outpost were forced to kill and eat polar bear, which ultimately, was infected with trichinosis. This caused those stationed at the base to fall severely ill and eventually they required rescue by a German U-boat. Despite Russian authors telling the story of â€œTreasure Hunter,â€ some observers consider it a myth, doubting its existence.But Russian researchers have now announced that â€œTreasure Hunterâ€ has been discovered, saying the base is on the island of Alexandra Land in the Arctic Circle, located 620 miles from the North Pole.A senior researcher at the Russian Arctic National Park, Evgeny Ermolov said in a statement announcing the discovery : â€œBefore it was only known from written sources, but now we also have real proof.â€'
fake_news = pipeline.transform(pd.Series(fake_news))
fake_news = tfidf.transform(fake_news)
model.predict(fake_news)

array([1], dtype=int64)