# Fake News Classifier

### Data Reading and Interpretation

__Import Statements__

In [95]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
from os import path
import re
import string
import altair as alt
import matplotlib.pyplot as plt

from collections import Counter

import nltk

# nltk.download('words')
# nltk.download("cmudict")
# nltk.download("vader_lexicon")
# nltk.download("punkt")
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')

from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk.corpus import cmudict, stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
#from similarity.jarowinkler import JaroWinkler
#from similarity.cosine import Cosine
#from sklearn.metrics.pairwise import cosine_similarity

from sklearn.model_selection import (
    cross_val_score,
    cross_validate,
    train_test_split,
)

from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    FunctionTransformer
)

from sklearn.impute import SimpleImputer

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from wordcloud import WordCloud

alt.data_transformers.enable('data_server')
# alt.renderers.enable('mimetype')

DataTransformerRegistry.enable('data_server')

In [2]:
train_df = pd.read_csv("../data/raw/train.csv")

### Data Cleaning 

- Deal NaNs/Nulls and empty texts

In [3]:
def remove_punctuation(word):
    punctuations = string.punctuation
    punctuations += "“”–\n"

    for element in word:
        if element in punctuations:
            word = word.replace(element, "")
    return word


def clean_data(text):
    text = str(text).lower()
    text = str(text).strip()
    text
    text = re.sub("https?://\S+|www\.\S+", "", text)
    text = remove_punctuation(text)
    return text

In [4]:
train_df["title"] = train_df["title"].apply(clean_data)

In [5]:
train_df["text"] = train_df["text"].apply(clean_data)

In [None]:
train_df.info()

In [None]:
train_df.isnull().sum()

In [None]:
train_df.describe(include="all")

### EDA

In [None]:
train_df["label"].value_counts().plot.bar()

__There is no class imbalance__

In [6]:
fake = " ".join(train_df[train_df["label"] == 1]["text"])
real = " ".join(train_df[train_df["label"] == 0]["text"])

In [7]:
stop_words = stopwords.words("English")

In [None]:
# wordcloud_fake = WordCloud(
#     stopwords=stop_words,
#     max_font_size=40,
#     width=400,
#     height=200
# ).generate(fake).to_image()

# plt.imshow(wordcloud_fake)
# plt.axis("off")
# plt.show()

In [None]:
# wordcloud_real = WordCloud(
#     stopwords=stop_words,
#     max_font_size=40,
#     width=400,
#     height=200
# ).generate(real).to_image()

# plt.imshow(wordcloud_real)
# plt.axis("off")
# plt.show()

In [None]:
real_counter = Counter(word for word in real.split(" ") if word not in stop_words)

In [None]:
fake_counter = Counter(word for word in fake.split(" ") if word not in stop_words)

In [None]:
# fake[:5000]

In [None]:
# df_real = pd.DataFrame.from_dict(
#     real_counter,
#     orient="index",
#     columns=["count"]
# ).reset_index().sort_values("count", ascending=False).head(20)

# df_real

In [None]:
# df_fake = pd.DataFrame.from_dict(
#     fake_counter,
#     orient="index",
#     columns=["count"]
# ).reset_index().sort_values("count", ascending=False).head(20)

In [None]:
# alt.Chart(df_real, title="Frequency of top 10 words in Real News").mark_bar().encode(
#     x="index",
#     y="counts"
# )

In [None]:
# alt.Chart(df_fake, title="Frequency of top 10 words in Fake News").mark_bar().encode(
#     alt.X("index", sort="-y"),
#     alt.Y("count")
# )

# plot = sns.countplot(x="index", data = df_fake)
# plt.show()

### Preprocessing and Feature Engineering

#### Author Feature

In [None]:
# Create a feature "author_is_null"

# def author_is_null(x):
#     if x["author"] != x["author"]:
#         return 0
#     return 1

# train_df["author_is_null"] = train_df.apply(lambda x: author_is_null(x), axis=1)

In [None]:
# Change Author = Null to Author = "Unknown"

# unknown_authors_ids = train_df.query("author.isnull()")["id"]
# train_df['updated_author_column_name'] = np.where(~train_df['id'].isin(unknown_authors_ids), train_df['author'], 'Unknown')

In [None]:
# Others category if value_counts of an author is less than 5

# less_frequent = train_df['author'].value_counts()[train_df['author'].value_counts() <= 5].index.tolist()
# train_df['author'] = np.where(train_df['author'].isin(less_frequent), 'Other', train_df['author'])

In [None]:
# Create if "is_multiple_authors"

# def is_multiple_authors(data):

#     data["is_multiple_authors"] = [
#         1 if " and " in str(author) else 0 for author in data["author"]
#     ]

#     return data


# train_df = is_multiple_authors(train_df)

# train_df.query("is_multiple_authors == 1")["label"].value_counts()

In [None]:
# Check if author name contains a domain suffix

# def author_contains_domain(data):

#     data["author_contains_domain"] = [
#         1 if re.search(r"\.[a-zA-Z]{3}", str(author)) else 0 for author in train_df["author"]
#     ]

#     return data


# train_df = author_contains_domain(train_df)

# train_df.query("author_contains_domain == 1")["label"].value_counts()

#### Title Feature

In [None]:
# Check if title is null

# def is_title_null(data):

#     data["is_title_null"] = [
#         0 if title == title
#         else 1 for title in train_df["title"]
#     ]

#     return data


# train_df = is_title_null(train_df)

# train_df.query("is_title_null == 1")["label"].value_counts()

In [None]:
# Check if title ends with a famous journal name

# def title_contains_famous_journal(data):

#     data["title_contains_famous_journal"] = [
#         1 if
#         str(title).endswith("The New York Times") or
#         str(title).endswith("Breitbart")
#         else 0 for title in train_df["title"]
#     ]

#     return data


# train_df = title_contains_famous_journal(train_df)

# train_df.query("title_contains_famous_journal == 1")["label"].value_counts()

In [None]:
# def no_of_words(data):

#     data["no_of_words"] = [
#         len(str(title).split(" ")) for title in train_df["title"]
#     ]

#     return data


# train_df = no_of_words(train_df)

In [None]:
# alt.Chart(train_df).mark_bar().encode(
#     alt.X("no_of_words", bin=alt.Bin(maxbins=50)),
#     alt.Y("count()"),
#     color="label"
# )

In [None]:
# def no_of_chars(data):

#     data["no_of_chars"] = [
#         len(str(title)) for title in train_df["title"]
#     ]

#     return data


# train_df = no_of_chars(train_df)

In [None]:
# alt.Chart(train_df).mark_bar().encode(
#     alt.X("no_of_chars", bin=alt.Bin(maxbins=100)),
#     alt.Y("count()"),
#     color="label"
# )

In [None]:
# def get_text_length(text):
#     """
#     Returns the number of words in a text without punctuations. 
#     Counts clitics as separate words.

#     Parameters
#     ----------
#     text : str
#         A text from which we find the number of words

#     Returns
#     -------
#     An int which represents the number of words in the text
#     """
#     non_punc = []
#     for word in word_tokenize(str(text)):
#         if word not in string.punctuation:
#             non_punc.append(word)
#     return len(non_punc)

In [None]:
# train_df = train_df.assign(title_len=train_df["title"].apply(get_text_length))

In [None]:
# def get_lexical_density(text):
#     """
#     Returns the lexical density of a text. That is the ratio of open class words.
#     in the text

#     Parameters
#     ----------
#     text : str
#         A text from which we find the lexical density

#     Returns
#     -------
#     A float which represents the lexical density
#     """
#     open_class_prefix = {"N", "V", "J", "R"}
#     open_class_total = 0
#     word_count = 0
#     if len(str(text)) == 0:
#         return float(0)
#     for word, pos in pos_tag(word_tokenize(str(text))):
#         if word not in string.punctuation:
#             word_count += 1
#             if pos[0] in open_class_prefix:
#                 open_class_total += 1
#     return open_class_total/word_count

In [None]:
# train_df["title_lexical_density"] = train_df["title"].apply(get_lexical_density)

In [None]:
#train_df["text_lexical_density"] = train_df["text"].apply(get_lexical_density)

In [None]:
# def get_pos_count(text):
#     """
#     Counts the number of nouns, verbs and adjectives in a text.

#     Parameters
#     ----------
#     text : str
#         A text for which we find the number of nouns, verbs
#         and adjectives

#     Returns
#     -------
#     A tuple of (noun_count: int, verb_count: int, adj_count: int)
#     which represents the number of nouns, verbs adjectives in the text
#     respectively
#     """
#     noun_count = 0
#     verb_count = 0
#     adj_count = 0

#     if len(str(text)) == 0:
#         return 0, 0, 0

#     for word, pos in pos_tag(word_tokenize(str(text))):
#         if(pos[0] == 'N'):
#             noun_count += 1
#         if(pos[0] == 'V'):
#             verb_count += 1
#         if(pos == 'JJ'):
#             adj_count += 1
#     return noun_count
#     return verb_count
#     return adj_count

In [None]:
# train_df["count_pos_title"] = train_df["title"].apply(get_pos_count)

In [None]:
# train_df["count_noun_title"], train_df["count_verb_title"], train_df["count_adj_title"] = train_df["count_pos_title"].str[0],train_df["count_pos_title"].str[1],train_df["count_pos_title"].str[2] 

In [None]:
# def get_num_ovv_words(text):
#     """
#     Gets the number of out-of-vocabulary words in a text.

#     Parameters
#     ----------
#     text : str
#         A text for which we find the number of out-of-vocabulary
#         words is to be found

#     Returns
#     -------
#     The number of oov words in the text
#     """
#     text_vocab = set(w for w in text.split() if w.isalpha())
#     english_vocab = set(w for w in nltk.corpus.words.words())
#     ovv_words = text_vocab - english_vocab

#     return len(ovv_words)

In [None]:
# train_df["title_ovw"] = train_df["title"].apply(get_num_ovv_words)

In [None]:
# def contains_says(data):

#     data["contains_says"] = [
#         1 if
#         len(str(title).split(" ")) < 6

# #         re.search("[^a-zA-Z0-9 .,:'\"-\\$()]", str(title))

# #         re.search("[0-9]", str(title))

# #         "Says" in str(title) or "says" in str(title)

#         else 0 for title in train_df["title"]
#     ]

#     return data

In [None]:
# train_df = contains_says(train_df)

#### Text Feature

In [None]:
# # Check if text is empty

# def is_text_empty(data):

#     data["is_text_empty"] = [
#         1 if text == " " or
#         not text == text
#         else 0 for text in train_df["text"]
#     ]

#     return data
# train_df.query("text == ' '")["label"].value_counts()

In [None]:
#train_df = is_text_empty(train_df)

In [None]:
#train_df2.query("is_text_empty == 1")["label"].value_counts()

In [None]:
#train_df["text_len"] = train_df["text"].apply(get_text_length)

In [None]:
#train_df["text_ovw"] = train_df["text"].apply(get_num_ovv_words)

In [None]:
#train_df["text_pos_count"] = train_df["text"].apply(get_pos_count)
#train_df["count_noun_text"], train_df["count_verb_text"], train_df["count_adj_text"] = train_df["text_pos_count"].str[0],train_df["text_pos_count"].str[1],train_df["text_pos_count"].str[2] 

In [None]:
#train_df.to_csv("temp.csv")

#### Cosine Similarity between Title and Text

In [None]:
#cosine = Cosine(2)
#train_df["p0"] = train_df["title"].apply(lambda s: cosine.get_profile(s))
#train_df["p1"] = train_df["text"].apply(lambda s: cosine.get_profile(s))
# train_df["cosine_sim"] = [cosine.similarity_profiles(p0,p1) for p0, p1 in zip(train_df["p0"],train_df["p1"])]

# train_df.drop(["p0", "p1"], axis=1)

# jarowinkler = JaroWinkler()
# df["jarowinkler_sim"] = [jarowinkler.similarity(i,j) for i,j in zip(train_df["title"],train_df["text"])]

#score = cosine_similarity(train_df['title'], train_df['text'])

### Preprocessor

In [8]:
train_df.columns

Index(['id', 'title', 'author', 'text', 'label'], dtype='object')

In [9]:
train_df_small, val_df = train_test_split(train_df, test_size=0.2, random_state=123)

In [10]:
X_train, y_train = train_df_small.drop(columns=["label"]), train_df_small["label"]

In [37]:
X_train["text"] = X_train["text"].values.astype("U")

In [107]:
text_features1 = ["title"] 
text_features2 = ["text"]
pass_through = []
drop = ["id", "author"]

In [109]:
function_transformer = FunctionTransformer(
    np.reshape, kw_args={"newshape": -1}
)
enc1 = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    function_transformer,
    CountVectorizer(stop_words="english", max_features=1000)
)

enc2 = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    function_transformer,
    CountVectorizer(stop_words="english", max_features=1000)
)

In [110]:
#pipe = make_pipeline(enc)
preprocessor = make_column_transformer(
    (enc1, text_features1),
    (enc2, text_features2),
    ("drop", drop)
)

In [111]:
preprocessor.fit(X_train)

ColumnTransformer(transformers=[('pipeline-1',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('functiontransformer',
                                                  FunctionTransformer(func=<function reshape at 0x10fd91750>,
                                                                      kw_args={'newshape': -1})),
                                                 ('countvectorizer',
                                                  CountVectorizer(max_features=1000,
                                                                  stop_words='english'))]),
                                 ['title']),
                                ('pipeline-2',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                

In [114]:
new_cols = (
    preprocessor.named_transformers_["pipeline-1"].named_steps["countvectorizer"].get_feature_names_out().tolist() + 
    preprocessor.named_transformers_["pipeline-2"].named_steps["countvectorizer"].get_feature_names_out().tolist()
)

In [115]:
X_transfomed = preprocessor.transform(X_train)

In [116]:
pd.DataFrame(X_transfomed.toarray(), columns=new_cols)

Unnamed: 0,10,100,11,12,15,20,2016,2017,30,50,...,worth,written,wrong,wrote,year,years,yes,york,young,на
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3,0,4,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16635,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
16636,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
16637,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
16638,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,0,0,0


### Modelling

__Split into Training and Validation data__

#### Base models 

In [117]:
pipe_lr = make_pipeline(preprocessor, LogisticRegression(max_iter=100000))
#pipe_dt = make_pipeline(preprocessor, DecisionTreeClassifier())
pipe_nb = make_pipeline(
    preprocessor,
    FunctionTransformer(lambda x: x.todense(), accept_sparse=True),
    GaussianNB())
#pipe_svc = make_pipeline(preprocessor, SVC())
pipe_rf = make_pipeline(preprocessor, RandomForestClassifier())
pipe_catboost = make_pipeline(preprocessor, CatBoostClassifier(verbose=False))

In [118]:
models = {
    "Logistic Regression": pipe_lr,
    #"Decision Tree": pipe_dt,
    "NB": pipe_nb,
    #"SVC": pipe_svc,
    "Random Forest": pipe_rf,
    "Cat boost": pipe_catboost
}

In [119]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

In [120]:
results = {}

for name, value in models.items():
    print(f"Start {name}!")
    results[name] = mean_std_cross_val_scores(
        value, X_train, y_train, cv=10, return_train_score=True
    )
    print(f"Done {name}!")
    display(pd.DataFrame(results))



Start Logistic Regression!
Done Logistic Regression!


Unnamed: 0,Logistic Regression
fit_time,7.271 (+/- 0.306)
score_time,0.613 (+/- 0.024)
test_score,0.964 (+/- 0.007)
train_score,0.998 (+/- 0.000)


Start NB!




Done NB!


Unnamed: 0,Logistic Regression,NB
fit_time,7.271 (+/- 0.306),6.866 (+/- 0.146)
score_time,0.613 (+/- 0.024),0.656 (+/- 0.025)
test_score,0.964 (+/- 0.007),0.869 (+/- 0.008)
train_score,0.998 (+/- 0.000),0.889 (+/- 0.003)


Start Random Forest!
Done Random Forest!


Unnamed: 0,Logistic Regression,NB,Random Forest
fit_time,7.271 (+/- 0.306),6.866 (+/- 0.146),17.725 (+/- 0.089)
score_time,0.613 (+/- 0.024),0.656 (+/- 0.025),0.653 (+/- 0.018)
test_score,0.964 (+/- 0.007),0.869 (+/- 0.008),0.973 (+/- 0.005)
train_score,0.998 (+/- 0.000),0.889 (+/- 0.003),1.000 (+/- 0.000)


Start Cat boost!
Done Cat boost!


Unnamed: 0,Logistic Regression,NB,Random Forest,Cat boost
fit_time,7.271 (+/- 0.306),6.866 (+/- 0.146),17.725 (+/- 0.089),27.518 (+/- 0.807)
score_time,0.613 (+/- 0.024),0.656 (+/- 0.025),0.653 (+/- 0.018),0.632 (+/- 0.033)
test_score,0.964 (+/- 0.007),0.869 (+/- 0.008),0.973 (+/- 0.005),0.976 (+/- 0.004)
train_score,0.998 (+/- 0.000),0.889 (+/- 0.003),1.000 (+/- 0.000),0.990 (+/- 0.000)


In [121]:
pipe_lr.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('functiontransformer',
                                                                   FunctionTransformer(func=<function reshape at 0x10fd91750>,
                                                                                       kw_args={'newshape': -1})),
                                                                  ('countvectorizer',
                                                                   CountVectorizer(max_features=1000,
                                                                                   stop_words='english'))]),
                                                  ['title'])...
            

In [137]:
pipe_lr.named_steps["logisticregression"].coef_

array([[-0.32858657,  0.03769198, -0.22062586, ..., -0.28690671,
        -0.57266297,  0.72208062]])

In [139]:
data = {
    "Importance": pipe_lr.named_steps["logisticregression"].coef_.flatten(),
    "abs_Importance": np.abs(pipe_lr.named_steps["logisticregression"].coef_.flatten())
}

pd.DataFrame(data=data, index=new_cols).sort_values(
    by="abs_Importance", ascending=False
)[:30]

Unnamed: 0,Importance,abs_Importance
breitbart,-8.652039,8.652039
york,-6.238666,6.238666
000,-4.560829,4.560829
times,-3.522039,3.522039
follow,-2.30734,2.30734
comment,1.835264,1.835264
october,1.803078,1.803078
migrants,-1.69889,1.69889
islamic,-1.664931,1.664931
migrant,-1.654532,1.654532


#### HyperParam Tune best performing models

### Prediction and Results