# Fake News Classifier

### Data Reading and Interpretation

__Import Statements__

In [2]:
import pandas as pd
import numpy as np

import re
import string
import altair as alt

import nltk

# nltk.download('words')
# nltk.download("cmudict")

from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk.corpus import cmudict

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

# Select Altair's 'data_server' data transformer and 'mimetype' renderer
# for the charts drawn in this notebook.
alt.data_transformers.enable('data_server')
alt.renderers.enable('mimetype')

RendererRegistry.enable('mimetype')

In [None]:
# Load the raw training set; the path is relative to the current working directory.
train_df = pd.read_csv("../data/raw/train.csv")

In [None]:
# Show column names, dtypes and non-null counts to spot missing values.
train_df.info()

In [None]:
# Summary statistics for every column, including the non-numeric ones.
train_df.describe(include="all")

### Preprocessing

#### Author Feature

In [None]:
# Create a feature "author_is_null"

def author_is_null(x):
    """
    Flags whether a row has a missing author.

    Parameters
    ----------
    x : pandas.Series or dict-like
        A row exposing an "author" entry

    Returns
    -------
    An int: 1 if the author is missing (NaN or None), 0 otherwise
    """
    # pd.isna covers both NaN (what read_csv produces) and None.
    return int(pd.isna(x["author"]))

# Compute the flag row by row; this must run before missing authors are
# overwritten with a placeholder, otherwise no row would be flagged.
train_df["author_is_null"] = train_df.apply(author_is_null, axis=1)

In [None]:
# Change Author = Null to Author = "Unknown"

# Keep the ids of the rows that had no author, in case later cells need them.
unknown_authors_ids = train_df.loc[train_df["author"].isna(), "id"]
# Overwrite the "author" column itself so later cells see "Unknown" instead of NaN.
train_df["author"] = train_df["author"].fillna("Unknown")

In [None]:
# Collapse every author that appears 5 times or fewer into a single "Other" category

author_counts = train_df["author"].value_counts()
less_frequent = author_counts[author_counts <= 5].index.tolist()
is_rare_author = train_df["author"].isin(less_frequent)
train_df["author"] = np.where(is_rare_author, "Other", train_df["author"])

In [None]:
# Create the "is_multiple_authors" feature

def is_multiple_authors(data):
    """
    Adds a 0/1 column "is_multiple_authors" to a dataframe, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        A dataframe with an "author" column

    Returns
    -------
    The same dataframe, where "is_multiple_authors" is 1 when the
    stringified author contains " and ", and 0 otherwise
    """
    flags = []
    for author in data["author"]:
        # str() lets missing authors (NaN/None) pass through as a plain 0.
        flags.append(int(" and " in str(author)))

    data["is_multiple_authors"] = flags
    return data


# Add the "is_multiple_authors" flag column to train_df.
train_df = is_multiple_authors(train_df)

# Count rows per label among the rows flagged as having multiple authors.
train_df.query("is_multiple_authors == 1")["label"].value_counts()

In [None]:
# Check if author name contains a domain suffix

def author_contains_domain(data):
    """
    Adds a 0/1 column "author_contains_domain" to a dataframe, in place.

    An author is flagged when its stringified value contains a dot followed
    by at least three ASCII letters (e.g. ".com").

    Parameters
    ----------
    data : pandas.DataFrame
        A dataframe with an "author" column

    Returns
    -------
    The same dataframe with the "author_contains_domain" column added
    """
    domain_pattern = re.compile(r"\.[a-zA-Z]{3}")

    # Read from the dataframe that was passed in, not from a notebook global,
    # so the function also works on other dataframes.
    data["author_contains_domain"] = [
        1 if domain_pattern.search(str(author)) else 0
        for author in data["author"]
    ]

    return data


# Add the "author_contains_domain" flag column to train_df.
train_df = author_contains_domain(train_df)

# Count rows per label among the rows whose author looks like a domain.
train_df.query("author_contains_domain == 1")["label"].value_counts()

#### Title Feature

In [None]:
# Check if title is null

def is_title_null(data):
    """
    Adds a 0/1 column "is_title_null" to a dataframe, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        A dataframe with a "title" column

    Returns
    -------
    The same dataframe, where "is_title_null" is 1 for missing titles
    (NaN or None) and 0 otherwise
    """
    # Use the dataframe that was passed in rather than a notebook global.
    data["is_title_null"] = data["title"].isna().astype(int)

    return data


# Add the "is_title_null" flag column to train_df.
train_df = is_title_null(train_df)

# Count rows per label among the rows that have no title.
train_df.query("is_title_null == 1")["label"].value_counts()

In [None]:
# Check if title ends with a famous journal name

def title_contains_famous_journal(data):
    """
    Adds a 0/1 column "title_contains_famous_journal" to a dataframe, in place.

    A title is flagged when it ends with "The New York Times" or "Breitbart".

    Parameters
    ----------
    data : pandas.DataFrame
        A dataframe with a "title" column

    Returns
    -------
    The same dataframe with the "title_contains_famous_journal" column added
    """
    famous_suffixes = ("The New York Times", "Breitbart")

    # Read from the dataframe that was passed in, not from a notebook global.
    # str() turns missing titles into "nan", which never matches a suffix.
    data["title_contains_famous_journal"] = [
        int(str(title).endswith(famous_suffixes)) for title in data["title"]
    ]

    return data


# Add the "title_contains_famous_journal" flag column to train_df.
train_df = title_contains_famous_journal(train_df)

# Count rows per label among the rows whose title ends with a listed journal name.
train_df.query("title_contains_famous_journal == 1")["label"].value_counts()

In [None]:
def no_of_words(data):
    """
    Adds a "no_of_words" column holding the title length in words, in place.

    Words are the pieces obtained by splitting the stringified title on a
    single space, so a missing title (stringified as "nan") counts as 1.

    Parameters
    ----------
    data : pandas.DataFrame
        A dataframe with a "title" column

    Returns
    -------
    The same dataframe with the "no_of_words" column added
    """
    # Read from the dataframe that was passed in, not from a notebook global.
    data["no_of_words"] = [
        len(str(title).split(" ")) for title in data["title"]
    ]

    return data


# Add the "no_of_words" column (title length in words) to train_df.
train_df = no_of_words(train_df)

In [None]:
# alt.Chart(train_df).mark_bar().encode(
#     alt.X("no_of_words", bin=alt.Bin(maxbins=50)),
#     alt.Y("count()"),
#     color="label"
# )

In [None]:
def no_of_chars(data):
    """
    Adds a "no_of_chars" column holding the title length in characters, in place.

    The length is taken on the stringified title, so a missing title
    (stringified as "nan") counts as 3.

    Parameters
    ----------
    data : pandas.DataFrame
        A dataframe with a "title" column

    Returns
    -------
    The same dataframe with the "no_of_chars" column added
    """
    # Read from the dataframe that was passed in, not from a notebook global.
    data["no_of_chars"] = [len(str(title)) for title in data["title"]]

    return data


# Add the "no_of_chars" column (title length in characters) to train_df.
train_df = no_of_chars(train_df)

In [None]:
# alt.Chart(train_df2).mark_bar().encode(
#     alt.X("no_of_chars", bin=alt.Bin(maxbins=100)),
#     alt.Y("count()"),
#     color="label"
# )

In [None]:
def get_text_length(text):
    """
    Returns the number of words in a text, ignoring punctuation tokens.
    Counts clitics as separate words.

    Parameters
    ----------
    text : str
        A text from which we find the number of words

    Returns
    -------
    An int which represents the number of words in the text
    """
    punctuation_chars = set(string.punctuation)

    # A token is punctuation when every character in it is a punctuation
    # mark. Testing `token in string.punctuation` is a substring check and
    # lets multi-character tokens such as "...", "``" or "''" count as words.
    return sum(
        1
        for token in word_tokenize(text)
        if not all(char in punctuation_chars for char in token)
    )

In [None]:
def get_lexical_density(text):
    """
    Returns the lexical density of a text, i.e. the ratio of open class
    words (nouns, verbs, adjectives, adverbs) to all words in the text.

    Parameters
    ----------
    text : str
        A text from which we find the lexical density

    Returns
    -------
    A float which represents the lexical density; 0.0 when the text
    contains no words (empty, whitespace-only or punctuation-only)
    """
    if not text:
        return 0.0

    open_class_prefix = {"N", "V", "J", "R"}
    punctuation_chars = set(string.punctuation)
    open_class_total = 0
    word_count = 0

    for token, pos in pos_tag(word_tokenize(text)):
        # Skip tokens made only of punctuation marks, including
        # multi-character ones such as "..." or "``".
        if all(char in punctuation_chars for char in token):
            continue
        word_count += 1
        if pos[0] in open_class_prefix:
            open_class_total += 1

    # Texts such as " " or "!!!" are non-empty but contain no words;
    # return 0.0 instead of dividing by zero.
    if word_count == 0:
        return 0.0
    return open_class_total / word_count

In [None]:
def get_pos_count(text):
    """
    Counts the number of nouns, verbs and adjectives in a text.

    Parameters
    ----------
    text : str
        A text for which we find the number of nouns, verbs
        and adjectives

    Returns
    -------
    A tuple of (noun_count: int, verb_count: int, adj_count: int)
    which represents the number of nouns, verbs and adjectives in the
    text respectively
    """
    noun_count = 0
    verb_count = 0
    adj_count = 0

    if not text:
        return 0, 0, 0

    for _, pos in pos_tag(word_tokenize(text)):
        # Match on the tag prefix so every sub-tag is counted: comparative
        # and superlative adjectives (JJR, JJS) as well as plain JJ, in the
        # same way nouns (NN*) and verbs (VB*) are matched.
        if pos.startswith("N"):
            noun_count += 1
        elif pos.startswith("V"):
            verb_count += 1
        elif pos.startswith("J"):
            adj_count += 1
    return noun_count, verb_count, adj_count

In [None]:
_ENGLISH_VOCAB = None


def _english_vocab():
    """Returns the lower-cased NLTK "words" corpus as a set, built on first use."""
    global _ENGLISH_VOCAB
    if _ENGLISH_VOCAB is None:
        _ENGLISH_VOCAB = frozenset(w.lower() for w in nltk.corpus.words.words())
    return _ENGLISH_VOCAB


def get_num_ovv_words(text):
    """
    Gets the number of out-of-vocabulary words in a text.

    Only whitespace-separated tokens made entirely of letters are
    considered; each distinct lower-cased word is counted once.

    Parameters
    ----------
    text : str
        A text for which the number of out-of-vocabulary words
        is to be found

    Returns
    -------
    The number of distinct oov words in the text
    """
    text_vocab = {w.lower() for w in text.split() if w.isalpha()}

    # The reference vocabulary is cached, so the corpus is not re-read and
    # re-lowercased for every text.
    return len(text_vocab - _english_vocab())

In [None]:
def get_punctuations_count(text):
    """
    Returns the number of punctuation tokens in a text.

    Parameters
    ----------
    text : str
        A text for which we find the number of punctuations present

    Returns
    -------
    punct_count: int
                 An integer which represents the number of punctuations in the text
    """
    if not text:
        return 0

    punctuation_chars = set(string.punctuation)

    # A token is punctuation when every character in it is a punctuation
    # mark. Testing `token in string.punctuation` is a substring check and
    # misses multi-character tokens such as "...", "``" or "''".
    return sum(
        1
        for token in word_tokenize(text)
        if all(char in punctuation_chars for char in token)
    )

In [None]:
def contains_says(data):
    """
    Adds a 0/1 column "contains_says" to a dataframe, in place.

    Despite its name, the flag currently marks titles that have fewer than
    six pieces when the stringified title is split on a single space.

    Parameters
    ----------
    data : pandas.DataFrame
        A dataframe with a "title" column

    Returns
    -------
    The same dataframe with the "contains_says" column added
    """
    # Alternative criteria that were tried for this flag:
    #   re.search("[^a-zA-Z0-9 .,:'\"-\\$()]", str(title))
    #   re.search("[0-9]", str(title))
    #   "Says" in str(title) or "says" in str(title)

    # Read from the dataframe that was passed in, not from a notebook global.
    data["contains_says"] = [
        1 if len(str(title).split(" ")) < 6 else 0 for title in data["title"]
    ]

    return data

In [None]:
# contains_says adds the column in place and returns its argument, so
# train_df2 refers to the same dataframe object as train_df.
train_df2 = contains_says(train_df)

In [None]:
# Count rows per label among the rows where the "contains_says" flag is set.
train_df2.query("contains_says == 1")["label"].value_counts()

#### Text Feature

In [None]:
# Check if text is empty

def is_text_empty(data):
    """
    Adds a 0/1 column "is_text_empty" to a dataframe, in place.

    A text is considered empty when it is missing (NaN or None), an empty
    string, or made only of whitespace.

    Parameters
    ----------
    data : pandas.DataFrame
        A dataframe with a "text" column

    Returns
    -------
    The same dataframe with the "is_text_empty" column added
    """
    # Read from the dataframe that was passed in, not from a notebook global.
    data["is_text_empty"] = [
        1 if pd.isna(text) or not str(text).strip() else 0
        for text in data["text"]
    ]

    return data
# train_df.query("text == ' '")["label"].value_counts()

In [None]:
# is_text_empty adds the column in place and returns its argument, so
# train_df2 refers to the same dataframe object as train_df.
train_df2 = is_text_empty(train_df)

In [None]:
# Count rows per label among the rows flagged as having empty text.
train_df2.query("is_text_empty == 1")["label"].value_counts()

#### Feature Engineering

### Modelling

### Prediction and Results