In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import scipy.sparse
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pickle
from tqdm import tqdm
from sklearn import utils
import multiprocessing
from nltk.tokenize import word_tokenize
import json

## Read Inputs

In [3]:
# Load the document array and its label array (presumably written by an earlier
# preprocessing step -- confirm against the producing notebook).
# NOTE(review): allow_pickle=True unpickles arbitrary objects, so only load
# .npy files that come from a trusted source.
texts, labels = (
    np.load(array_file, allow_pickle=True)
    for array_file in ("preprocessed_texts.npy", "preprocessed_labels.npy")
)

## Transformation

Configuration for Transformation

In [6]:
# Transformation settings. Every value below is also written to config.json.
# TF-IDF | CountVectorization | BERT | DOC2VEC
method = "TF-IDF"
dimension = 200 # Doc2Vec vector dimension
epochs = 10 # Doc2Vec training epochs
min_df = 0.001 # TF-IDF min df
max_df = 0.9 # TF-IDF max df
use_idf = False # forwarded to TfidfVectorizer(use_idf=...); False turns IDF reweighting off

# Fail fast on a typo: the cells below compare `method` against these exact
# strings, so an unknown value would silently skip every branch and store nothing.
supported_methods = ("TF-IDF", "CountVectorization", "BERT", "DOC2VEC")
if method not in supported_methods:
    raise ValueError(
        f"Unknown transformation method {method!r}; expected one of {supported_methods}"
    )

# Merge the settings into the existing config.json so that keys written by
# other steps are preserved. A missing file is treated as an empty config
# instead of aborting the cell.
try:
    with open('config.json', 'r') as openfile:
        json_object = json.load(openfile)
except FileNotFoundError:
    json_object = {}

json_object.update(
    {
        "transformation_method": method,
        "doc2vec_dimension": dimension,
        "doc2vec_epochs": epochs,
        "min_df": min_df,
        "max_df": max_df,
        "use_idf": use_idf,
    }
)

with open("config.json", "w") as outfile:
    outfile.write(json.dumps(json_object))

#### Count Vectorization
We cannot use this vectorization method because the fake news texts differ in length from the true news texts, and raw term counts grow with document length.

In [5]:
if method == "CountVectorization":
    # Bag-of-words: learn the vocabulary from the corpus and count word-level
    # tokens per document in a single pass.
    vectorizer = CountVectorizer(analyzer="word")
    all_tokens = vectorizer.fit_transform(raw_documents=texts)


#### TF-IDF Vectorization

In [10]:
if method == "TF-IDF":
    # The document-frequency cut-offs and the IDF switch come from the
    # configuration cell (critical values selected from research papers).
    vectorizer = TfidfVectorizer(
        min_df=min_df,
        max_df=max_df,
        use_idf=use_idf,
    )
    all_tokens = vectorizer.fit_transform(texts)

    # Keep the vocabulary terms so they can be stored next to the matrix.
    feature_names = vectorizer.get_feature_names_out()
    print(f"{len(feature_names)} features after using vectorizer.")

13691 features after using vectorizer.


#### BERT

In [66]:
if method == "BERT":
    # TF-Hub handles. Within this cell the encoder handle is only reported;
    # just the matching preprocessing model is loaded and executed.
    tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
    tfhub_handle_encoder = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3"

    print(f'BERT model selected           : {tfhub_handle_encoder}')
    print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

    # If "OSError: SavedModel file does not exist" occurs, navigate to the
    # indicated folder and delete it.
    bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

    # The whole corpus is passed through the preprocessing model in one call.
    text_test = texts
    text_preprocessed = bert_preprocess_model(text_test)

    # Report the entries of the preprocessing output that are read below.
    print(f'Keys       : {list(text_preprocessed.keys())}')
    print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
    print(f'Word Ids   : {text_preprocessed["input_word_ids"]}')
    print(f'Input Mask : {text_preprocessed["input_mask"]}')
    print(f'Type Ids   : {text_preprocessed["input_type_ids"]}')

#### DOC2Vec
Convert each document into a fixed-length dense vector.

In [68]:
if method == "DOC2VEC":
    # Tag every tokenised document with its row index so its trained vector can
    # be looked up by that index afterwards. The loop variable is deliberately
    # not named `text`: that name is bound to the tensorflow_text module
    # imported at the top of the notebook.
    tagged_texts = [
        TaggedDocument(word_tokenize(document), [doc_index])
        for doc_index, document in enumerate(texts)
    ]
    cores = multiprocessing.cpu_count()

    # dm=0 selects the PV-DBOW algorithm; negative=5 with hs=0 trains with
    # negative sampling only; sample=0 disables frequent-word downsampling.
    # The epoch count is handed to the model so gensim manages the
    # learning-rate decay across all passes itself.
    model = Doc2Vec(
        dm=0,
        vector_size=dimension,
        negative=5,
        hs=0,
        min_count=2,
        sample=0,
        workers=cores,
        epochs=epochs,
    )
    model.build_vocab(tagged_texts)

    # Single train() call. Calling train(epochs=1) in a loop while subtracting a
    # fixed step from model.alpha by hand produces a broken learning-rate
    # schedule: the first pass decays all the way down to gensim's default
    # min_alpha, later passes run at a flat rate, and the rate becomes negative
    # once enough epochs are configured.
    model.train(
        utils.shuffle(tagged_texts),
        total_examples=model.corpus_count,
        epochs=model.epochs,
    )

    # Collect the document vectors in corpus order (tag == row index) and wrap
    # them in a CSR matrix, the container the storage cell writes with save_npz.
    vector_list = [model.dv[doc_index] for doc_index in range(len(tagged_texts))]
    all_tokens = scipy.sparse.csr_matrix(np.asarray(vector_list))

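The Doc2Vec vectors end up in `all_tokens` as a SciPy CSR matrix, which is the format the storage step below writes with `save_npz`.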


# Store Results

In [11]:
# Persist the transformed corpus for the selected method, plus the labels.

# Methods that produce `all_tokens` are written as a SciPy .npz file.
# CountVectorization is included so that choosing it does not leave stale
# files from a previous run next to a config that names a different method.
if method in ("TF-IDF", "DOC2VEC", "CountVectorization"):
    scipy.sparse.save_npz("training_input", all_tokens)

# The vocabulary terms are only collected by the TF-IDF cell.
if method == "TF-IDF":
    np.save("feature_names", feature_names)

if method == "BERT":
    # `with` guarantees the file is closed even if pickling raises.
    with open("bert_preprocessed.pkl", "wb") as a_file:
        pickle.dump(text_preprocessed, a_file)

# Labels are stored once, for every method that stored an input above.
if method in ("TF-IDF", "DOC2VEC", "CountVectorization", "BERT"):
    np.save("training_labels", labels)