In [20]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pickle

## Read Inputs

In [10]:
texts = np.load("preprocessed_texts.npy", allow_pickle = True)
labels = np.load("preprocessed_labels.npy", allow_pickle = True)

## Transformation

Configuration for Transformation

In [4]:
# TF-IDF | CountVectorization | BERT | DOC2VEC
method = "BERT"

Count Vectorization

In [4]:
if method == "CountVectorization":
    vectorizer = CountVectorizer(analyzer="word")
    all_tokens = vectorizer.fit_transform(texts)


TF-IDF Vectorization

In [5]:
if method == "TF-IDF":
    vectorizer = TfidfVectorizer(min_df=0.0001, max_df=0.9, use_idf=False) #critical values selected from research papers 
    all_tokens = vectorizer.fit_transform(texts)


In [6]:
# TODO Explanation

features = pd.DataFrame(all_tokens.toarray(), columns=vectorizer.get_feature_names_out())
pd.set_option('display.max_columns', 50)
display(features.head())
print(features.keys())

Unnamed: 0,00,000,0000,000th,001,002,003,005,005380,006,00684,007,008,00am,00pm,01,010,0100,011,014,015,016,019,02,020,...,zloti,zodiac,zoe,zoellick,zoido,zoltan,zombi,zon,zone,zones,zoo,zoom,zoomph,zor,zoran,zour,zucker,zuckerberg,zuckerman,zulia,zulu,zuma,zurich,zweli,zynga
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Index(['00', '000', '0000', '000th', '001', '002', '003', '005', '005380',
       '006',
       ...
       'zour', 'zucker', 'zuckerberg', 'zuckerman', 'zulia', 'zulu', 'zuma',
       'zurich', 'zweli', 'zynga'],
      dtype='object', length=29521)


In [7]:
# Precautionary step: Check whether the custom stopwords"R/reuters" have been removed from the text
for colname in features.columns:
    if (colname == "reuters"):
        print("Reuters has been successfully removed")
    if (colname == "Reuters"):
        print("reuters has been successfully removed")

Reuters has been successfully removed


Look at common words

In [8]:
len(vectorizer.get_feature_names_out())

29521

BERT

In [16]:
if method == "BERT":
    tfhub_handle_encoder = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3"
    tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

    print(f'BERT model selected           : {tfhub_handle_encoder}')
    print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

    
    bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
    #if "OSError: SavedModel file does not exist" occurs, navigate to the indicated folder and delete it
    
    text_test = texts
    text_preprocessed = bert_preprocess_model(text_test)

    #print(text_test)
    print(f'Keys       : {list(text_preprocessed.keys())}')
    print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
    print(f'Word Ids   : {text_preprocessed["input_word_ids"]}')
    print(f'Input Mask : {text_preprocessed["input_mask"]}')
    print(f'Type Ids   : {text_preprocessed["input_type_ids"]}')

BERT model selected           : https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3
['wish american happi new year leav . instead , give shout enemi , hater dishonest fake news media . former realiti show star one job . countri rapidli grow stronger smarter , want wish friend , support , enemi , hater , even dishonest fake news media , happi healthi new year , presid angri pant tweet . 2018 great year america ! countri rapidli grow stronger smarter , want wish friend , support , enemi , hater , even dishonest fake news media , happi healthi new year . 2018 great year america ! ( realdonaldtrump ) decemb 31 , 2017trump tweet went welll expect.what kind presid send new year greet like despic , petti , infantil gibberish ? ! lack decenc even allow rise gutter long enough wish american citizen happi new year ! bishop talbert swan ( talbertswan ) decemb 31 , 2017no one like calvin ( calvinst

DOC2Vec

In [10]:
if method == "DOC2VEC":
    # TODO
    print("sdf")

# Store Results

In [21]:
if method == "TF-IDF":
    scipy.sparse.save_npz("training_input", all_tokens)
    np.save("training_labels", labels)

if method == "BERT":
    a_file = open("bert_preprocessed.pkl", "wb")
    pickle.dump(text_preprocessed, a_file)
    a_file.close()
    np.save("training_labels", labels)