# Preprocessing the Kaggle Data

## 1. Imports 

Steps:
1. Import "pandas" to read the .csv files from the file system.
2. Import "TfidfVectorizer" to convert a collection of raw documents to a matrix of TF-IDF features.
3. Import "CountVectorizer" to convert a collection of text documents to a matrix of token counts.

In [3]:
import pandas as pd
import numpy as np

# Critical Imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import re, string

## 2. Read file

Steps: 
1. Importing datasets ( https://www.kaggle.com/code/vpkprasanna/basic-text-cleaning-wordcloud-and-n-gram-analysis#Merging-true-and-fake-news-dataset )
2. Converting datasets
3. Combining datasets

In [7]:

# Load the fake-news and the true-news articles from the CSV files in the working directory
fake = pd.read_csv("Fake.csv")
true = pd.read_csv("True.csv")

# Pull the "text" column of each dataset out as a NumPy array
fake_texts = fake["text"].to_numpy()
true_texts = true["text"].to_numpy()

# Join both arrays into one: all fake texts first, followed by all true texts
all_texts = np.concatenate((fake_texts, true_texts))

# Matching label vector in the same order: 0.0 for each fake text, 1.0 for each true text
labels = np.concatenate((np.zeros(len(fake_texts)), np.ones(len(true_texts))))

[0. 0. 0. ... 1. 1. 1.]


## 3. Preprocessing
Steps:
1. Converting Texts to lowercase 
2. Stopword Removal
3. Delete "Reuters"
4. Stemming
5. Pruning
6. Removing Twitter's '@' and dates (e.g. "Donald J. Trump (@realDonaldTrump) December 31, 2017Trump")
7. Store Preprocessing results


### Converting Text to Lowercase

In [9]:
# NOTE(review): this flag is not read by any of the visible cells - confirm whether it is still needed
lowercase = False

# Walk over every text and write its lowercase version back into the same slot
for position, article in enumerate(all_texts):
    all_texts[position] = article.lower()

### Stopword Removal & Delete "Reuters" / "reuters" & Stemming

In [11]:
# Import NLTK together with the Porter stemmer and the stopword corpus reader
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
# Download the stopword corpus that stopwords.words('english') reads from
nltk.download('stopwords')

# Token pattern: a token is a run of two or more word characters between word
# boundaries, so punctuation and single-character tokens are dropped.
token_pattern = re.compile(r"(?u)\b\w\w+\b")

# Shared resources for tokenize(). They are created on the first call and reused
# afterwards, so the stopword corpus is not re-read for every single document.
_stopword_set = None
_stemmer = None


def _tokenizer_resources():
    """Return the shared (stopword set, stemmer) pair, building it on first use."""
    global _stopword_set, _stemmer
    if _stopword_set is None or _stemmer is None:
        # English stopwords plus the custom words that should never become features
        loaded_stopwords = set(stopwords.words('english'))
        loaded_stopwords.add("Reuters")
        loaded_stopwords.add("reuters")
        _stemmer = PorterStemmer()
        _stopword_set = frozenset(loaded_stopwords)
    return _stopword_set, _stemmer


def tokenize(text):
    """Split a text into tokens, drop the stopwords and stem what is left.

    Parameters:
        text: the string to tokenize.

    Returns:
        A list with the Porter stem of every token that matches
        ``token_pattern`` and is not in the stopword set, in order of
        appearance. The stopword check is done on the token as written
        (before stemming) and is case-sensitive.
    """
    my_stopwords, stemmer = _tokenizer_resources()

    # Find all tokens in the text, skip the stopwords and keep the stem of the rest
    return [
        stemmer.stem(item)
        for item in token_pattern.findall(text)
        if item not in my_stopwords
    ]



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yanni\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [12]:
# avoid error: rerun first cell with imports before running this cell

# Vectorize the stems of all the words into a TF-IDF matrix.
# min_df / max_df are given as document proportions and prune the vocabulary
# (critical values selected from research papers).
stem_vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    # The built-in token pattern is not used once a custom tokenizer is given;
    # None states that explicitly and avoids scikit-learn's unused-parameter warning.
    token_pattern=None,
    min_df=0.001,
    max_df=0.9,
)
matrix = stem_vectorizer.fit_transform(all_texts)

In [13]:
# Put the TF-IDF matrix into a DataFrame whose columns are named after the
# vocabulary terms of the vectorizer, then preview the first rows and the column index.
# NOTE(review): toarray() materialises the whole matrix densely - this can need a lot of memory.
features = pd.DataFrame(
    matrix.toarray(),
    columns=stem_vectorizer.get_feature_names_out(),
)

# Show at most 50 columns when a DataFrame is rendered
pd.set_option('display.max_columns', 50)

display(features.head())
print(features.keys())

Unnamed: 0,00,000,01,038,08,09,10,100,1000,100th,101,109,10th,11,110,111,112,115,116,118,11th,12,120,122,125,...,yuan,yugoslavia,yve,zach,zakharova,zanu,zarif,zarrab,zealand,zeid,zeitung,zero,zika,zimbabw,zimbabwean,zimmerman,zink,zionist,zip,zipper,zone,zor,zuckerberg,zuma,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Index(['00', '000', '01', '038', '08', '09', '10', '100', '1000', '100th',
       ...
       'zimmerman', 'zink', 'zionist', 'zip', 'zipper', 'zone', 'zor',
       'zuckerberg', 'zuma', 'zurich'],
      dtype='object', length=9144)


In [14]:
# Precautionary step: check whether the custom stopwords "Reuters"/"reuters" have
# been removed. A stopword that still shows up as a feature column was NOT removed,
# so success is reported only when the column is absent.
for stopword in ("reuters", "Reuters"):
    if stopword in features.columns:
        print(f'"{stopword}" is still present in the features')
    else:
        print(f'"{stopword}" has been successfully removed')

## Store preprocessing results
The preprocessed matrix and the label array are each saved to their own file (`preprocessed_matrix.npz` and `preprocessed_labels.npy`).

In [15]:
# Persist the preprocessing results: the sparse matrix via SciPy, the labels via NumPy
import scipy.sparse

scipy.sparse.save_npz(file="preprocessed_matrix", matrix=matrix)
np.save(file="preprocessed_labels", arr=labels)

In [16]:
#np.save("preprocessed_texts", all_texts)
#np.save("preprocessed_labels", labels)