# Preprocessing the Kaggle Data

## 1. Imports 

Steps:
1. Import "pandas" to read .csv files from the file system.
2. Import "TfidfVectorizer" to convert a collection of raw documents to a matrix of TF-IDF features.
3. Import "CountVectorizer" to convert a collection of text documents to a matrix of token counts.

In [2]:
import pandas as pd
import numpy as np

# Critical Imports
from sklearn.pipeline import Pipeline
from sklearn.pipeline import TransformerMixin 

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
import nltk


nltk.download('stopwords')
nltk.download("punkt")
import json

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nicla\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nicla\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## 2. Read file

Steps: 
1. Importing datasets ( https://www.kaggle.com/code/vpkprasanna/basic-text-cleaning-wordcloud-and-n-gram-analysis#Merging-true-and-fake-news-dataset )
2. Converting datasets
3. Combining datasets

In [None]:

# Load the fake-news and the true-news datasets from the working directory.
fake = pd.read_csv("Fake.csv")
true = pd.read_csv("True.csv")

# Extract the "text" column of each dataset as a NumPy array.
fake_texts = fake["text"].to_numpy()
true_texts = true["text"].to_numpy()


# Join both arrays into one: all fake texts first, followed by all true texts.
all_texts = np.concatenate((fake_texts, true_texts))
# Matching label array in the same order: 0.0 for every fake text, 1.0 for every true text.
labels = np.concatenate((np.zeros(len(fake_texts)), np.ones(len(true_texts))))

## 3. Preprocessing
Steps:
1. Converting Texts to lowercase 
2. Stopword Removal
3. Delete "Reuters"
4. Stemming
5. Pruning
6. Removing Twitter's '@' and dates (e.g. "Donald J. Trump (@realDonaldTrump) December 31, 2017Trump")
7. Store Preprocessing results


### Preprocessing Configuration
Here you can define the preprocessing configuration.

In [10]:
# Whether every text is converted to lowercase before the other steps run.
lowercase = True
# [old, new] pairs; each occurrence of `old` in a text is replaced by `new`.
replace = [
    ["\u201c", '"'],
]
# Extra words that are removed in addition to the NLTK English stopwords.
remove = [
    "Reuters",
    "@",
    "Donald",
    "J.",
    "Trump"
]
stemmer = None # None | "porter" | "snowball"

# Build the configuration record directly instead of first writing an empty
# JSON object to disk and reading it back. "lowercase" is deliberately kept as
# the string form of the flag so the stored file format does not change.
json_object = {
    "lowercase": str(lowercase),
    "stemmer": stemmer,
}
print(json_object)

# Persist the configuration (overwrites any existing config.json).
with open("config.json", "w") as outfile:
    outfile.write(json.dumps(json_object))

{'lowercase': 'True', 'stemmer': None}


### Text to lowercase transformer
This transformer can transform a text to lowercase

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class LowerCaseTransformer(TransformerMixin):
    """Pipeline step that lowercases every text of a collection."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a NumPy array holding the lowercase version of each text in X."""
        return np.array([text.lower() for text in X])

### Replace similar words transformer
This transformer replaces every occurrence of a given substring with another string (plain string replacement; regular expressions are not interpreted).

In [None]:
class WordReplacementTransformer(TransformerMixin):
    """Pipeline step that applies a list of substring replacements to every text.

    `replace` is a sequence of entries whose first element is the substring to
    look for and whose second element is the string to put in its place.
    """

    # Class-level fallback; __init__ always sets the instance attribute.
    replace = []

    def __init__(self, replace):
        self.replace = replace

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a NumPy array with all configured replacements applied to each text."""
        return np.array([self._substitute(text) for text in X])

    def _substitute(self, text):
        """Apply the configured replacements to one text, in list order."""
        for entry in self.replace:
            text = text.replace(entry[0], entry[1])
        return text

### Stopword Removal

In [None]:
from sklearn.base import TransformerMixin

class StopwordTransformer(TransformerMixin):
    """Pipeline step that removes stopwords from every text.

    A text is tokenized with NLTK's `word_tokenize`; every token whose
    lowercase form is in the stopword set is dropped, and the remaining tokens
    are joined back together, each one followed by a single space.
    """

    # Base set of NLTK English stopwords. It is shared by all instances and
    # therefore must never be modified in place.
    stop_words = set(stopwords.words('english'))

    def __init__(self, extra_words):
        """Create the transformer.

        extra_words: additional words to remove; they are stored in lowercase
        because tokens are compared by their lowercase form.
        """
        # Work on a per-instance copy. Adding to the class-level set would leak
        # the extra words into every other instance and accumulate them each
        # time a new transformer is created.
        self.stop_words = set(type(self).stop_words)
        self.stop_words.update(word.lower() for word in extra_words)

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a NumPy array with the stopwords removed from each text in X."""
        result = []
        for x in X:
            kept = [w for w in word_tokenize(x) if w.lower() not in self.stop_words]
            # Each kept token is followed by one space (trailing space included)
            # so the produced texts keep their established format.
            result.append("".join(word + " " for word in kept))
        return np.array(result)


### Stemming Transformer

In [None]:
class StemmingTransformer(TransformerMixin):
    """Pipeline step that reduces every token of every text to its stem."""

    def __init__(self, stemmer_name):
        """Select the stemmer: "porter" picks the Porter stemmer, any other
        value picks the English Snowball stemmer."""
        self.stemmer = (
            PorterStemmer() if stemmer_name == "porter" else SnowballStemmer("english")
        )

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a NumPy array in which each text is rebuilt from its stemmed tokens.

        Every stemmed token is followed by a single space, including the last one.
        """
        stemmed_texts = []
        for text in X:
            stems = (self.stemmer.stem(token) for token in word_tokenize(text))
            stemmed_texts.append("".join(stem + " " for stem in stems))
        return np.array(stemmed_texts)


### Run preprocessing

In [None]:
# Work on a copy of the combined texts.
test_texts = np.array(all_texts)

# Assemble the pipeline steps according to the configuration.
# Lowercasing is optional and, when enabled, runs first.
steps = [("to_lowercase", LowerCaseTransformer())] if lowercase else []

# Word replacement and stopword removal always run.
steps += [
    ("replace_words", WordReplacementTransformer(replace=replace)),
    ("remove_stopwords", StopwordTransformer(extra_words=remove)),
]

# Stemming is added only when one of the two known stemmers is configured.
if stemmer == "porter":
    steps.append(("use_porter_stemmer", StemmingTransformer("porter")))
elif stemmer == "snowball":
    steps.append(("use_snowball_stemmer", StemmingTransformer("snowball")))

pipe = Pipeline(steps=steps)

# Run every configured step over all texts.
transformed = pipe.fit_transform(test_texts)

### Store preprocessing results
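The preprocessed texts and the label array are stored as two separate NumPy files (`preprocessed_texts.npy` and `preprocessed_labels.npy`) whose entries are in the same order.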

In [None]:
# Write the preprocessed texts and their labels to disk as two NumPy files.
np.save(file="preprocessed_texts", arr=transformed)
np.save(file="preprocessed_labels", arr=labels)