# Preprocessing Workflow :

- Removing References & Reference Numbers
- Removing Page Numbers
- Removing Stop words
- Removing Punctuation
- Removing URLs
- Lowercasing
- Tokenisation
- Lemmatisation


In [None]:
import re

# helper for reading a whole text file
def read(src, encoding=None):
    """
    Return the full contents of the text file at `src`.

    Parameters:
    src (str): Path of the file to read.
    encoding (str | None): Text encoding passed to `open`. None (default)
        keeps the platform default encoding.

    Returns:
    str: The decoded file contents. Bytes that cannot be decoded are
         silently dropped (errors='ignore').
    """
    # `with` closes the handle deterministically instead of leaking it
    with open(src, "r", encoding=encoding, errors="ignore") as handle:
        return handle.read()

# test-reading a paper: load one file of the dataset and print it in full
# (hard-coded local path; `text` is reused by the cells below)
text = read("/Users/tayssirboukrouba/Downloads/dataset/text/0001001v1.txt")
print(text)

# Testing RegEx Preprocessing Techniques :

### Removing in-text references :

In [None]:
# delete citation markers: a 1-2 digit number wrapped in [] or (), e.g. "[12]" or "(3)"
# (any other short parenthesised number, such as "(1)", is removed as well)
x = re.sub(r"\[\d{1,2}\]|\(\d{1,2}\)","", text)
print(x)

### Removing Page numberings :

In [None]:
# three alternatives, evaluated with ^ anchored at every line start (re.MULTILINE):
#   \s\d{1,}\n     - a whitespace character, then digits, then the end of the line
#                    (covers a digits-only line together with the newline before it)
#   ^[a-zA-Z]\n    - a line made of a single letter
#   ^[0-9]{1,2}\n  - a line made of one or two digits
x = re.sub(r"\s\d{1,}\n|^[a-zA-Z]\n|^[0-9]{1,2}\n","", text,flags=re.MULTILINE)
print(x)

### Removing URLs :

In [None]:
# drop URLs: any run of non-space characters starting with "http" or "www."
# (the dot is escaped so it only matches a literal "." and not any character)
x = re.sub(r'http\S+|www\.\S+','',text)
print(x)

### Removing References :

In [None]:
# Keep only what precedes the first case-insensitive occurrence of one of the
# heading keywords; with DOTALL the trailing (.*) swallows the rest of the text,
# so element 0 of the split is everything before the keyword.
heading_regex = r'(?i)(References|Bibliography|Works Cited)(.*)'
pattern = re.compile(heading_regex, re.DOTALL)
x = pattern.split(text)[0]
print(x)

### Creating `regex_preprocess()` function :

In [None]:
def regex_preprocess(text):
    """
    Clean raw paper text with a sequence of regular-expression passes.

    Steps, applied in this order:
    1. Removes page numberings and single-lettered lines.
    2. Removes in-text references: one- or two-digit numbers enclosed in
       square or round brackets, e.g. "[12]" or "(3)".
    3. Removes the references / bibliography / works cited heading
       (matched case-insensitively at the end of a line) and everything
       after it.
    4. Removes URLs (tokens starting with "http" or "www.").

    Punctuation is not modified by this function.

    Parameters:
    text (str): The input text to be preprocessed.

    Returns:
    str: The preprocessed text.
    """
    # getting rid of page numberings + one-lettered lines
    without_pages = re.sub(
        r"\s\d{1,}\n|^[a-zA-Z]\n|^[0-9]{1,2}\n", "", text, flags=re.MULTILINE
    )

    # getting rid of in-text references; both bracket styles are escaped so
    # that "(12)" is matched as well as "[12]"
    without_citations = re.sub(r"\[\d{1,2}\]|\(\d{1,2}\)", "", without_pages)

    # getting rid of the references heading and everything afterwards:
    # splitting once on the heading and keeping the first part does exactly that
    heading = re.compile(r"(?:References|Bibliography|Works Cited)\n", re.IGNORECASE)
    body = heading.split(without_citations, maxsplit=1)[0]

    # getting rid of URLs (the dot after "www" is escaped to match a literal ".")
    return re.sub(r"http\S+|www\.\S+", "", body)

In [None]:
# testing it on a paper: `text` is the sample paper loaded in an earlier cell
clean_text = regex_preprocess(text)
print(clean_text)

# Stop-words & Tokenisation Preprocessing :

In [None]:
pip install nltk

In [None]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize , sent_tokenize

# stop-word lists used by stopwords.words('english')
nltk.download('stopwords')
# tokenizer data used by sent_tokenize / word_tokenize
nltk.download('punkt')
# newer NLTK releases (3.9+) load the tokenizer data from 'punkt_tab' instead of
# 'punkt'; fetching both keeps the notebook working on old and new versions
nltk.download('punkt_tab')

## Testing on sample text

In [None]:
# English stop words, kept in a set for fast membership tests
stop_words = set(stopwords.words('english'))
# translation table that deletes every character of string.punctuation
punctuation_table = str.maketrans('', '', string.punctuation)
# split into sentences first, then lowercase and word-tokenize each sentence
sentences = sent_tokenize(text)
tokenized_sentences = list(map(word_tokenize, map(str.lower, sentences)))
# checking first tokenized sentence
tokenized_sentences[0]

In [None]:
# rebuild every sentence from its non-stop-word tokens, with punctuation
# characters stripped out of each kept token
joint_sentences = [
    ' '.join(
        token.translate(punctuation_table)
        for token in tokens
        if token.lower() not in stop_words
    )
    for tokens in tokenized_sentences
]

# compare one sentence (index 42) before and after filtering
print("showing random originql sentence : \n " , ' '.join(tokenized_sentences[42]))
print("showing random filtered sentence : \n " , joint_sentences[42])

In [None]:
# one filtered sentence per line; preview a 1000-character slice of the result
processed_text = '\n'.join(joint_sentences)
print(processed_text[1000:2000])

## Creating `remove_stop_words()` function :

In [None]:
def remove_stop_words(text):
    """
    Lowercase the input text and drop English stop words, sentence by sentence.

    Parameters:
    text (str): Input text containing sentences to be processed.

    Returns:
    str: The remaining tokens of each sentence joined by single spaces,
         with one sentence per line.

    Steps:
    1. Splits the text into sentences with NLTK's sent_tokenize.
    2. Lowercases each sentence and splits it into words with NLTK's word_tokenize.
    3. Drops every token found in NLTK's stopwords.words('english').
    4. Joins the kept tokens of a sentence with spaces.
    5. Joins the sentences with newline characters.
    """
    # English stop words as a set for fast membership tests
    english_stop_words = set(stopwords.words('english'))

    def filter_sentence(sentence):
        # word tokens of the lowercased sentence
        words = word_tokenize(sentence.lower())
        # keep only the tokens that are not stop words
        kept = [word for word in words if word.lower() not in english_stop_words]
        return ' '.join(kept)

    # one cleaned sentence per line
    return '\n'.join(map(filter_sentence, sent_tokenize(text)))

In [None]:
# display the docstring of the function defined above
help(remove_stop_words)

In [None]:
# reload the raw sample paper, then run stop-word removal on it and print the result
text = read("/Users/tayssirboukrouba/Downloads/dataset/text/0001001v1.txt")
processed_text = remove_stop_words(text)
print(processed_text)

## Removing Custom Punctuation :

In [None]:
# getting rid of punctuation (except mathematical operations)
def remove_punct(text):
    """
    Remove sentence punctuation from `text`.

    The characters . , : ? ; " ' are deleted. Digits, mathematical operators
    (+, -, *, /) and brackets are kept.

    Parameters:
    text (str): The input text.

    Returns:
    str: The text without the punctuation characters listed above.
    """
    # NOTE: no "-" between "," and ":" here. Inside a character class ",-:" is a
    # range that also covers "-", ".", "/" and every digit 0-9.
    return re.sub(r'[.,:?;"\']+', "", text)
# apply it to the sample paper (this overwrites `text`) and preview a slice
text = remove_punct(text)
print(text[1000:2000])

# Stemming & Lemmatization :

## Testing on sample text :

In [None]:
# TODO: don't forget to add these packages to the requirements.txt file
!pip install spacy

In [None]:
import spacy
# load the small English pipeline once; the test cells below reuse `nlp`
# (the model package must be installed, e.g. `python -m spacy download en_core_web_sm`)
nlp = spacy.load('en_core_web_sm')

In [None]:
# lemmatise a 1000-character slice of the sample paper
text = read("/Users/tayssirboukrouba/Downloads/dataset/text/0001001v1.txt")
old_text = text[1000:2000]
doc = nlp(old_text)

# lemma of every token, in document order
tokens_list = [token.lemma_ for token in doc]

# show each token next to its lemma
for token, lemma in zip(doc, tokens_list):
  print(token,"===>",lemma)

new_text = " ".join(tokens_list)

In [None]:
# print the original slice and its lemmatised version, separated by a dashed rule
print("old text :\n", old_text )
print("-"*30)
print("lemmatised text :\n", new_text)

## Testing on full text :

In [None]:
# lemmatise the whole sample paper this time
old_text = read("/Users/tayssirboukrouba/Downloads/dataset/text/0001001v1.txt")
doc = nlp(old_text)

# lemmas of all tokens, re-joined with single spaces
tokens_list = [token.lemma_ for token in doc]
new_text = " ".join(tokens_list)

In [None]:
# compare a slice of the original with a slice of the lemmatised text
# (the two slices use different offsets: 400:600 vs 500:700)
print("old text :\n", old_text[400:600] )
print("-"*69)
print("lemmatised text :\n", new_text[500:700] )

## Creating `lemmatize_text()` function :


In [None]:
# spaCy pipelines that were already loaded, keyed by model name
_SPACY_MODELS = {}


def _get_spacy_model(name='en_core_web_sm'):
    """Load the spaCy pipeline `name` on first use and reuse it afterwards."""
    if name not in _SPACY_MODELS:
        _SPACY_MODELS[name] = spacy.load(name)
    return _SPACY_MODELS[name]


def lemmatize_text(text):
    """
    Lemmatizes the input text using SpaCy's en_core_web_sm model.

    Args:
    - text (str): The input text to be lemmatized.

    Returns:
    - str: The lemmas of all tokens, joined by single spaces.

    Steps:
    1. Fetches SpaCy's English model 'en_core_web_sm'. It is loaded only on
       the first call, because loading is slow and this function is called
       once per file when the whole dataset is processed.
    2. Tokenizes the input text into words.
    3. Lemmatizes each word to its base form.
    4. Joins the lemmatized words back into a single string.
    5. Returns the lemmatized text.
    """
    # word tokenization with the cached pipeline
    doc = _get_spacy_model()(text)

    # replacing words by their lemma and joining them into one text
    return " ".join(token.lemma_ for token in doc)

In [None]:
# display the docstring of the function defined above
help(lemmatize_text)

In [None]:
# run the function on the full sample paper
old_text = read("/Users/tayssirboukrouba/Downloads/dataset/text/0001001v1.txt")
new_text = lemmatize_text(old_text)

In [None]:
# compare a slice of the original with a slice of the lemmatised text
# (the two slices use different offsets: 400:600 vs 500:700)
print("old text :\n", old_text[400:600] )
print("-"*69)
print("lemmatised text :\n", new_text[500:700] )

# Combining Preprocessing Pipeline :

In [None]:
def preprocess_text(text):
    """
    Run the complete preprocessing pipeline on one text.

    Args:
    - text (str): The input text to be preprocessed.

    Returns:
    - str: The text after regex preprocessing, stop-words removal,
           custom punctuation removal, and lemmatization.

    Steps (each stage receives the output of the previous one):
    1. `regex_preprocess`  - regex-based cleaning.
    2. `remove_stop_words` - stop-words removal.
    3. `remove_punct`      - custom punctuation removal.
    4. `lemmatize_text`    - lemmatization.
    """
    # pipeline stages in execution order
    stages = (regex_preprocess, remove_stop_words, remove_punct, lemmatize_text)

    result = text
    for stage in stages:
        result = stage(result)

    return result

In [None]:
# display the docstring of the function defined above
help(preprocess_text)

In [None]:
# run the full pipeline on the sample paper
text = read("/Users/tayssirboukrouba/Downloads/dataset/text/0001001v1.txt")
new = preprocess_text(text)

In [None]:
# inspect the fully preprocessed paper
print(new)

# Applying The pipeline to the dataset

In [None]:
pip install tqdm

In [None]:
def write(filename, text, encoding=None):
    """
    Write `text` to the file `filename`, replacing any existing content.

    Parameters:
    filename (str): Path of the file to create or overwrite.
    text (str): The text to write.
    encoding (str | None): Text encoding passed to `open`. None (default)
        keeps the platform default encoding.

    Returns:
    int: The number of characters written.
    """
    # `with` flushes and closes the file before returning instead of
    # leaving the handle open
    with open(filename, "w", encoding=encoding) as handle:
        return handle.write(text)

# quick check of `write`: creates (or overwrites) a small example file
# note: this rebinds the notebook-level names `filename` and `text`
filename = '/Users/tayssirboukrouba/Downloads/example.txt'
text = 'Hello, my name is taissir'
write(filename, text)

In [None]:
from tqdm import tqdm 
import os

In [None]:
# defining paths
text_path =  "/Users/tayssirboukrouba/Downloads/dataset/text/"
save_path = "/Users/tayssirboukrouba/Downloads/dataset/cleaned_text/"

# create the output directory up front: opening a file for writing fails
# when its parent directory does not exist
os.makedirs(save_path, exist_ok=True)

for root, directories, files in os.walk(text_path):
    # files directly inside the directory currently being walked (root)
    for filename in tqdm(files, desc="Processing text files"):
        # getting file save and read paths
        read_filepath = os.path.join(root, filename)
        # NOTE: outputs are written flat into save_path, so files with the same
        # name in different sub-directories map to the same target
        save_filepath = os.path.join(save_path, filename)

        # outputs that already exist are skipped, which makes the run resumable
        if os.path.exists(save_filepath):
            print(f"Skipped '{filename}' (already exists)")
            continue

        # reading the text file
        text = read(read_filepath)
        # preprocessing the text
        cleaned_text = preprocess_text(text)
        write(save_filepath, cleaned_text)
        print(f"{filename} preprocessed and saved successfully in the new directory !")