# NLP - Text Pre-Processing
## Import Libraries

In [1]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

import warnings
warnings.filterwarnings('ignore')

# Download necessary NLTK packages
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('omw-1.4')



## Sample text for demonstration

In [2]:
sample_text = """In the summer of 2024, Dr. Emily Rivera and her team at the University of California, Berkeley, 
embarked on a project: to develop an AI capable of understanding complex legal documents. 
Despite the challenges—differing formats, ambiguous language, and nuanced legal jargon—they achieved remarkable success. 
By September, their prototype had parsed over 1,000 documents, correctly identifying key legal terms with 98.5% accuracy. 
What's next for Dr. Rivera's team? 'The sky's the limit,' she says."""


## Functions for Pre-Processing

In [3]:
# 1. Noise Removal
def remove_noise(text):
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'\s+\W|\W+\s|\W+', ' ', text)  # Remove punctuations and special characters
    return text.strip()

# 2. Tokenization
def tokenize_text(text):
    return word_tokenize(text)

# 3. Text Normalization (lowercasing)
def normalize_text(tokens):
    return [token.lower() for token in tokens]

# 4. Lemmatization
def lemmatize_tokens(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

# 5. Stopword Removal
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]

# 6. Part-of-Speech Tagging
def pos_tagging(tokens):
    return pos_tag(tokens)

## Text Pre-processing Pipeline

In [4]:
def preprocess_text(text):
    # Step-by-step pre-processing
    text_without_noise = remove_noise(text)
    tokens = tokenize_text(text_without_noise)
    normalized_tokens = normalize_text(tokens)
    tokens_without_stopwords = remove_stopwords(normalized_tokens)
    lemmatized_tokens = lemmatize_tokens(tokens_without_stopwords)
    pos_tags = pos_tagging(lemmatized_tokens)

    # Display the results
    print("Original Text:", text)
    print("\nText without Noise:", text_without_noise)
    print("\nTokens:", tokens)
    print("\nNormalized Tokens:", normalized_tokens)
    print("\nTokens without Stopwords:", tokens_without_stopwords)
    print("\nLemmatized Tokens:", lemmatized_tokens)
    print("\nPart-of-Speech Tags:", pos_tags)
    
    # Generate Token IDs after the entire preprocessing
    token_ids = {token: idx for idx, token in enumerate(lemmatized_tokens)}
    print("\nToken IDs:", token_ids)

## Output

In [5]:
# Run the text preprocessing on sample text
preprocess_text(sample_text)

Original Text: In the summer of 2024, Dr. Emily Rivera and her team at the University of California, Berkeley, 
embarked on a project: to develop an AI capable of understanding complex legal documents. 
Despite the challenges—differing formats, ambiguous language, and nuanced legal jargon—they achieved remarkable success. 
By September, their prototype had parsed over 1,000 documents, correctly identifying key legal terms with 98.5% accuracy. 
What's next for Dr. Rivera's team? 'The sky's the limit,' she says.

Text without Noise: In the summer of  Dr Emily Rivera and her team at the University of California Berkeley embarked on a project to develop an AI capable of understanding complex legal documents Despite the challenges differing formats ambiguous language and nuanced legal jargon they achieved remarkable success By September their prototype had parsed over  documents correctly identifying key legal terms with  accuracy What s next for Dr Rivera s team  The sky s the limit she sa