<a href="https://colab.research.google.com/github/ysuter/FHNW-BSUD-Part2/blob/main/L6-InformationExtraction/L6-InformationExtraction/L06_Text_Preprocessing_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# L06 Text Preprocessing

### Word Tokenization

In [30]:
# --- Setup for Google Colab ---
!pip install nltk --quiet

import nltk
nltk.download('punkt')
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [31]:
# --- Tokenization Setup ---
import nltk
from nltk.tokenize import word_tokenize

def tokenize(text):
    """Tokenizes a string into words using NLTK's Punkt tokenizer."""
    tokens = word_tokenize(text)
    return tokens


In [32]:
# --- Example 1 ---
D1 = "Founder of the new tech company FUIT announces headquarter-to-be: New York"
tokens = tokenize(D1)
print("D1 tokens =", tokens)


D1 tokens = ['Founder', 'of', 'the', 'new', 'tech', 'company', 'FUIT', 'announces', 'headquarter-to-be', ':', 'New', 'York']


In [33]:
# --- Example 2 ---
D2 = "FUIT have changed their headquarter from New York to Zurich"
tokens = tokenize(D2)
print("D2 tokens =", tokens)


D2 tokens = ['FUIT', 'have', 'changed', 'their', 'headquarter', 'from', 'New', 'York', 'to', 'Zurich']


## Text Preprocessing - Stop Word Removal


In [34]:
# Define the documents
D1 = "FUIT offers very special IT services to their clients."
D2 = "Yesterday, all train services between London center and Heathrow airport had to be canceled."

# Define the query
query = "IT services"

# Define the list of stop words
stop_words = {"a", "the", "of", "to", "and", "in", "for", "is", "that", "on", "it", "with", "by"}

# Function to preprocess text (convert to lower case and remove stop words)
def preprocess(text):
    words = text.lower().split()
    return [word for word in words if word not in stop_words]

# Preprocess the documents and the query
processed_D1 = preprocess(D1)
processed_D2 = preprocess(D2)
processed_query = preprocess(query)

# Function to check if all query words are in the document
def contains_all_words(document, query):
    return all(word in document for word in query)

# Check which documents contain all query words
retrieved_documents = []
if contains_all_words(processed_D1, processed_query):
    retrieved_documents.append("D1")
if contains_all_words(processed_D2, processed_query):
    retrieved_documents.append("D2")



In [35]:
# Output the result
print("Retrieved Documents:", retrieved_documents)
print('processed D1', processed_D1)
print('processed D2', processed_D2)
print('processed query', processed_query)

Retrieved Documents: ['D1', 'D2']
processed D1 ['fuit', 'offers', 'very', 'special', 'services', 'their', 'clients.']
processed D2 ['yesterday,', 'all', 'train', 'services', 'between', 'london', 'center', 'heathrow', 'airport', 'had', 'be', 'canceled.']
processed query ['services']


## Stemming

In [36]:
# Define the original and stemmed documents
D1 = "Wandern im Schweizer Jura. Die schönsten Routen und besten Einkehrmöglichkeiten"
D1_stemmed = "wand im schweiz Jura. Die schon rout und best einkehrmog"
D2 = "So verschönern Sie Ihre Wände: Wandschmuck und die neusten Tapetenmuster"
D2_stemmed = "so verschon sie ihr wand: wandschmuck und die neust tapetenmust"

# Split the documents into words
D1_words = D1.split()
D1_stemmed_words = D1_stemmed.split()
D2_words = D2.split()
D2_stemmed_words = D2_stemmed.split()

# Create a dictionary to map stemmed words to original words for D1 and D2
stem_to_original_D1 = {stem: original for stem, original in zip(D1_stemmed_words, D1_words)}
stem_to_original_D2 = {stem: original for stem, original in zip(D2_stemmed_words, D2_words)}

# Find common stemmed words between D1 and D2
common_stems = set(D1_stemmed_words) & set(D2_stemmed_words)

# Identify overstemming examples
overstemming_examples = []
for stem in common_stems:
    if stem_to_original_D1[stem] != stem_to_original_D2[stem]:
        overstemming_examples.append((stem_to_original_D1[stem], stem_to_original_D2[stem]))




In [37]:
# Output the result
print("D1 words:", D1_words)
print("D1 stemmed words:", D1_stemmed_words)
print("D2 words:", D2_words)
print("D2 stemmed words:", D2_stemmed_words)
for example in overstemming_examples:
    print(f"Overstemming example: {example[0]},{example[1]}")

D1 words: ['Wandern', 'im', 'Schweizer', 'Jura.', 'Die', 'schönsten', 'Routen', 'und', 'besten', 'Einkehrmöglichkeiten']
D1 stemmed words: ['wand', 'im', 'schweiz', 'Jura.', 'Die', 'schon', 'rout', 'und', 'best', 'einkehrmog']
D2 words: ['So', 'verschönern', 'Sie', 'Ihre', 'Wände:', 'Wandschmuck', 'und', 'die', 'neusten', 'Tapetenmuster']
D2 stemmed words: ['so', 'verschon', 'sie', 'ihr', 'wand:', 'wandschmuck', 'und', 'die', 'neust', 'tapetenmust']


### Using the NLTK PorterStemmer

In [38]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

def stemming(input_text):
    """
    Applies stemming to the input text after removing stop words and punctuation.

    Args:
        input_text: The input text string.

    Returns:
        A list of stemmed words.
    """
    # Tokenize the input text
    words = word_tokenize(input_text)

    # Remove stop words and punctuation
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.lower() not in stop_words and word.isalnum()]

    # Initialize the Porter Stemmer
    stemmer = PorterStemmer()

    # Perform stemming on each word
    stemmed_words = [stemmer.stem(word) for word in words]

    return stemmed_words

In [39]:
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

def stemming_German(input_text):
  """Applies stemming to the input text.

  Args:
    input_text: The input text string.

  Returns:
    A stemmed string.
  """

  # Initialize the Snowball Stemmer for German
  stemmer = SnowballStemmer("german")

  # Tokenize the input text
  tokens = word_tokenize(input_text, language="german")

  # Remove punctuation and stop words
  stop_words = set(stopwords.words('german'))
  tokens = [token for token in tokens if token.isalnum() and token.lower() not in stop_words]

  # Stem the tokens and join them back into a string
  stemmed_tokens = [stemmer.stem(token) for token in tokens]
  stemmed_text = " ".join(stemmed_tokens)

  return stemmed_text

In [40]:
D1_stemmed2 = stemming(D1)
D2_stemmed2 = stemming(D2)
print('D1 stemmed =', D1_stemmed2)
print('D2 stemmed =', D2_stemmed2)

D1 stemmed = ['wandern', 'im', 'schweizer', 'jura', 'die', 'schönsten', 'routen', 'und', 'besten', 'einkehrmöglichkeiten']
D2 stemmed = ['verschönern', 'sie', 'ihr', 'wände', 'wandschmuck', 'und', 'die', 'neusten', 'tapetenmust']


In [41]:
# --- English stemming example ---
D1G = "This is an example sentence with stemming applied."
stemmed_text = stemming(D1G)
print('D1G    =', D1G)
print('stemmed', stemmed_text)


D1G    = This is an example sentence with stemming applied.
stemmed ['exampl', 'sentenc', 'stem', 'appli']


In [42]:
D1 = "Wandern im Schweizer Jura. Die schönsten Routen und besten Einkehrmöglichkeiten."
stemmed_text = stemming_German(D1)
print('D1    = ',D1)
print('stemmed: ',stemmed_text)

D1    =  Wandern im Schweizer Jura. Die schönsten Routen und besten Einkehrmöglichkeiten.
stemmed:  wand schweiz jura schon rout best einkehrmog
