In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download necessary NLTK resources (organizes linguistic data)
try:
    stopwords.words('english')
    nltk.word_tokenize("example") # This implicitly requires 'punkt'
    WordNetLemmatizer().lemmatize("running") # This implicitly requires 'wordnet'
except LookupError as e:
    print(f"NLTK Resource not found: {e}")
    print("Downloading necessary NLTK resources...")
    nltk.download('punkt_tab')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('stopwords')
    nltk.download('wordnet')
    print("NLTK resources downloaded successfully.")

def load_farm_ads_data(text_file: str, vector_file: str):
    # Dictionary to store index:value pairs
    text_data = []
    # Reads the data and separates the label and text
    with open(text_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            if not parts:  # skip empty lines
                continue
            label = int(parts[0])
            text = ' '.join(parts[1:])
            text_data.append({'label': label, 'text': text})
    
    # Create pandas DataFrame where each item is a dictionary (key & value)
    text_df = pd.DataFrame(text_data)
    labels = text_df['label'].values
    texts = text_df['text'].values
    
    vector_data = []
    with open(vector_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            if not parts:  # skip empty lines
                continue
            label = int(parts[0])
            features = {}
            for item in parts[1:]:
                idx, val = item.split(':')
                # '3:1' to idx = '3', val = '1'
                features[int(idx)] = float(val)
            vector_data.append(features)
    
    # Convert dictionaries into pandas DataFrame
    vector_df = pd.DataFrame(vector_data).fillna(0)
    
    return texts, labels, vector_df

def preprocess_text(text_series):
    """
        Preprocess text by removing special characters, removing stopwords ("the", "is", "and", etc.),
        and lemmatizing words (running, runs and ran to the base form "run".
        
        This reduces the dimensionality of text data and performance of downstram tasks liek text classificaiton or information retrieval
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    
    cleaned_texts = []
    tokenized_texts = []
    
    for text in text_series:
        text = text.lower()
        text = text.replace('ad-', '')
        # Remove special characters (raw string that '^' match any character that is not a word or whitespace character
        text = re.sub(r'[^\w\s]', '', text)
        tokens = word_tokenize(text)
        # Remove stopwords and lemmatize
        filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        # Store the tokenized version for Word2Vec
        tokenized_texts.append(filtered_tokens)
        
        # Join tokens back into a string
        cleaned_text = ' '.join(filtered_tokens)
        cleaned_texts.append(cleaned_text)
    
    # cleaned_texts for text classification and feature extration techiques like TF-IDF
    # tokenized_text for token list used lated on word embeddings (Word2Vec) or RNN
    return cleaned_texts, tokenized_texts
    
if __name__ == "__main__":
    # File paths
    text_file = "farm-ads"
    vector_file = "farm-ads-vect"
    
    # Load data
    texts, labels, vector_df = load_farm_ads_data(text_file, vector_file)
    
    print(f"Loaded {len(texts)} text samples with labels: {np.unique(labels)}")
    
    # Preprocess text data
    cleaned_texts, tokenized_texts = preprocess_text(texts)
    
    # Print the cleaned texts
    print("\nCleaned Texts:")
    for i, cleaned_text in enumerate(cleaned_texts[:5]):  # Print the first 5 
        print(f"Sample {i+1}: {cleaned_text}")

    # Print the tokenized texts
    print("\nTokenized Texts:")
    for i, tokens in enumerate(tokenized_texts[:5]):  # Print the first 5 
        print(f"Sample {i+1}: {tokens}")

NLTK Resource not found: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/davidvuong/nltk_data'
    - '/opt/anaconda3/nltk_data'
    - '/opt/anaconda3/share/nltk_data'
    - '/opt/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************

Downloading necessary NLTK resources...


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/davidvuong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/davidvuong/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/davidvuong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/davidvuong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


NLTK resources downloaded successfully.
Loaded 4143 text samples with labels: [-1  1]


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/davidvuong/nltk_data'
    - '/opt/anaconda3/nltk_data'
    - '/opt/anaconda3/share/nltk_data'
    - '/opt/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
