In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Make sure the NLTK data packages this script relies on are installed.
# Each probe call below raises LookupError when its data package is missing;
# in that case all required packages are fetched.
try:
    stopwords.words('english')
    nltk.word_tokenize("example")  # implicitly requires the Punkt tokenizer data
    WordNetLemmatizer().lemmatize("running")  # implicitly requires 'wordnet'
except LookupError as e:
    print(f"NLTK Resource not found: {e}")
    print("Downloading necessary NLTK resources...")
    # nltk.download() returns False when a package could not be fetched, so
    # collect the failures instead of reporting success unconditionally.
    failed_resources = [
        resource
        for resource in (
            'punkt_tab',
            'averaged_perceptron_tagger',
            'stopwords',
            'wordnet',
        )
        if not nltk.download(resource)
    ]
    if failed_resources:
        print(f"Failed to download NLTK resources: {', '.join(failed_resources)}")
    else:
        print("NLTK resources downloaded successfully.")

def load_farm_ads_data(text_file: str, vector_file: str):
    """Load the farm-ads text file and its companion sparse-vector file.

    Every non-blank line of ``text_file`` is read as
    ``<label> <token> <token> ...`` and every non-blank line of
    ``vector_file`` as ``<label> <idx>:<val> <idx>:<val> ...``.

    Args:
        text_file: Path to the UTF-8 encoded text file.
        vector_file: Path to the UTF-8 encoded sparse feature file.

    Returns:
        A tuple ``(texts, labels, vector_df)`` where ``texts`` is an object
        array with one space-joined string per text line, ``labels`` is an
        integer array with the label of each text line, and ``vector_df`` is
        a DataFrame with one row per vector line and one column per feature
        index seen, with absent features filled with 0. Empty input files
        yield empty arrays / an empty DataFrame.

    Raises:
        ValueError: If a text-file label is not an integer, or a feature
            entry is not of the form ``<int>:<float>``.
    """
    labels = []
    texts = []
    with open(text_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:  # skip empty lines
                continue
            labels.append(int(parts[0]))
            texts.append(' '.join(parts[1:]))

    # One {feature_index: value} dict per vector line, e.g. '3:1' -> {3: 1.0}.
    vector_data = []
    with open(vector_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:  # skip empty lines
                continue
            # parts[0] is the label column; labels are taken from text_file,
            # so only the idx:val entries are parsed here.
            vector_data.append({
                int(idx): float(val)
                for idx, val in (item.split(':') for item in parts[1:])
            })

    # Features missing from a row become NaN in the DataFrame; treat as 0.
    vector_df = pd.DataFrame(vector_data).fillna(0)

    # Build the arrays directly so an empty file gives empty arrays rather
    # than a KeyError from indexing a column of an empty DataFrame.
    return (
        np.array(texts, dtype=object),
        np.array(labels, dtype=np.int64),
        vector_df,
    )

def preprocess_text(text_series):
    """
        Tokenize every text in ``text_series`` with NLTK's ``word_tokenize``.

        Note: this function only tokenizes and re-joins; it does not strip
        special characters, remove stopwords, or lemmatize.

        Returns a pair ``(cleaned_texts, tokenized_texts)``:
          * cleaned_texts   - one string per input, its tokens joined by single
                              spaces (for string-based feature extraction such
                              as TF-IDF)
          * tokenized_texts - one token list per input (for token-based models
                              such as Word2Vec or an RNN)
    """
    # Token lists first; the string form is derived from them.
    tokenized_texts = [word_tokenize(document) for document in text_series]
    cleaned_texts = [' '.join(token_list) for token_list in tokenized_texts]
    return cleaned_texts, tokenized_texts

def _word_frequencies(texts):
    """Return whitespace-split word counts over ``texts``, most frequent first."""
    words = [word for doc in texts for word in doc.split()]
    return pd.Series(words).value_counts()


def _plot_top_words(freq, title):
    """Draw ``freq`` as a bar chart on the current axes with the shared labels."""
    # Plotting an empty Series raises, so leave the axes blank in that case.
    if not freq.empty:
        freq.plot(kind='bar')
    plt.title(title)
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45, ha='right')


def visualize_data(labels, text, cleaned_texts):
    """
        Visualize data distributions and characteristics.

        Writes three PNG files into the ``visualization`` directory (created
        if it does not exist):
          * distributions.png        - count of samples per class label
          * word_frequency.png       - the 20 most common words overall
          * class_word_frequency.png - the 15 most common words for label 1
                                       and for label -1, side by side

        Args:
            labels: Class label (-1 or 1) of each sample, aligned with
                ``cleaned_texts``.
            text: Unused; kept so existing callers keep working.
            cleaned_texts: One whitespace-separated string per sample.
    """
    import os  # local import: only needed to create the output directory

    # savefig does not create missing directories.
    os.makedirs('visualization', exist_ok=True)

    # Distribution of classes with -1 and 1
    plt.figure(figsize=(8, 6))  # (width, height)
    sns.countplot(x=labels)
    plt.title('Distribution of Ad Classes')
    plt.xlabel('Class (-1: Not Accepted, 1: Accepted)')
    plt.ylabel('Count')
    plt.savefig('visualization/distributions.png')
    plt.close()

    # Most common words across all samples
    plt.figure(figsize=(12, 6))
    _plot_top_words(_word_frequencies(cleaned_texts).head(20),
                    'Top 20 Most Common Words')
    plt.tight_layout()
    plt.savefig('visualization/word_frequency.png')
    plt.close()

    # Compare word frequencies between classes. Pair labels with texts via
    # zip so this does not depend on ``labels`` supporting positional indexing.
    accepted_texts = [doc for label, doc in zip(labels, cleaned_texts)
                      if label == 1]
    rejected_texts = [doc for label, doc in zip(labels, cleaned_texts)
                      if label == -1]

    plt.figure(figsize=(16, 6))

    plt.subplot(1, 2, 1)
    _plot_top_words(_word_frequencies(accepted_texts).head(15),
                    'Top Words in Accepted Ads')

    plt.subplot(1, 2, 2)
    _plot_top_words(_word_frequencies(rejected_texts).head(15),
                    'Top Words in Rejected Ads')

    plt.tight_layout()
    plt.savefig('visualization/class_word_frequency.png')
    plt.close()

if __name__ == "__main__":
    # Input locations: the ad text file and its companion vector file
    text_file, vector_file = "farm-ads", "farm-ads-vect"

    # Read both files, then report how many samples and which labels were found
    texts, labels, vector_df = load_farm_ads_data(text_file, vector_file)
    print(f"Loaded {len(texts)} text samples with labels: {np.unique(labels)}")

    # Tokenize the raw texts
    cleaned_texts, tokenized_texts = preprocess_text(texts)

    # Preview the first 5 cleaned strings
    print("\nCleaned Texts:")
    for sample_no, sample_text in enumerate(cleaned_texts[:5], start=1):
        print(f"Sample {sample_no}: {sample_text}")

    # Preview the first 5 token lists
    print("\nTokenized Texts:")
    for sample_no, sample_tokens in enumerate(tokenized_texts[:5], start=1):
        print(f"Sample {sample_no}: {sample_tokens}")

    # Write the class-distribution and word-frequency plots
    visualize_data(labels, texts, cleaned_texts)
    
    
    
    

Loaded 4143 text samples with labels: [-1  1]

Cleaned Texts:
Sample 1: ad-jerry ad-bruckheimer ad-chase ad-premier ad-sept ad-th ad-clip ad-bruckheimer ad-chase page found
Sample 2: ad-rheumatoid ad-arthritis ad-expert ad-tip ad-info ad-article ad-treatment ad-option ad-support title-understand title-rheumatoid title-arthritis title-everyday title-health header-understand header-rheumatoid header-arthritis understand rheumatoid arthritis everyday health root root act consumer root content everyday solution understand rheumatoid arthritis future ra treatment advance rheumatoid arthritis treatment expect future lead researcher ra treatment research exercise ra check tip slideshow help create workout program ra fitness tip question doctor print list rheumatoid arthritis question doctor visit list ra question understand rheumatoid arthritis tip manage rheumatoid arthritis pain mak key change help manage rheumatoid arthritis pain ease joint pain strive eat balance diet help healthy weight 