In [8]:
from pathlib import Path

import pandas as pd
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.datasets import fetch_20newsgroups

# Names of the pretrained models used throughout this cell
SPACY_MODEL_NAME = "en_core_web_sm"
SENTENCE_MODEL_NAME = 'all-MiniLM-L6-v2'

# spaCy pipeline used for tokenization and lemmatization
nlp = spacy.load(SPACY_MODEL_NAME)

# Sentence-transformer used to embed each document
model = SentenceTransformer(SENTENCE_MODEL_NAME)

# Function to load data
def load_data(categories, limit=200):
    """Fetch 20 Newsgroups posts for the given categories.

    The posts are requested from every subset (``subset='all'``) with
    headers, footers and quoted text removed.

    Args:
        categories: Category names forwarded to ``fetch_20newsgroups``.
        limit: Maximum number of documents to return, taken from the start
            of the fetched data. ``None`` returns every document.
            Defaults to 200, the cap this function always applied before
            the parameter existed.

    Returns:
        The first ``limit`` entries of the fetched ``data`` sequence.

    Raises:
        ValueError: If ``limit`` is negative. A negative slice bound would
            silently drop documents from the end instead of capping them.
    """
    # Validate before fetching so a bad argument fails fast, without a download.
    if limit is not None and limit < 0:
        raise ValueError(f"limit must be non-negative or None, got {limit}")

    newsgroups_data = fetch_20newsgroups(subset='all', categories=categories,
                                         remove=('headers', 'footers', 'quotes'))
    return newsgroups_data.data[:limit]

# Function to preprocess documents
def preprocess_documents(documents):
    """Build a DataFrame of text variants and embeddings for each document.

    Every document is run through the module-level spaCy pipeline ``nlp``
    (with the ``ner`` and ``parser`` components disabled) and the original
    texts are embedded with the module-level sentence-transformer ``model``.

    Args:
        documents: Iterable of raw text strings.

    Returns:
        A ``pandas.DataFrame`` with one row per document and the columns
        ``original_text``, ``tokenized_text``, ``hover_text``,
        ``text_no_punct_lemma``, ``text_no_punct`` and ``ml_vectors``,
        in that order.
    """
    originals = []
    token_lists = []
    hover_texts = []
    lemma_texts = []
    plain_texts = []

    # Named entities and the dependency parse are not used below, so skip them.
    for parsed in nlp.pipe(documents, disable=["ner", "parser"]):
        words = [tok.text for tok in parsed]
        non_punct = [tok for tok in parsed if not tok.is_punct]

        # Short preview: the first ten tokens followed by an ellipsis for long
        # documents, the whole token list otherwise.
        # NOTE(review): the truncated form joins tokens with "<br>" while the
        # short form joins with a space -- confirm that this is intentional.
        if len(words) > 10:
            preview = "<br>".join(words[:10]) + '...'
        else:
            preview = " ".join(words)

        # Lemmas of the non-punctuation tokens; '-PRON-' lemmas are dropped
        # (presumably the pronoun placeholder of older spaCy models).
        lemma_text = " ".join(
            tok.lemma_ for tok in non_punct if tok.lemma_ != '-PRON-'
        )

        # Surface forms of the non-punctuation tokens.
        plain_text = " ".join(tok.text for tok in non_punct)

        originals.append(parsed.text)
        token_lists.append(words)
        hover_texts.append(preview)
        lemma_texts.append(lemma_text)
        plain_texts.append(plain_text)

    # The embeddings are computed from the ORIGINAL texts, not from the
    # lemmatized / punctuation-free variants.
    embeddings = model.encode(originals, show_progress_bar=True)

    # Key order fixes the column order of the resulting DataFrame.
    return pd.DataFrame({
        'original_text': originals,
        'tokenized_text': token_lists,
        'hover_text': hover_texts,
        'text_no_punct_lemma': lemma_texts,
        'text_no_punct': plain_texts,
        'ml_vectors': embeddings.tolist(),
    })

# Define your categories
categories = ["rec.sport.baseball", "sci.space"]
documents = load_data(categories)
# Flatten newlines so each post is a single line of text before tokenization
documents = [doc.replace("\n", " ") for doc in documents]
df_processed = preprocess_documents(documents)

# Save to a parquet file. Create the target directory first so the write does
# not fail when '../data' does not exist yet.
output_path = Path('../data/processed_texts.parquet')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_processed.to_parquet(output_path)

print(f"Preprocessing complete and data saved to '{output_path.name}'.")


Batches: 100%|██████████| 7/7 [00:02<00:00,  3.25it/s]

Preprocessing complete and data saved to 'processed_texts.parquet'.



