In [1]:
import math
import re
import pandas as pd

In [2]:
# Read the three dataset splits (train / validation / test) from the Lab5
# directory; paths are relative to the notebook's working directory.
train_df, val_df, test_df = map(
    pd.read_parquet,
    (
        '../Lab5/train.parquet',
        '../Lab5/validation.parquet',
        '../Lab5/test.parquet',
    ),
)

In [3]:
def extract_sentences(df, column="sentences"):
    """
    Flatten a column of sentence records into a single list of text strings.

    Each cell of ``df[column]`` is iterated, and the ``"text"`` entry of every
    record found inside it is collected, preserving row order and the order of
    records within each row.

    :param df: Object supporting ``df[column]`` (a DataFrame in this notebook).
    :param column: Name of the column holding the per-row record collections.
    :return: A flat list with one ``"text"`` value per record.
    """
    texts = []
    for records in df[column]:
        # A row may hold zero, one, or many records; extend handles all three.
        texts.extend(record["text"] for record in records)
    return texts

# Flatten every split's sentence records into a plain list of text strings.
train_sentences, val_sentences, test_sentences = (
    extract_sentences(split_df) for split_df in (train_df, val_df, test_df)
)

In [4]:
def tokenize(sentence):
    """
    Turn a sentence into a list of lowercase tokens.

    The text is lowercased, every character that is neither a word character
    nor whitespace is deleted (not replaced by a space, so "don't" becomes
    "dont"), and the result is split on whitespace. Underscores count as word
    characters for ``\\w`` and are therefore kept.

    :param sentence: The text to tokenize.
    :return: A list of tokens; an empty list when ``sentence`` is not a ``str``.
    """
    if isinstance(sentence, str):
        lowered = sentence.lower()
        without_punctuation = re.sub(r'[^\w\s]', '', lowered)
        return without_punctuation.split()
    # Non-string input (e.g. None) yields no tokens instead of raising.
    return []

In [6]:
def compute_idf(train_corpus_tokens):
    """
    Learn a vocabulary and its IDF weights from a tokenized training corpus.

    For every word w the score is ``log(N / (df(w) + 1))`` (natural log), where
    N is the number of documents and df(w) the number of documents containing
    w. Note that the score is 0 when a word occurs in N - 1 documents and
    negative when it occurs in all N.

    Prints a one-line summary with the vocabulary size.

    :param train_corpus_tokens: A list of tokenized sentences from the training data.
    :return: A tuple ``(vocabulary, idf_scores)`` where ``vocabulary`` is the
             sorted list of distinct words and ``idf_scores`` maps each of
             those words to its IDF value (keys inserted in sorted order).
    """
    total_docs = len(train_corpus_tokens)

    # Document frequency: each document contributes at most 1 per word,
    # hence the set() around the document's tokens.
    df_counts = {}
    for tokens in train_corpus_tokens:
        for term in set(tokens):
            df_counts[term] = df_counts.get(term, 0) + 1

    # The keys of the DF table are exactly the vocabulary.
    ordered_terms = sorted(df_counts)

    idf_by_term = {}
    for term in ordered_terms:
        idf_by_term[term] = math.log(total_docs / (df_counts[term] + 1))

    print(f"Learned vocabulary with {len(ordered_terms)} words and calculated IDF scores.")
    return ordered_terms, idf_by_term

In [7]:
def compute_tf(sentence_tokens):
    """
    Compute relative term frequencies for one tokenized sentence.

    Each word's score is its number of occurrences divided by the total
    number of tokens in the sentence.

    :param sentence_tokens: A list of words for one sentence.
    :return: A dictionary of {word: tf_score}, keyed in order of first
             occurrence; an empty dictionary for an empty sentence.
    """
    total_tokens = len(sentence_tokens)
    if not total_tokens:
        # Nothing to count, and avoids dividing by zero below.
        return {}

    occurrences = {}
    for token in sentence_tokens:
        if token in occurrences:
            occurrences[token] += 1
        else:
            occurrences[token] = 1

    return {token: n / total_tokens for token, n in occurrences.items()}

In [8]:
# Tokenize every sentence of each split; one token list per sentence.
tokenized_train = list(map(tokenize, train_sentences))
tokenized_val = list(map(tokenize, val_sentences))
tokenized_test = list(map(tokenize, test_sentences))

In [9]:
# Fit the vocabulary and IDF weights on the training split only.
vocabulary, idf_values = compute_idf(tokenized_train)
# Map each vocabulary word to its position in the vocabulary list.
word_to_index = dict(zip(vocabulary, range(len(vocabulary))))

Learned vocabulary with 404480 words and calculated IDF scores.


In [None]:
# Depends on compute_tf, which is defined in an earlier cell.

def create_tfidf_vectors(corpus_tokens, vocabulary, idf_scores, word_map):
    """
    Build sparse TF-IDF vectors, one dictionary per document.

    Each document becomes a ``{index: tf * idf}`` dictionary that holds
    entries only for the words present in that document, where ``index`` is
    the word's entry in ``word_map``. Words missing from ``word_map`` are
    skipped.

    :param corpus_tokens: Iterable of tokenized documents (lists of words).
    :param vocabulary: Not used by this function; kept in the signature for
                       existing callers.
    :param idf_scores: Mapping of word -> IDF value; must contain every word
                       that is present in ``word_map``.
    :param word_map: Mapping of word -> vector index.
    :return: A list of sparse-vector dictionaries, in document order.
    """
    vectors = []
    for tokens in corpus_tokens:
        vectors.append({
            word_map[term]: tf * idf_scores[term]
            for term, tf in compute_tf(tokens).items()
            if term in word_map  # out-of-vocabulary words contribute nothing
        })
    return vectors

In [None]:
print("\nVectorizing all datasets using a sparse representation...")
# Vectorize train, validation and test with the same training-fitted
# vocabulary, IDF weights and index map.
X_train_tfidf_sparse, X_val_tfidf_sparse, X_test_tfidf_sparse = (
    create_tfidf_vectors(split_tokens, vocabulary, idf_values, word_to_index)
    for split_tokens in (tokenized_train, tokenized_val, tokenized_test)
)
print("...Done.")

# Report how many vectors each split produced.
print("\nSparse Vectorization Summary")
print(f"Number of training vectors: {len(X_train_tfidf_sparse)}")
print(f"Number of validation vectors: {len(X_val_tfidf_sparse)}")
print(f"Number of testing vectors: {len(X_test_tfidf_sparse)}")

# Show one example so the {index: weight} layout is visible.
print(f"\nFirst sparse vector (train): {X_train_tfidf_sparse[0]}")


Vectorizing all datasets using a sparse representation...
...Done.

--- Sparse Vectorization Summary ---
Number of training vectors: 3257770
Number of validation vectors: 3126
Number of testing vectors: 3098

First sparse vector (train): {288827: 0.32601781907202326, 311956: 0.35742971753019537, 354700: 0.2593349265147189, 246464: 0.32490371350288866, 148141: 0.03218466422406572, 355570: 0.37425119204449664, 342172: 0.13805699829218238, 248304: 0.38494336427150666, 219096: 0.3307244065191351, 361523: 0.0615495768521628, 374624: 0.33775986574192923, 161652: 0.10967732240295959, 334478: 0.10696103527814818, 266652: 0.2424960061341519, 234229: 0.18263677465871955, 222407: 0.31430469557883967, 194706: 0.12447864731312021, 332846: 0.09521231307741765, 388932: 0.028581179856532368}
