In [14]:
import pandas as pd
# Read the corpus CSV into a DataFrame (path is relative to the notebook's working directory).
df = pd.read_csv("brown.csv")
# One list element per row of the `tokenized_text` column.
corpus = df['tokenized_text'].tolist()

# Preview the first five entries.
corpus[:5]

['Furthermore , as an encouragement to revisionist thinking , it manifestly is fair to admit that any fraternity has a constitutional right to refuse to accept persons it dislikes .',
 'The Unitarian clergy were an exclusive club of cultivated gentlemen -- as the term was then understood in the Back Bay -- and Parker was definitely not a gentleman , either in theology or in manners .',
 'Ezra Stiles Gannett , an honorable representative of the sanhedrin , addressed himself frankly to the issue in 1845 , insisting that Parker should not be persecuted or calumniated and that in this republic no power to restrain him by force could exist .',
 "Even so , Gannett judiciously argued , the Association could legitimately decide that Parker `` should not be encouraged nor assisted in diffusing his opinions by those who differ from him in regard to their correctness '' .",
 'We today are not entitled to excoriate honest men who believed Parker to be downright pernicious and who barred their pulp

In [17]:
import re
import spacy

# Load the spaCy pipeline once at module level; clean_text() reuses this `nlp` object for every document.
nlp = spacy.load("en_core_web_sm")


def clean_text(documents: list[str]) -> list[list[str]]:
    """Lowercase, strip punctuation, tokenize and remove stop words.

    Args:
        documents: Raw text documents, one string per document.

    Returns:
        One list of token strings per input document, in input order.
        Tokens flagged as stop words by spaCy and whitespace-only tokens
        are dropped.
    """
    # Lazily normalise each document: lowercase it, then delete every
    # character that is neither a word character nor whitespace.
    normalized = (re.sub(r"[^\w\s]", "", doc.lower()) for doc in documents)

    cleaned_docs = []
    # nlp.pipe processes the texts as a batched stream, which avoids the
    # per-call overhead of invoking nlp() once per document.
    for doc_nlp in nlp.pipe(normalized):
        # Deleting punctuation can leave runs of spaces that spaCy emits as
        # whitespace tokens; token.text.strip() filters those out.
        cleaned_docs.append(
            [token.text for token in doc_nlp if not token.is_stop and token.text.strip()]
        )

    return cleaned_docs

# Clean only the first 10,000 documents of the corpus.
cleaned_corpus = clean_text(corpus[:10000])
# Preview the first five cleaned documents.
cleaned_corpus[:5]


[['furthermore',
  'encouragement',
  'revisionist',
  'thinking',
  'manifestly',
  'fair',
  'admit',
  'fraternity',
  'constitutional',
  'right',
  'refuse',
  'accept',
  'persons',
  'dislikes'],
 ['unitarian',
  'clergy',
  'exclusive',
  'club',
  'cultivated',
  'gentlemen',
  'term',
  'understood',
  'bay',
  'parker',
  'definitely',
  'gentleman',
  'theology',
  'manners'],
 ['ezra',
  'stiles',
  'gannett',
  'honorable',
  'representative',
  'sanhedrin',
  'addressed',
  'frankly',
  'issue',
  '1845',
  'insisting',
  'parker',
  'persecuted',
  'calumniated',
  'republic',
  'power',
  'restrain',
  'force',
  'exist'],
 ['gannett',
  'judiciously',
  'argued',
  'association',
  'legitimately',
  'decide',
  'parker',
  'encouraged',
  'assisted',
  'diffusing',
  'opinions',
  'differ',
  'regard',
  'correctness'],
 ['today',
  'entitled',
  'excoriate',
  'honest',
  'men',
  'believed',
  'parker',
  'downright',
  'pernicious',
  'barred',
  'pulpits',
  'dema

In [18]:
from collections import Counter

def build_vocab(corpus: list[list[str]]) -> tuple[dict[str, int], dict[int, str]]:
    """Build word<->index lookup tables from a tokenized corpus.

    Indices are assigned in order of first appearance, starting at 0.

    Args:
        corpus: Tokenized documents, each a list of token strings.

    Returns:
        A ``(word_to_idx, idx_to_word)`` tuple. ``idx_to_word`` is the exact
        inverse of ``word_to_idx``.
    """
    word_to_idx: dict[str, int] = {}
    for doc in corpus:
        for term in doc:
            # len() is evaluated before insertion, so a new term receives the
            # next free index; an already-seen term keeps its existing one.
            word_to_idx.setdefault(term, len(word_to_idx))

    # Derive the reverse table from the forward one so the two cannot diverge.
    idx_to_word = {idx: word for word, idx in word_to_idx.items()}
    return word_to_idx, idx_to_word

word_to_idx, idx_to_word = build_vocab(cleaned_corpus)
# Show only a small sample of each mapping; printing the full dictionaries
# floods the cell output with one entry per vocabulary word.
print(list(word_to_idx.items())[:10])
print(list(idx_to_word.items())[:10])
# Vocabulary size.
print(len(word_to_idx))

18516


In [8]:
def create_skipgram_pairs(corpus: list[list[str]], window_size: int = 2) -> list[tuple[str, str]]:
    """
    Generate (target, context) pairs for Skip-gram model training.

    Each document is processed independently, so a context window never
    crosses a document boundary.

    Args:
        corpus (list): A list of documents, where each document is a list of
                       words (e.g., a tokenized sentence).
        window_size (int): The number of words to consider as context on either side of the target word.
                           Default is 2 (looks 2 words before and after).

    Returns:
        list: A list of tuples, where each tuple is (target_word, context_word).
              The target word is the centre word, and the context word is one of its surrounding words.
              Pairs are ordered by document, then target position, then context position.

    Example:
        >>> corpus = [['the', 'quick', 'brown', 'fox', 'jumps']]
        >>> create_skipgram_pairs(corpus, window_size=2)
        [('the', 'quick'), ('the', 'brown'), ('quick', 'the'), ('quick', 'brown'),
         ('quick', 'fox'), ('brown', 'the'), ('brown', 'quick'), ('brown', 'fox'),
         ('brown', 'jumps'), ('fox', 'quick'), ('fox', 'brown'), ('fox', 'jumps'),
         ('jumps', 'brown'), ('jumps', 'fox')]
    """
    pairs = []

    for document in corpus:
        doc_len = len(document)
        for target_idx, target_word in enumerate(document):
            # Clamp the window to the document: it starts no earlier than
            # index 0 and ends (exclusive) no later than the document length.
            start_idx = max(0, target_idx - window_size)
            end_idx = min(doc_len, target_idx + window_size + 1)

            # Pair the target with every word in the window except itself.
            pairs.extend(
                (target_word, document[context_idx])
                for context_idx in range(start_idx, end_idx)
                if context_idx != target_idx
            )

    return pairs

# Build skip-gram pairs from just the first three cleaned documents.
pairs = create_skipgram_pairs(cleaned_corpus[:3], window_size=2)
# Display the generated (target, context) pairs.
pairs


[('furthermore', 'encouragement'),
 ('furthermore', 'revisionist'),
 ('encouragement', 'furthermore'),
 ('encouragement', 'revisionist'),
 ('encouragement', 'thinking'),
 ('revisionist', 'furthermore'),
 ('revisionist', 'encouragement'),
 ('revisionist', 'thinking'),
 ('revisionist', 'manifestly'),
 ('thinking', 'encouragement'),
 ('thinking', 'revisionist'),
 ('thinking', 'manifestly'),
 ('thinking', 'fair'),
 ('manifestly', 'revisionist'),
 ('manifestly', 'thinking'),
 ('manifestly', 'fair'),
 ('manifestly', 'admit'),
 ('fair', 'thinking'),
 ('fair', 'manifestly'),
 ('fair', 'admit'),
 ('fair', 'fraternity'),
 ('admit', 'manifestly'),
 ('admit', 'fair'),
 ('admit', 'fraternity'),
 ('admit', 'constitutional'),
 ('fraternity', 'fair'),
 ('fraternity', 'admit'),
 ('fraternity', 'constitutional'),
 ('fraternity', 'right'),
 ('constitutional', 'admit'),
 ('constitutional', 'fraternity'),
 ('constitutional', 'right'),
 ('constitutional', 'refuse'),
 ('right', 'fraternity'),
 ('right', 'con

In [10]:
def encode_pairs(pairs: list[tuple[str, str]], word_to_idx: dict[str, int]) -> list[tuple[int, int]]:
    """Replace each word in the pairs with its vocabulary index.

    Args:
        pairs: (target_word, context_word) tuples, in the order produced by
            create_skipgram_pairs.
        word_to_idx: Mapping from word to integer index.

    Returns:
        A list of (target_idx, context_idx) tuples in the same order as
        ``pairs``; the position of each element within a pair is preserved.

    Raises:
        KeyError: If a word in ``pairs`` is not a key of ``word_to_idx``.
    """
    return [(word_to_idx[target], word_to_idx[context]) for target, context in pairs]

# Convert the word pairs to index pairs using the vocabulary built earlier.
encoded_pairs = encode_pairs(pairs, word_to_idx)
# Preview the first five encoded pairs.
encoded_pairs[:5]

[(0, 1), (0, 2), (1, 0), (1, 2), (1, 3)]