In [168]:
import re
from nltk.tokenize import sent_tokenize
import xml.etree.ElementTree as ET
import torch.nn as nn
import torch
import torch.nn.functional as F

raw_text = "Hello. (This is known as irony.) The U.S. is amazing. Those couples are still together, but Jayre and Carolyn called it quits.This is my world. HAPPY BIRTHDAY KATE!!!!! Welcome to my world. Everything is fine? Is ther anything i can do for you. He's one a kind. There's something you should know: i'm here; but he is not there."

def clean_sentence(sentence):
    # Remove punctuation marks from the sentence
    # cleaned_sentence = sentence.translate(str.maketrans("", "", string.punctuation))
    # cleaned_sentence = re.sub(r"[^\w\s']+", "", sentence)
    return re.sub(r"[^\w\s']+", "", sentence)

def text_to_sentences(text):
    # Split the text into sentences using regular expressions
    text_without_parentheses = re.sub(r'\([^)]*\)', '', text)
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text_without_parentheses)
    return sentences

# for sent in sents:
#     print(sent)

# print()
# for sent in map(clean_sentences, sents):
#     print(sent)
# for sent in sents:
#     print(sent)



def get_sentences(raw_text):
    sentences = sent_tokenize(raw_text)
    return map(clean_sentence, sentences)
for s in get_sentences(raw_text):
    print(s)


Hello
This is known as irony
The US is amazing
Those couples are still together but Jayre and Carolyn called it quitsThis is my world
HAPPY BIRTHDAY KATE
Welcome to my world
Everything is fine
Is ther anything i can do for you
He's one a kind
There's something you should know i'm here but he is not there


In [94]:
def extract_posts_from_xml(xml_content) -> str:
    posts = []
    
    # Parse the XML content
    xml_content = re.sub(r'&nbsp;', ' ', xml_content)
    root = ET.fromstring(xml_content)
    
    # Find all <post> elements
    post_elements = root.findall('.//post')
    
    # Extract the text from each <post> element
    for post_element in post_elements:
        post_text = post_element.text.strip()
        posts.append(post_text)
    
    return " ".join(posts)

filename = "./blogs/980769.male.25.indUnk.Capricorn.xml"
with open(filename, 'r', encoding='utf-8') as f:
    text = f.read()

posts = extract_posts_from_xml(text)
all_sentences = list(get_sentences(posts))
print(all_sentences[1])
words = all_sentences[1].split()




# for x, y in zip(xs, ys):
#     print(f"{x} --> {y}")



Something similar should happen in religious debate to people who talk about Torahbased Judaism


In [213]:
from typing import List

# creating vocab
vocab_set = set()
sentences = all_sentences
for sentence in sentences:
    for word in sentence.split():
        vocab_set.add(word)
vocab = sorted(list(vocab_set))

vocab_size = len(vocab)
w2i = {w:i for i, w in enumerate(vocab)}
side_spread = 2
batch_size = 32
n_embed = 10
n_hidden = 10


def get_context_n_target(words):
    X, Y = [], []
    if len(words) < side_spread + 1:
        return X, Y
    
    for i in range(len(words)):
        l = max(0, i - side_spread)
        r = min(i + side_spread, len(words) - 1)
        l = max(l, i - (r - i))
        r = min(r, i + (i - l))
        if i == r:
            continue
        x = list(map(w2i.get, [*words[l:i], *words[i+1:r+1]]))
        X.append(x)

        y = w2i[words[i]]
        Y.append(y)
    return torch.tensor(X), torch.tensor(Y)

def get_x_y(words):
    window = 2*side_spread + 1
    if len(words) < window:
        return [], []
    x, y = [], []
    for i in range(len(words) - window + 1):
        context_words = [*words[i:i+side_spread], *words[i+side_spread+1:i+2*side_spread+1]]
        target_word = words[i+side_spread]
        x.append([w2i[w] for w in context_words])
        y.append(w2i[target_word])
    
    return x, y
    


def get_all_data(sentences: List[str]):
    all_x, all_y = [], []
    for sentence in sentences:
        words = sentence.split()
        x, y = get_x_y(words)
        all_x.extend(x)
        all_y.extend(y)
    return torch.tensor(all_x), torch.tensor(all_y)

# splitting training and validation sets
X, Y = get_all_data(all_sentences)
# X, Y = get_context_n_target(words)
n = int(0.9 * vocab_size)
train_x, train_y = X[:n], Y[:n]
val_x, val_y = X[n:], Y[n:]


g = torch.Generator().manual_seed(1337)
def get_batch(split):
    x, y = (train_x, train_y) if 'train' == split else (val_x, val_y)
    idx = torch.randint(len(x), (batch_size, ), generator=g)
    return x[idx], y[idx]

class CBOW(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, n_embed)
        self.proj = nn.Sequential(
            nn.Linear(n_embed, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, vocab_size),
        )


    def forward(self, x: torch.Tensor, target: torch.Tensor = None):
        embeddings: torch.Tensor = self.embedding(x)
        summed = embeddings.mean(1, keepdim=False) # (B, n_embed)
        logits = self.proj(summed) # (B, vocab_size)
        loss = None
        if target is not None:
            loss = F.cross_entropy(logits, target)

        return logits, loss

@torch.no_grad()
def estimate_loss(model: nn.Module, eval_iters: int):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


epochs = 20000
def training(eval_iters: int):
    m = CBOW()
    n_param = sum(p.numel() for p in m.parameters())
    print(f'{n_param=}')
    optim = torch.optim.SGD(m.parameters(), lr=1e-2)
    for i in range(epochs):
        # get batch
        x, y = get_batch('train')

        # forward
        _, loss = m(x, y)
        if i % eval_iters == 0 or i == epochs - 1:
            losses = estimate_loss(m, eval_iters)
            print(f"step: {i} training loss: {losses['train']:.4f} valuation loss: {losses['val']:.4f}")


        # backward
        optim.zero_grad(set_to_none=True)
        loss.backward()

        # update
        optim.step()
    
    return m

m = training(100)

step: 0 training loss: 7.6365 valuation loss: 7.6253
step: 100 training loss: 7.6176 valuation loss: 7.6072
step: 200 training loss: 7.6027 valuation loss: 7.5901
step: 300 training loss: 7.5965 valuation loss: 7.5740
step: 400 training loss: 7.5776 valuation loss: 7.5609
step: 500 training loss: 7.5675 valuation loss: 7.5590
step: 600 training loss: 7.5440 valuation loss: 7.5273
step: 700 training loss: 7.5161 valuation loss: 7.5310
step: 800 training loss: 7.5090 valuation loss: 7.5210
step: 900 training loss: 7.4945 valuation loss: 7.4980
step: 1000 training loss: 7.4777 valuation loss: 7.4879
step: 1100 training loss: 7.4865 valuation loss: 7.4879
step: 1200 training loss: 7.4715 valuation loss: 7.4608
step: 1300 training loss: 7.4401 valuation loss: 7.4321
step: 1400 training loss: 7.4095 valuation loss: 7.4303
step: 1500 training loss: 7.3977 valuation loss: 7.4064
step: 1600 training loss: 7.3541 valuation loss: 7.3870
step: 1700 training loss: 7.3504 valuation loss: 7.3780
step

tensor(10.5176, grad_fn=<NllLossBackward0>)