In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!git clone https://github.com/zash13/intro-to-ir.git code
!cd ./code && ls && git lfs pull 
%cd code/src

Cloning into 'code'...
remote: Enumerating objects: 127, done.[K
remote: Counting objects: 100% (127/127), done.[K
remote: Compressing objects: 100% (75/75), done.[K
remote: Total 127 (delta 54), reused 113 (delta 40), pack-reused 0 (from 0)[K
Receiving objects: 100% (127/127), 1.35 MiB | 12.54 MiB/s, done.
Resolving deltas: 100% (54/54), done.
Filtering content: 100% (2/2), 101.61 MiB | 35.83 MiB/s, done.
cbow2_loss.png	dataset		pyrightconfig.json  src
cbow_loss.png	loss_curve.png	README.md
/kaggle/working/code/src


In [3]:
!pwd

/kaggle/working/code/src


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from embeding_model import CBOW2, SkipGram
from collections import Counter
from tqdm import tqdm

BASE_DIR = os.getcwd()

DATASET_PATH = os.path.join(BASE_DIR, "..", "dataset", "Text8", "text8.txt")
DATASET_PATH = os.path.normpath(DATASET_PATH)


class Token:
    def __init__(self, vocab) -> None:
        self.special_tokens = {
            "<#START>": 0,
            "<#PAD>": 1,
            "<#UNKNOWN>": 2,
            "<#END>": 3,
        }
        self.token_map = self._generate_token_map(vocab)

    def _generate_token_map(self, vocab) -> dict[str, int]:
        token_map = {
            word: (idx + len(self.special_tokens)) for idx, word in enumerate(vocab)
        }
        return {**self.special_tokens, **token_map}

    def get_token_map(self):
        return self.token_map

    def tokenize(self, input):
        result = []
        for word in input.split():
            result.append(self.token_map.get(word, self.token_map["<#UNKNOWN>"]))
        return [self.token_map["<#START>"]] + result + [self.token_map["<#END>"]]

    def binary_vector(self, token_list: list[int]):
        result_list = np.zeros(len(self.token_map), dtype=int)
        for token in token_list:
            if 0 <= token < len(self.token_map):
                result_list[token] = 1

        return result_list

    @staticmethod
    def clean_input(input):
        input = (
            input.replace(",", "")
            .replace("!", "")
            .replace("?", "")
            .replace("(", "")
            .replace(")", "")
            .replace(":", "")
        )
        return input


class CSVStorage:
    @staticmethod
    def save(df, filename, index=False):
        df.to_csv(filename, index=index)

    @staticmethod
    def load(filename):
        if not os.path.exists(filename):
            raise FileNotFoundError(f"The file {filename} does not exist")
        return pd.read_csv(filename)


def plot_loss(loss_values, filename="loss_curve.png"):
    plt.figure(figsize=(10, 6))
    plt.plot(loss_values, label="Training Loss", color="blue")
    plt.title("Training Loss Over Epochs")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.grid(True)
    plt.legend()
    plt.savefig(filename)
    plt.close()


def generate_cbow_skipgram_data(
    tokenizer: "Token", sentences, window_size, vocab_size, model_type="cbow"
):
    # if window size is even
    if window_size % 2 == 0:
        return False, False
    half_window = (window_size - 1) / 2
    half_window = int(half_window)
    cbow_inputs, cbow_targets = [], []
    cbow_inputs2, cbow_targets2 = [], []
    skipgram_inputs, skipgram_targets = [], []

    for sentence in sentences:
        tokenized = tokenizer.tokenize(sentence)
        length = len(tokenized)

        # for example if i have 5 word and window_size is 3 , then i will get
        # pad word0 word1
        # word0 word1 word2
        # word1 word2 word3
        # ....
        # word4 word5 pad
        # while always middle word is target
        # so i will have 5 target (each word have chance to be a target )
        # len - 2 , case i have 2 pad , in beggening and end
        for idx in range(half_window, length - 2):
            context = (
                tokenized[idx - half_window : idx]
                + tokenized[idx + 1 : idx + half_window + 1]
            )
            target = tokenized[idx]

            binary_vector_input = tokenizer.binary_vector(context)
            binary_vector_target = tokenizer.binary_vector([target])
            if model_type == "cbow":
                cbow_inputs.append(binary_vector_input)
                cbow_targets.append(binary_vector_target)
                cbow_inputs2.append(context)
                cbow_targets2.append(target)
            elif model_type == "skipgram":
                for context_word in context:
                    skipgram_inputs.append(binary_vector_target)
                    skipgram_targets.append(binary_vector_input)

    if model_type == "cbow":
        return (cbow_inputs2, cbow_targets2)
    else:
        return skipgram_inputs, skipgram_targets


# preprocess data
data = []
with open(DATASET_PATH) as f:
    data = f.read()
print(type(data))
# output : <class 'str'>
data_clean = Token.clean_input(data)
words_list = data.lower().split()
unique_words_list = set(words_list)
unique_words_list = sorted(unique_words_list)
vocab = [Token.clean_input(word) for word in unique_words_list]
vocab = list(set(vocab))
vocab_size = len(vocab)
print(vocab_size, vocab[:10])
# output : 253854 ['excellite', 'spins', 'supertoys', 'xdarwin', 'neuharth', 'strettodimessina', 'cowpuncher', 'bloomington', 'rounding', 'operaci']
tokenhelper = Token(vocab)
toekn_map = tokenhelper.get_token_map()
print(type(toekn_map))
print(toekn_map.get("excellite", " "))
# output : <class 'dict'> 187747

# data set hase no abbility to become list of sentences , its just words , without anything that help me to seprate them into sentences
max_word = 20  # words per sentence
chunk_size = 100  # jumber of sentences to process at once
min_word_freq = 5

# update 1 :
# still have problem with size of vocab ,
# so i try to use tf to find stopwords and words that have low frequency
#
stopwords = [
    "a",
    "about",
    "above",
    "after",
    "again",
    "against",
    "all",
    "am",
    "an",
    "and",
    "any",
    "are",
    "aren't",
    "as",
    "at",
    "be",
    "because",
    "been",
    "before",
    "being",
    "below",
    "between",
    "both",
    "but",
    "by",
    "can't",
    "cannot",
    "could",
    "couldn't",
    "did",
    "didn't",
    "do",
    "does",
    "doesn't",
    "doing",
    "don't",
    "down",
    "during",
    "each",
    "few",
    "for",
    "from",
    "further",
    "had",
    "hadn't",
    "has",
    "hasn't",
    "have",
    "haven't",
    "having",
    "he",
    "he'd",
    "he'll",
    "he's",
    "her",
    "here",
    "here's",
    "hers",
    "herself",
    "him",
    "himself",
    "his",
    "how",
    "how's",
    "i",
    "i'd",
    "i'll",
    "i'm",
    "i've",
    "if",
    "in",
    "into",
    "is",
    "isn't",
    "it",
    "it's",
    "its",
    "itself",
    "let's",
    "me",
    "more",
    "most",
    "mustn't",
    "my",
    "myself",
    "no",
    "nor",
    "not",
    "of",
    "off",
    "on",
    "once",
    "only",
    "or",
    "other",
    "ought",
    "our",
    "ours",
    "ourselves",
    "out",
    "over",
    "own",
    "same",
    "shan't",
    "she",
    "she'd",
    "she'll",
    "she's",
    "should",
    "shouldn't",
    "so",
    "some",
    "such",
    "than",
    "that",
    "that's",
    "the",
    "their",
    "theirs",
    "them",
    "themselves",
    "then",
    "there",
    "there's",
    "these",
    "they",
    "they'd",
    "they'll",
    "they're",
    "they've",
    "this",
    "those",
    "through",
    "to",
    "too",
    "under",
    "until",
    "up",
    "very",
    "was",
    "wasn't",
    "we",
    "we'd",
    "we'll",
    "we're",
    "we've",
    "were",
    "weren't",
    "what",
    "what's",
    "when",
    "when's",
    "where",
    "where's",
    "which",
    "while",
    "who",
    "who's",
    "whom",
    "why",
    "why's",
    "will",
    "with",
    "won't",
    "would",
    "wouldn't",
    "you",
    "you'd",
    "you'll",
    "you're",
    "you've",
    "your",
    "yours",
    "yourself",
    "yourselves",
]
# update2 : this is so slow , it not work for this many words
# tf_doc = {word: words_list.count(word) for word in vocab}
word_counts = Counter(words_list)
tf_doc = {word: word_counts[word] for word in vocab}
for word in tf_doc:
    if tf_doc.get(word, 0) < min_word_freq:
        stopwords.append(word)
stopwords_set = set(stopwords)
filter_dataset = [
    word
    for word in tqdm(words_list, desc="Filtering words")
    if word not in stopwords_set
]
print(f"orginal dataset size {len(words_list)}")
print(f"filtered dataset size {len(filter_dataset)}")


def generate_sentences(words, words_per_sentence):
    for i in range(0, len(words), words_per_sentence):
        yield " ".join(words[i : i + words_per_sentence])


cbow_model = CBOW2(
    vocab_size=len(tokenhelper.token_map), window_size=3, embedding_size=300, epoch=20
)

all_loss = []
sentence_generator = generate_sentences(filter_dataset, max_word)

while True:
    chunk = []
    for _ in range(chunk_size):
        try:
            chunk.append(next(sentence_generator))
        except StopIteration:
            break

    if not chunk:
        break

    print(
        f"Processing chunk of {len(chunk)} sentences -> total words : {len(chunk) * len(chunk[0])}"
    )

    cbow_inputs, cbow_targets = generate_cbow_skipgram_data(
        tokenhelper,
        chunk,
        window_size=3,
        vocab_size=len(tokenhelper.token_map),
        model_type="cbow",
    )

    chunk_loss = cbow_model.fit(cbow_inputs, cbow_targets)
    all_loss.extend(chunk_loss)

plot_loss(all_loss, "cbow_loss.png")
y_pred = cbow_model.predict(["hello", "are"])


<class 'str'>
253854 ['mntv', 'apostelgeschichten', 'toponym', 'chaitya', 'kerin', 'assertive', 'magnifica', 'penchants', 'margravate', 'statuettes']
<class 'dict'>
184353


Filtering words: 100%|██████████| 17005207/17005207 [00:03<00:00, 5387026.18it/s]


orginal dataset size 17005207
filtered dataset size 10787112
Processing chunk of 100 sentences -> total words : 15200
(1900, 2) (1900,)
Epoch 1/20
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 2s/step - accuracy: 0.0365 - loss: 12.4339
Epoch 2/20
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 2s/step - accuracy: 0.1071 - loss: 12.2830
Epoch 3/20
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 2s/step - accuracy: 0.0770 - loss: 11.7903
Epoch 4/20
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 2s/step - accuracy: 0.0684 - loss: 10.4727
Epoch 5/20
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 2s/step - accuracy: 0.0713 - loss: 8.7682
Epoch 6/20
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 2s/step - accuracy: 0.1128 - loss: 6.9223
Epoch 7/20
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 2s/step - accuracy: 0.1289 - loss: 6.1269
Epoch 8/20
[1m60/60[0m [32m━━