In [None]:
#| default_exp core

# core

> Load Reddit comments from parquet files and train a word2vec model on them.

In [None]:
#| export
from fastcore.script import *
from pathlib import Path
import gensim
from gensim.models import Word2Vec
import pandas as pd
import logging

In [None]:
# | export
def get_docs(
    dir_comments: str,  # directory containing parquet dataframes
    max_docs: int = None,  # upper bound on the number of randomly sampled documents; falsy keeps all
) -> pd.Series:
    """Read the `body` column of every parquet file in `dir_comments` and split it on whitespace.

    Returns a Series whose values are the token lists produced by `str.split()`.
    Raises `FileNotFoundError` when the directory contains no `*.parquet` file.
    """
    # Sorted so the row order of the full corpus does not depend on directory listing order
    fpaths = sorted(Path(dir_comments).glob('*.parquet'))
    if not fpaths:
        raise FileNotFoundError(f'no *.parquet files found in {dir_comments!r}')
    df = pd.read_parquet(fpaths, columns=['body'])
    print('finished reading parquet files')
    if max_docs:
        # `DataFrame.sample` raises ValueError when asked for more rows than
        # exist, so `max_docs` is applied as a cap rather than an exact count.
        df = df.sample(min(max_docs, len(df)))
    # NOTE(review): a null body stays NaN after `.str.split()` — confirm the data has none.
    docs = df['body'].str.split()
    print('finished tokenizing')
    return docs

In [None]:
#| notest
# Smoke run: tokenize a random sample of five documents from the local comments directory
docs = get_docs(
    dir_comments='/Users/quirin/proj/getreddit/out/Coronavirus/',
    max_docs=5,
)

finished reading parquet files
finished tokenizing


# train model

In [None]:
#| export
class Corpus:
    """Re-iterable wrapper around a collection of documents.

    Every pass over an instance yields the wrapped documents (expected to be
    lists of str) one by one, in the order the underlying collection provides.
    """

    def __init__(self, docs):
        # The wrapped collection is stored as-is; nothing is copied or cleaned here
        self.docs_clean = docs

    def __iter__(self):
        # Each call builds a fresh generator, so the corpus can be traversed repeatedly
        yield from self.docs_clean

In [None]:
#| export
class Word2VecLogger(gensim.models.callbacks.CallbackAny2Vec):
    """Training callback that reports epoch (and, when available, batch) progress via `logging`."""

    def __init__(self):
        self.epoch = 0  # number of epochs finished so far; doubles as the label of the running epoch

    def on_epoch_begin(self, model):
        """Log that the current epoch has started."""
        logging.info(f"Epoch {self.epoch} started")

    def on_epoch_end(self, model):
        """Log that the current epoch has finished, then advance the epoch counter."""
        logging.info(f"Epoch {self.epoch} finished")
        self.epoch += 1

    def on_batch_begin(self, model):
        """Batch-start hook; intentionally does nothing."""
        pass

    def on_batch_end(self, model, cumulative_stats=None):
        """Log batch-level percentages when a `cumulative_stats` mapping is supplied.

        gensim's documented hook signature is `on_batch_end(model)`, so
        `cumulative_stats` has to be optional: a required extra argument makes
        the hook raise `TypeError` when it is invoked with the model only.
        Without statistics there is nothing to report and the hook returns
        silently.

        When given, `cumulative_stats` must provide the keys `total_examples`,
        `total_words`, `job` (a pair whose difference is the sentence count),
        `raw_words` and `effective_words`.
        """
        if not cumulative_stats:
            return

        def _pct(part, whole):
            # A zero denominator (empty batch or corpus) reports 0% instead of raising
            return part / whole * 100 if whole else 0.0

        total_examples = cumulative_stats['total_examples']
        total_words = cumulative_stats['total_words']
        job = cumulative_stats['job']
        raw_words = cumulative_stats['raw_words']
        effective_words = cumulative_stats['effective_words']

        percentage_sentences = _pct(job[1] - job[0], total_examples)
        percentage_words = _pct(raw_words, total_words)
        percentage_effective_words = _pct(effective_words, raw_words)

        logging.info(f"Batch processed {percentage_sentences:.2f}% sentences and {percentage_words:.2f}% words")
        logging.info(f"Batch used {percentage_effective_words:.2f}% words effectively for training")

In [None]:
#| export
def train_model(
    docs,  # iterable of tokenized documents, handed to `Word2Vec` as its corpus
    *,
    workers=8,  # number of worker threads
    min_count=5,  # words with a lower total frequency are ignored
    window=5,  # maximum distance between the current and the predicted word
    epochs=5,  # number of passes over the corpus
    vector_size=300,  # dimensionality of the word vectors
    batch_words=10_000,  # target number of words per batch passed to a worker
):
    """Train a `Word2Vec` model on `docs`, logging progress at INFO level.

    The keyword-only hyperparameters default to the values that used to be
    hard-coded, so `train_model(docs)` behaves as before; small corpora can
    now lower `min_count` (or other settings) without editing this function.
    """
    # No-op when the root logger already has handlers configured
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    return Word2Vec(
        docs,
        workers=workers,
        min_count=min_count,
        window=window,
        epochs=epochs,
        vector_size=vector_size,
        batch_words=batch_words,
        callbacks=[Word2VecLogger()],
    )

In [None]:
# | export
@call_parse
def train_model_pipe(
    dir_comments: str,  # Directory containing parquet dataframes
    max_docs: int = None,  # Maximum number of documents (dataframe rows) sampled for training; all rows when omitted
    fp_model_out: str = None  # Save model to this file path
    ) -> Word2Vec:
    """
    Trains a word2vec model on the comments of a subreddit.

    Reads and tokenizes the documents with `get_docs`, wraps them in a
    `Corpus`, trains via `train_model`, and saves the model when
    `fp_model_out` is given. Returns the trained model.
    """
    # NOTE(review): if this is registered as a console-script entry point, the
    # returned model becomes the argument of `sys.exit` — confirm that is intended.
    docs = get_docs(dir_comments, max_docs)
    model = train_model(Corpus(docs))
    if fp_model_out:
        model.save(fp_model_out)
    return model

In [None]:
#| notest
# Smoke run on five sampled documents. The comments directory is the only
# positional argument; an extra leading positional would bind `max_docs` twice.
model = train_model_pipe('/Users/quirin/proj/getreddit/out/conspiracy', max_docs=5)

finished reading parquet files


2023-02-17 11:01:21,137 : INFO : collecting all words and their counts
2023-02-17 11:01:21,138 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-02-17 11:01:21,138 : INFO : collected 123 word types from a corpus of 168 raw words and 5 sentences
2023-02-17 11:01:21,138 : INFO : Creating a fresh vocabulary
2023-02-17 11:01:21,139 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 3 unique words (2.44% of original 123, drops 120)', 'datetime': '2023-02-17T11:01:21.139077', 'gensim': '4.3.0', 'python': '3.10.0 | packaged by conda-forge | (default, Nov 20 2021, 02:27:15) [Clang 11.1.0 ]', 'platform': 'macOS-13.1-arm64-arm-64bit', 'event': 'prepare_vocab'}
2023-02-17 11:01:21,139 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 18 word corpus (10.71% of original 168, drops 150)', 'datetime': '2023-02-17T11:01:21.139587', 'gensim': '4.3.0', 'python': '3.10.0 | packaged by conda-forge | (default, Nov 20 2021, 02:27:15) 

finished tokenizing
