Check the avg sentence length of the corpus to be fed to FastText, to make sure the window size makes sense (or we need to buffer 2-3 sentences together)

In [1]:
import os
import pandas as pd
import numpy as np
from gensim.utils import simple_preprocess
import sys
from tqdm import tqdm

# autoreload imports within same session when rerunning cell
%load_ext autoreload

%autoreload 2
# this is needed, cause notebooks not on same level as helpers
sys.path.insert(0, os.path.abspath(".."))  # assumes notebook is in {root}/notebooks/{fname}.ipynb
from helpers.data_fetchers import fetch_sl_stopwords

In [2]:
stopwords = fetch_sl_stopwords('../data/stopwords_sl.txt')

In [3]:
def compute_sentence_length_stats_np(directory, stopwords=None):
    """
    Computes mean and std of sentence lengths per text file using numpy.
    
    Args:
        directory (str): Path to directory with `.txt` files.
        stopwords (set, optional): Stopwords to remove.
        
    Returns:
        pd.DataFrame: Columns = txt_name, num_sentences, avg_sent_length, std_length
    """
    if stopwords:
        stopwords = set(simple_preprocess(' '.join(stopwords), deacc=True))

    data = []
    
    for filename in tqdm(os.listdir(directory)):
        if filename.endswith('.txt'):
            filepath = os.path.join(directory, filename)
            sentence_lengths = []
            
            with open(filepath, 'r', encoding='utf-8') as file:
                for line in file:
                    tokens = simple_preprocess(line, deacc=True)
                    if stopwords:
                        tokens = [w for w in tokens if w not in stopwords]
                    if tokens:  # Only count non-empty sentences
                        sentence_lengths.append(len(tokens))
            
            if sentence_lengths:
                avg_len = np.mean(sentence_lengths)
                std_len = np.std(sentence_lengths, ddof=1) if len(sentence_lengths) > 1 else 0
                num_sent = len(sentence_lengths)
            else:
                avg_len, std_len, num_sent = 0, 0, 0

            if 'paragraph' in directory:
                unit = 'paragraph'
            elif 'sentence' in directory:
                unit = 'sentence'
            else:
                unit = 'x'
            
            data.append({
                "txt_name": filename,
                f"num_{unit}s": num_sent,
                f"avg_{unit}_length": avg_len,
                "std_length": std_len
            })
            
    df = pd.DataFrame(data)
    return df

In [4]:
df = compute_sentence_length_stats_np('../data/lemma_txt_corpus/sentence', stopwords)
df.to_csv('../output/avg_sent_length.csv')
df

100%|██████████| 281/281 [00:18<00:00, 15.00it/s]


Unnamed: 0,txt_name,num_sentences,avg_sentence_length,std_length
0,SHAME_92.txt,2024,6.180336,5.185019
1,SHAME_242.txt,2458,6.723352,4.719390
2,SHAME_136.txt,3605,6.893481,4.589688
3,SHAME_204.txt,2975,6.128403,3.977363
4,SHAME_108.txt,1490,7.095973,5.239161
...,...,...,...,...
276,SHAME_207.txt,1030,9.834951,5.957571
277,SHAME_40.txt,4800,5.699375,4.302512
278,SHAME_264.txt,2033,6.555337,4.156275
279,SHAME_27.txt,5444,6.767818,4.882385


In [5]:
df = compute_sentence_length_stats_np('../data/original_txt_corpus/paragraph', stopwords)
df.to_csv('../output/avg_paragraph_length.csv')
df

100%|██████████| 326/326 [00:16<00:00, 19.26it/s]


Unnamed: 0,txt_name,num_paragraphs,avg_paragraph_length,std_length
0,SHAME_92.txt,584,23.171233,38.613999
1,SHAME_242.txt,852,22.156103,25.511092
2,SHAME_136.txt,1339,21.616131,21.821101
3,SHAME_204.txt,1044,19.812261,26.652719
4,SHAME_108.txt,609,19.091954,28.541949
...,...,...,...,...
321,SHAME_40.txt,1785,18.274510,21.328383
322,SHAME_264.txt,858,18.413753,15.373761
323,SHAME_27.txt,2314,18.178911,23.161736
324,SHAME_68.txt,52,30.423077,28.863673
