In [1]:
#!pip3 install wordfreq datasets numpy tqdm

In [11]:
import datasets
from collections import Counter
from wordfreq import word_frequency
import numpy as np
import re
from tqdm import tqdm

def download_datasets():
    """
    Load every dataset this notebook works with.

    Returns:
    - dict: Maps each dataset name to whatever ``datasets.load_dataset``
      returns for the requested split.
    """
    # (dataset name, split) pairs to load
    wanted = (
        ("ajibawa-2023/General-Stories-Collection", "train"),
    )

    loaded = {}
    for repo_id, split_name in wanted:
        loaded[repo_id] = datasets.load_dataset(repo_id, split=split_name)
    return loaded

def parse_text(datasets):
    """
    Collect the 'text' field of every record in the stories dataset.

    Parameters:
    - datasets (dict): Mapping of dataset name to a loaded dataset. Only the
      "ajibawa-2023/General-Stories-Collection" entry is read. Note that this
      parameter shadows the imported ``datasets`` module inside the function.

    Returns:
    - list: The 'text' value of each record, in iteration order.
    """
    stories = datasets["ajibawa-2023/General-Stories-Collection"]
    # tqdm only wraps the iterable to display progress
    return [record['text'] for record in tqdm(stories)]


def get_word_counts(texts, min_length=4):
    """
    Count word frequencies in a list of texts.

    Each text is lowercased, and the typographic apostrophe (U+2019) is
    mapped to the ASCII apostrophe before matching, so a contraction is
    counted as one word ("couldn't") whichever apostrophe the text uses.
    Without that mapping the curly form would be split at the apostrophe
    and only its stem ("couldn") would be counted.

    Parameters:
    - texts (iterable of str): The input texts to process. Entries that are
      not ``str`` are skipped.
    - min_length (int): Minimum length of words to include.

    Returns:
    - Counter: A Counter object mapping words to their frequencies.
    """
    # A run of word characters, optionally followed by one apostrophe-joined
    # suffix (e.g. "couldn't"). Compiled once, outside the loop.
    pattern = re.compile(r"\b\w+(?:'\w+)?\b")

    word_counts = Counter()

    for text in tqdm(texts, desc="Counting words"):
        if not isinstance(text, str):
            continue  # Skip non-string entries to make the function more robust

        # Lowercase, then normalise the curly apostrophe so the pattern's
        # ASCII "'" also matches typographically-quoted contractions.
        normalized = text.lower().replace("\u2019", "'")
        words = pattern.findall(normalized)

        # Update counts with words that meet the minimum length
        word_counts.update(word for word in words if len(word) >= min_length)

    return word_counts


def analyze_word_rarity(word_counts):
    """
    Compare in-corpus word frequencies with wordfreq's English frequencies.

    Parameters:
    - word_counts (Counter): Maps each word to its number of occurrences.

    Returns:
    - tuple: ``(corpus_frequencies, wordfreq_frequencies, avg_corpus_rarity,
      avg_wordfreq_rarity, correlation)`` where
        * corpus_frequencies maps word -> count / total count,
        * wordfreq_frequencies maps word -> ``word_frequency(word, 'en')``,
        * the two averages are the mean of -log10(frequency), and
        * correlation is the Pearson coefficient of the two frequency lists.
      The averages and the correlation only use words whose wordfreq
      frequency is greater than zero.
    """
    def mean_rarity(frequencies):
        # Rarity of a single word is -log10 of its frequency.
        return np.mean([-np.log10(value) for value in frequencies])

    corpus_total = sum(word_counts.values())

    # Build both frequency tables in one pass over the counts.
    corpus_frequencies = {}
    wordfreq_frequencies = {}
    for token, occurrences in word_counts.items():
        corpus_frequencies[token] = occurrences / corpus_total
        wordfreq_frequencies[token] = word_frequency(token, 'en')

    # Words wordfreq reports as zero cannot be log-scaled, so leave them out.
    known_tokens = [token for token in wordfreq_frequencies if wordfreq_frequencies[token] > 0]
    corpus_values = [corpus_frequencies[token] for token in known_tokens]
    reference_values = [wordfreq_frequencies[token] for token in known_tokens]

    avg_corpus_rarity = mean_rarity(corpus_values)
    avg_wordfreq_rarity = mean_rarity(reference_values)

    # Off-diagonal entry of the 2x2 correlation matrix
    correlation = np.corrcoef(corpus_values, reference_values)[0, 1]

    return (
        corpus_frequencies,
        wordfreq_frequencies,
        avg_corpus_rarity,
        avg_wordfreq_rarity,
        correlation,
    )

def find_over_represented_words(corpus_frequencies, wordfreq_frequencies, top_n=50000):
    """
    Rank words by how much more frequent they are in the corpus than in wordfreq.

    Parameters:
    - corpus_frequencies (dict): word -> relative frequency in the corpus.
    - wordfreq_frequencies (dict): word -> reference frequency; must contain
      every key of ``corpus_frequencies``.
    - top_n (int): Maximum number of entries to return.

    Returns:
    - list of (word, ratio) tuples sorted by ratio, highest first, where
      ratio = corpus frequency / reference frequency. Words whose reference
      frequency is not greater than zero are left out.
    """
    ratios = {
        token: corpus_share / wordfreq_frequencies[token]
        for token, corpus_share in corpus_frequencies.items()
        if wordfreq_frequencies[token] > 0  # avoid dividing by zero
    }
    ranked = sorted(ratios.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

def find_zero_frequency_words(word_counts, wordfreq_frequencies, top_n=50000):
    """
    List the most frequent corpus words whose wordfreq frequency is zero.

    Parameters:
    - word_counts (Counter or dict): word -> number of occurrences.
    - wordfreq_frequencies (dict): word -> reference frequency; must contain
      every key of ``word_counts``.
    - top_n (int): Maximum number of entries to return.

    Returns:
    - list of (word, count) tuples sorted by count, highest first.
    """
    unknown = [
        (token, occurrences)
        for token, occurrences in word_counts.items()
        if wordfreq_frequencies[token] == 0
    ]
    unknown.sort(key=lambda pair: pair[1], reverse=True)
    return unknown[:top_n]



In [None]:

# Load every dataset listed in download_datasets() and keep the resulting
# name -> dataset mapping for the parsing cell.
print("Downloading datasets...")
all_datasets = download_datasets()


In [None]:

# Pull the raw 'text' field out of every record into a plain Python list.
print("Parsing text...")
texts = parse_text(all_datasets)


In [None]:

print(f"Total texts extracted: {len(texts)}")

# Tally lowercase words across all texts. get_word_counts is called with its
# default min_length, so words shorter than 4 characters are not counted.
print("Counting words...")
word_counts = get_word_counts(texts)


In [None]:

def filter_mostly_numeric(word_counts, max_digit_ratio=0.2):
    """
    Drop words in which digits make up too large a share of the characters.

    Parameters:
    - word_counts (Counter or dict): word -> number of occurrences.
    - max_digit_ratio (float): A word is dropped when
      (number of digit characters) / len(word) is strictly greater than this
      value. Defaults to 0.2, the threshold that was previously hard-coded.

    Returns:
    - Counter: A new Counter holding only the words that were kept; the
      input mapping is not modified.
    """
    def is_mostly_numbers(word):
        if not word:
            # An empty key contains no digits; returning early also avoids
            # a ZeroDivisionError in the ratio below.
            return False
        digit_count = sum(c.isdigit() for c in word)
        return digit_count / len(word) > max_digit_ratio

    # Create a new Counter with filtered words
    return Counter({word: count for word, count in word_counts.items() if not is_mostly_numbers(word)})

# Remove digit-heavy tokens before any statistics are computed.
filtered_counts = filter_mostly_numeric(word_counts)

print("Analyzing word rarity...")
corpus_frequencies, wordfreq_frequencies, avg_corpus_rarity, avg_wordfreq_rarity, correlation = analyze_word_rarity(filtered_counts)

# Report the size of the vocabulary that was actually analyzed, i.e. the
# filtered counts rather than the raw word_counts.
print(f"Total unique words analyzed: {len(filtered_counts)}")
print(f"Average corpus rarity: {avg_corpus_rarity:.4f}")
print(f"Average wordfreq rarity: {avg_wordfreq_rarity:.4f}")
print(f"Correlation between corpus and wordfreq frequencies: {correlation:.4f}")

# NOTE(review): both loops below print every returned entry, which is up to
# 50,000 lines each with the helpers' default top_n.
print("\nMost over-represented words in the corpus:")
over_represented = find_over_represented_words(corpus_frequencies, wordfreq_frequencies)
for word, score in over_represented:
    print(f"{word}: {score:.2f} times more frequent than expected")

print("\nMost frequent words with zero wordfreq frequency:")
zero_freq_words = find_zero_frequency_words(filtered_counts, wordfreq_frequencies)
for word, count in zero_freq_words:
    print(f"{word}: {count} occurrences")


In [13]:

import json


def _write_json(payload, path):
    """Serialize ``payload`` as JSON into ``path``, overwriting the file."""
    with open(path, 'w') as handle:
        json.dump(payload, handle)


# Persist both ranked lists so they can be reused outside the notebook.
_write_json(over_represented, 'over_represented_words.json')
_write_json(zero_freq_words, 'frequent_non_dictionary_words.json')

In [None]:
# Spot check: relative corpus frequency of a single word. Raises KeyError if
# the word is not a key of corpus_frequencies.
corpus_frequencies['testament']