Margarita Undalova

01/1365289

This file contains the functions that perform all the necessary tasks.
This module provides functions for analyzing text data: loading documents, computing statistics, term frequencies, type-token ratios, n-grams, concordances, collocations, and unique-word lists.

Modular functions:

- load_documents: loads text documents from a specified directory.
- compute_statistics: computes basic statistics such as line counts and token counts.
- term_frequency: computes term frequencies for a given text.
- compute_term_frequencies: computes term frequencies for all documents and global term frequencies.
- type_token_ratio: computes type-token ratios for a given text.
- compute_ttr_for_documents: computes type-token ratios for all documents and the entire corpus.
- generate_ngrams: generates n-grams for a given text.
- compute_ngrams_for_documents: calculates n-grams for all documents and the entire corpus.
- generate_concordance: generates a concordance for a given word in the text.
- find_collocations: finds collocations (bigrams or trigrams) in the text.
- generate_unique_words: generates unique words from the text.
- save_unique_words_to_file: saves unique words to a file.

Note: The `nltk` library is used for tokenization, n-grams, and collocations; `matplotlib` is imported for plotting.


In [3]:
import os
import matplotlib.pyplot as plt
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import nltk

# Download the NLTK 'punkt' tokenizer data when this cell/module is executed.
# NOTE(review): word_tokenize presumably depends on this data; newer NLTK
# releases may require 'punkt_tab' instead — confirm against the installed version.
nltk.download('punkt')

def load_documents(directory):
    """
    Loads text documents from the specified directory.

    Only regular files whose names end in ".txt" are read; sub-directories are
    not searched. Entries are processed in sorted name order so that the key
    order of the returned dictionary is reproducible across runs and platforms.
    Arguments:
    directory (str): Path to the directory containing the text files.
    Returns:
    dict: A dictionary where the keys are the file names and the values are the file contents.
    """
    documents = {}
    # os.listdir returns entries in arbitrary order; sort for deterministic output.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        # Skip other extensions and directories that merely carry a ".txt" name
        # (opening a directory would raise an error).
        if not filename.endswith(".txt") or not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            documents[filename] = file.read()
    return documents

def compute_statistics(documents):
    """
    Computes line and token counts for each document and for the whole collection.
    Arguments:
    documents (dict): A dictionary of documents, where keys are file names and values are text.
    Returns:
    dict: A dictionary with the keys 'total_documents', 'total_lines', 'total_tokens'
    and 'document_stats'; the latter maps each file name to its own
    'num_lines' and 'num_tokens'.
    """
    document_count = len(documents)
    per_document = {}
    line_total = 0
    token_total = 0
    for name, body in documents.items():
        line_count = len(body.splitlines())
        # Tokens are counted on the raw text (no lower-casing or filtering).
        token_count = len(word_tokenize(body))
        per_document[name] = {'num_lines': line_count, 'num_tokens': token_count}
        line_total += line_count
        token_total += token_count
    return {
        'total_documents': document_count,
        'total_lines': line_total,
        'total_tokens': token_total,
        'document_stats': per_document,
    }

def term_frequency(text):
    """
    Counts how often each alphabetic word occurs in a text.

    The text is lower-cased before tokenization, and tokens that are not
    purely alphabetic (punctuation, numbers, mixed tokens) are ignored.
    Arguments:
    text (str): The text to analyze.
    Returns:
    collections.Counter: A mapping from each word to its frequency.
    """
    lowered = text.lower()
    return Counter(token for token in word_tokenize(lowered) if token.isalpha())

def compute_term_frequencies(documents):
    """
    Computes per-document term frequencies and the corpus-wide totals.
    Arguments:
    documents (dict): A dictionary of documents, where keys are file names and values are text.
    Returns:
    tuple: A tuple containing two elements:
    - dict: Term frequencies (collections.Counter) for each file name.
    - collections.Counter: Term frequencies summed over all documents.
    """
    per_document = {
        name: term_frequency(body) for name, body in documents.items()
    }
    corpus_counts = Counter()
    # Counter.update adds counts rather than replacing them.
    for counts in per_document.values():
        corpus_counts.update(counts)
    return per_document, corpus_counts

def type_token_ratio(text):
    """
    Computes the type-token ratio (unique words / total words) of a text.

    The text is lower-cased before tokenization and only purely alphabetic
    tokens are counted.
    Arguments:
    text (str): Text to analyze.
    Returns:
    float: Ratio of unique words to total words, or 0 when the text has no words.
    """
    alpha_tokens = [tok for tok in word_tokenize(text.lower()) if tok.isalpha()]
    # Guard against division by zero for texts without any alphabetic token.
    if not alpha_tokens:
        return 0
    return len(set(alpha_tokens)) / len(alpha_tokens)

def compute_ttr_for_documents(documents):
    """
    Computes the type-token ratio for all documents and for the entire corpus.

    Each document is lower-cased and tokenized exactly once; only purely
    alphabetic tokens count as words (the same filtering that
    type_token_ratio applies). The corpus ratio is the number of distinct
    words across all documents divided by the total number of words.
    Arguments:
    documents (dict): A dictionary of documents, where keys are filenames and values are text.
    Returns:
    tuple: A tuple containing two elements:
    - dict: The type-token ratio for each document (0 for a document without words).
    - float: The type-token ratio for the entire corpus (0 if the corpus has no words).
    """
    document_ttrs = {}
    corpus_types = set()
    corpus_token_count = 0
    for filename, text in documents.items():
        words = [token for token in word_tokenize(text.lower()) if token.isalpha()]
        document_ttrs[filename] = len(set(words)) / len(words) if words else 0
        # Accumulate corpus totals directly from the tokens instead of joining
        # them back into a string and tokenizing that string again.
        corpus_types.update(words)
        corpus_token_count += len(words)
    global_ttr = len(corpus_types) / corpus_token_count if corpus_token_count else 0
    return document_ttrs, global_ttr

def generate_ngrams(text, n):
    """
    Counts the n-grams of a text.

    The text is lower-cased before tokenization and only purely alphabetic
    tokens take part in the n-grams.
    Arguments:
    text (str): Text to analyze.
    n (int): Size of n-grams (e.g. 1 for unigrams, 2 for bigrams, etc.).
    Returns:
    collections.Counter: A mapping where keys are n-grams (tuples of words) and values are their frequencies.
    """
    alpha_tokens = list(filter(str.isalpha, word_tokenize(text.lower())))
    return Counter(ngrams(alpha_tokens, n))

def compute_ngrams_for_documents(documents):
    """
    Computes unigrams, bigrams and trigrams for all documents and the entire corpus.

    The corpus counts are computed on the tokens of all documents joined into
    one text, so bigrams and trigrams may span the boundary between two
    consecutive documents.
    Arguments:
    documents (dict): A dictionary of documents, where keys are filenames and values are text.
    Returns:
    tuple: A tuple containing two elements:
    - dict: For each filename, a dictionary with the keys 'unigrams', 'bigrams' and 'trigrams'.
    - dict: A dictionary with the same three keys for the entire corpus.
    """
    sizes = (('unigrams', 1), ('bigrams', 2), ('trigrams', 3))
    document_ngrams = {}
    corpus_tokens = []
    for name, body in documents.items():
        document_ngrams[name] = {
            label: generate_ngrams(body, size) for label, size in sizes
        }
        corpus_tokens += word_tokenize(body.lower())
    corpus_text = ' '.join(corpus_tokens)
    corpus_ngrams = {
        label: generate_ngrams(corpus_text, size) for label, size in sizes
    }
    return document_ngrams, corpus_ngrams

def generate_concordance(text, word, window=5):
    """
    Generates concordance for a given word in a text.

    The text is lower-cased before tokenization and the search is
    case-insensitive. Context tokens are joined with single spaces.
    Arguments:
    text (str): Text to analyze.
    word (str): Word to search for in text.
    window (int): Maximum number of tokens of context on each side.
    Returns:
    list: List of tuples, each containing the left context, the search word
    (as passed in), and the right context.
    """
    tokens = word_tokenize(text.lower())
    target = word.lower()  # loop-invariant, so computed once
    concordance_list = []
    for index, token in enumerate(tokens):
        if token != target:
            continue
        left_context = tokens[max(index - window, 0):index]
        # Slicing already clamps the upper bound to the end of the list.
        right_context = tokens[index + 1:index + window + 1]
        concordance_list.append((' '.join(left_context), word, ' '.join(right_context)))
    return concordance_list

def find_collocations(text, n):
    """
    Scores the bigram or trigram collocations of a text.

    The text is lower-cased before tokenization, only purely alphabetic tokens
    are kept, and n-grams are scored with the likelihood-ratio measure.
    Arguments:
    text (str): Text to analyze.
    n (int): Size of collocations (2 for bigrams, 3 for trigrams).
    Returns:
    list: List of tuples, where each tuple contains an n-gram and its score.
    Raises:
    ValueError: If n is neither 2 nor 3.
    """
    words = [tok for tok in word_tokenize(text.lower()) if tok.isalpha()]
    if n not in (2, 3):
        raise ValueError("Only bigrams and trigrams are supported")
    if n == 2:
        finder_cls = nltk.collocations.BigramCollocationFinder
        measure = nltk.metrics.BigramAssocMeasures.likelihood_ratio
    else:
        finder_cls = nltk.collocations.TrigramCollocationFinder
        measure = nltk.metrics.TrigramAssocMeasures.likelihood_ratio
    return finder_cls.from_words(words).score_ngrams(measure)

def generate_unique_words(text):
    """
    Collects the distinct alphabetic words of a text.

    The text is lower-cased before tokenization; tokens that are not purely
    alphabetic are dropped.
    Arguments:
    text (str): Text to analyze.
    Returns:
    set: Set of unique words in the text.
    """
    return {tok for tok in word_tokenize(text.lower()) if tok.isalpha()}

def save_unique_words_to_file(filename, unique_words):
    """
    Save unique words to a file, one word per line in sorted order.

    The file is written as UTF-8 (the same encoding load_documents uses for
    reading) so that non-ASCII words do not depend on the platform's default
    encoding. An existing file is overwritten.
    Arguments:
    filename (str): Path to file to save unique words.
    unique_words (set): Set of unique words to save.
    """
    with open(filename, 'w', encoding='utf-8') as file:
        # Hand all lines to writelines in one call instead of one write per word.
        file.writelines(f"{word}\n" for word in sorted(unique_words))


[nltk_data] Downloading package punkt to
[nltk_data]     /users/margarita.undalova/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
