In [1]:
import re
import math
import numpy as np
from collections import defaultdict
from typing import List, Tuple, Dict

# Tokenization and Preprocessing Functions
def process_data(data: str) -> Tuple[List[str], List[List[str]]]:
    """
    Split raw text into sentences (one per line) and tokenize each of them.

    Args:
        data: Raw text; every newline-separated line is treated as a sentence.

    Returns:
        A pair ``(sentences, tokenized_sentences)``:
        ``sentences`` holds the stripped, non-empty lines in their original
        casing, and ``tokenized_sentences`` holds, for each of those lines,
        its list of lowercase tokens with punctuation removed.
    """
    sentences: List[str] = []
    tokenized_sentences: List[List[str]] = []

    for raw_line in data.split('\n'):
        line = raw_line.strip()
        if not line:
            # Blank or whitespace-only lines are not sentences.
            continue
        sentences.append(line)
        # Lowercase, drop every character that is neither a word character
        # nor whitespace, then split on runs of whitespace.
        cleaned = re.sub(r"[^\w\s]", "", line.lower())
        tokenized_sentences.append(cleaned.split())

    return sentences, tokenized_sentences

def count_words(tokenized_sentences: List[List[str]]) -> Dict[str, int]:
    """
    Tally how many times every token occurs across all sentences.

    Args:
        tokenized_sentences: Sentences, each given as a list of tokens.

    Returns:
        A ``defaultdict(int)`` mapping each token to its number of
        occurrences; looking up a token that never occurred yields 0.
    """
    frequencies = defaultdict(int)
    # Walk the flattened token stream instead of nesting two loops.
    all_tokens = (word for tokens in tokenized_sentences for word in tokens)
    for word in all_tokens:
        frequencies[word] = frequencies[word] + 1
    return frequencies

def get_words_with_frequency_above_or_equal(tokenized_sentences: List[List[str]], threshold: int) -> List[str]:
    """
    Collect the words whose total number of occurrences reaches a threshold.

    Args:
        tokenized_sentences: Sentences, each given as a list of tokens.
        threshold: Minimum number of occurrences (inclusive) for a word to
            be kept.

    Returns:
        The qualifying words, in the order the counting step first saw them.
    """
    frequent_words = []
    for word, occurrences in count_words(tokenized_sentences).items():
        if occurrences < threshold:
            continue
        frequent_words.append(word)
    return frequent_words

def replace_unknown_words_with_unk(tokenized_sentences: List[List[str]], closed_vocab: List[str], unk_token="<unk>") -> List[List[str]]:
    """
    Substitute every out-of-vocabulary token with an "unknown" marker.

    Args:
        tokenized_sentences: Sentences, each given as a list of tokens.
        closed_vocab: The words that are allowed to stay unchanged.
        unk_token: Marker that replaces any token not in ``closed_vocab``.

    Returns:
        New sentences of the same shape as the input; the input lists are
        not modified.
    """
    # A set makes each membership check constant-time.
    known_words = set(closed_vocab)

    replaced_sentences = []
    for tokens in tokenized_sentences:
        replaced = [unk_token if word not in known_words else word for word in tokens]
        replaced_sentences.append(replaced)
    return replaced_sentences

def count_n_grams(data: List[List[str]], n: int, start_token: str='<s>', end_token: str='</s>') -> Dict[Tuple[str, ...], int]:
    """
    Count every n-gram that occurs in the padded sentences.

    Each sentence is prefixed with ``n - 1`` copies of ``start_token`` and
    suffixed with one ``end_token`` before its n-grams are extracted.

    Args:
        data: Sentences, each given as a list of tokens.
        n: Length of the n-grams to count.
        start_token: Padding token placed before each sentence.
        end_token: Marker appended after each sentence.

    Returns:
        A ``defaultdict(int)`` mapping each n-gram (a tuple of ``n`` tokens)
        to its number of occurrences.
    """
    counts = defaultdict(int)
    left_padding = [start_token] * (n - 1)

    for tokens in data:
        padded = left_padding + tokens + [end_token]
        window_count = len(padded) - n + 1
        # Slide a window of width n over the padded sentence.
        for start in range(window_count):
            window = tuple(padded[start:start + n])
            counts[window] += 1

    return counts

def train_ngram_model(tokenized_sentences: List[List[str]], n: int, k: float) -> Dict[Tuple[str, ...], float]:
    """
    Train an n-gram model with add-k smoothing.

    For every observed n-gram ``(context..., word)`` the stored value is
    ``(count(ngram) + k) / (count(context) + k * vocab_size)``, where
    ``count(context)`` is the total count of all observed n-grams sharing
    that context.

    Args:
        tokenized_sentences: Training sentences, each a list of tokens.
        n: Order of the model (length of the n-grams).
        k: Smoothing constant added to every n-gram count.

    Returns:
        A dict mapping each *observed* n-gram to its smoothed probability.
        Unseen n-grams are not present in the result.
    """
    n_grams = count_n_grams(tokenized_sentences, n)

    # NOTE(review): the vocabulary is built from the sentence tokens only, so
    # padding/end markers added while counting n-grams are not part of
    # vocab_size -- confirm this is the intended smoothing denominator.
    vocab_size = len({token for sentence in tokenized_sentences for token in sentence})

    # Aggregate the count of every context in a single pass, instead of
    # rescanning all n-grams for each n-gram (which was quadratic).
    context_counts: Dict[Tuple[str, ...], int] = {}
    for ngram, count in n_grams.items():
        context = ngram[:-1]
        context_counts[context] = context_counts.get(context, 0) + count

    return {
        ngram: (count + k) / (context_counts[ngram[:-1]] + k * vocab_size)
        for ngram, count in n_grams.items()
    }

def predict_ngram(sentence: str, ngram_model: Dict[Tuple[str, ...], float], n: int) -> float:
    """
    Score a sentence with an n-gram model.

    The sentence is tokenized with ``process_data``, padded with ``n - 1``
    ``'<s>'`` tokens and one ``'</s>'`` token, and the natural logarithms of
    the model probabilities of its n-grams are summed.

    Args:
        sentence: Raw text to score. Only its first non-empty line is used.
            Empty or whitespace-only text is scored as an empty sentence
            (just the padding and the end marker) instead of raising.
        ngram_model: Mapping from n-gram tuples to probabilities.
        n: Order of the model.

    Returns:
        The total log-probability of the sentence (a value <= 0 when all
        probabilities are <= 1), not a plain probability.
    """
    tokenized = process_data(sentence)[1]
    # process_data drops blank lines, so the list may be empty.
    tokens = tokenized[0] if tokenized else []

    padded = ['<s>'] * (n - 1) + tokens + ['</s>']
    total_log_prob = 0.0
    for i in range(n - 1, len(padded)):
        n_gram = tuple(padded[i - n + 1:i + 1])
        # Unseen n-grams fall back to a tiny floor probability so the
        # logarithm stays defined.
        total_log_prob += math.log(ngram_model.get(n_gram, 1e-10))
    return total_log_prob

def generate_text(ngram_model: Dict[Tuple[str, ...], float], n: int, max_words: int = 100) -> str:
    """
    Generate text by repeatedly sampling the next word from an n-gram model.

    Generation starts from a context of ``n - 1`` ``'<s>'`` tokens and stops
    when ``'</s>'`` is drawn, when the current context has no known
    continuation, or after ``max_words`` words.

    Args:
        ngram_model: Mapping from n-gram tuples to probabilities.
        n: Order of the model.
        max_words: Upper bound on the number of generated words.

    Returns:
        The generated words joined by single spaces (possibly empty).
    """
    # Index the continuations of every context once, instead of scanning the
    # whole model at each step. Model order is preserved within each context.
    continuations: Dict[Tuple[str, ...], List[Tuple[str, float]]] = {}
    for ngram, prob in ngram_model.items():
        continuations.setdefault(ngram[:-1], []).append((ngram[-1], prob))

    context = ('<s>',) * (n - 1)
    generated = []
    for _ in range(max_words):
        candidates = continuations.get(context)
        if not candidates:
            break
        words, probs = zip(*candidates)
        # Renormalize: the stored probabilities of the observed continuations
        # need not sum to 1. str() turns numpy's string scalar into a plain str.
        next_word = str(np.random.choice(words, p=np.array(probs) / sum(probs)))
        if next_word == '</s>':
            break
        generated.append(next_word)
        # Append first, then drop the oldest token, so the context keeps
        # exactly n - 1 tokens -- including the empty context of a unigram
        # model, which previously grew to one token and ended generation.
        context = (context + (next_word,))[1:]
    return ' '.join(generated)



hello world this is an example of ngram text generation


In [4]:
from collections import defaultdict

def process_file_incrementally(file_path, n):
    """
    Count n-grams in a text file while reading it one line at a time.

    Every non-blank line is treated as a sentence: it is lowercased, split
    on whitespace (punctuation stays attached to the words), padded with
    ``n - 1`` ``'<s>'`` tokens and one ``'</s>'`` token, and its n-grams are
    added to the running counts.

    Blank or whitespace-only lines are skipped so they do not contribute
    spurious "empty sentence" n-grams such as ``('<s>', '</s>')``.

    Args:
        file_path: Path of a UTF-8 encoded text file.
        n (int): Length of the n-grams to count.

    Returns:
        A ``defaultdict(int)`` mapping n-gram tuples to their counts.
    """
    n_grams = defaultdict(int)
    padding = ['<s>'] * (n - 1)
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            words = line.lower().split()
            if not words:
                # Blank line: not a sentence, nothing to count.
                continue
            tokens = padding + words + ['</s>']
            for i in range(len(tokens) - n + 1):
                n_grams[tuple(tokens[i:i + n])] += 1
    return n_grams

# Example usage: count the bigrams of the corpus file.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs where this exact file exists -- consider a configurable data directory.
file_path = "C:\\Users\\hp\\id2\\NLP\\tp2 donnees\\PA_files\\big_data.txt"
n = 2  # For bigrams
n_gram_counts = process_file_incrementally(file_path, n)
# list(...) materializes every (n-gram, count) pair before the slice is taken.
print(dict(list(n_gram_counts.items())[:50]))  # Print the first 50 n-gram counts for inspection

    

{('<s>', 'how'): 280, ('how', 'are'): 83, ('are', 'you?'): 44, ('you?', 'btw'): 1, ('btw', 'thanks'): 2, ('thanks', 'for'): 845, ('for', 'the'): 1461, ('the', 'rt.'): 12, ('rt.', 'you'): 1, ('you', 'gonna'): 18, ('gonna', 'be'): 103, ('be', 'in'): 114, ('in', 'dc'): 11, ('dc', 'anytime'): 1, ('anytime', 'soon?'): 4, ('soon?', 'love'): 1, ('love', 'to'): 126, ('to', 'see'): 507, ('see', 'you.'): 6, ('you.', 'been'): 1, ('been', 'way,'): 1, ('way,', 'way'): 1, ('way', 'too'): 33, ('too', 'long.'): 6, ('long.', '</s>'): 10, ('<s>', 'when'): 265, ('when', 'you'): 281, ('you', 'meet'): 6, ('meet', 'someone'): 3, ('someone', 'special...'): 1, ('special...', "you'll"): 1, ("you'll", 'know.'): 1, ('know.', 'your'): 1, ('your', 'heart'): 20, ('heart', 'will'): 3, ('will', 'beat'): 3, ('beat', 'more'): 2, ('more', 'rapidly'): 2, ('rapidly', 'and'): 2, ('and', "you'll"): 24, ("you'll", 'smile'): 2, ('smile', 'for'): 3, ('for', 'no'): 10, ('no', 'reason.'): 3, ('reason.', '</s>'): 5, ('<s>', "they

In [9]:
import random

def predict_next_word(previous_word, n_gram_counts, n=2):
    """
    Sample the next word given the preceding word(s), using n-gram counts.

    The context is matched against the words that immediately precede the
    last word of each n-gram, and the last word of a matching n-gram is
    drawn with probability proportional to its count.

    Args:
        previous_word (str | sequence of str): The context. A string is
            treated as one single token; a list/tuple supplies several
            context words in order. Only the last ``n - 1`` words are used.
            If fewer than ``n - 1`` words are given, every n-gram whose
            context ends with them is a candidate.
        n_gram_counts (dict): Maps n-gram tuples to their counts.
        n (int): The order of the n-gram model the counts came from.

    Returns:
        str: The sampled next word, or the literal string
        "No continuation found." when no n-gram matches the context.
    """
    if isinstance(previous_word, str):
        context = (previous_word,)
    else:
        context = tuple(previous_word)
    # Keep at most the n - 1 most recent words (none at all for unigrams).
    context = context[max(len(context) - (n - 1), 0):]
    width = len(context)

    words = []
    weights = []
    for n_gram, count in n_gram_counts.items():
        # Compare the context with the `width` words right before the last one.
        if len(n_gram) > width and n_gram[-1 - width:-1] == context:
            words.append(n_gram[-1])
            weights.append(count)

    if not words:
        return "No continuation found."

    # random.choices accepts relative weights, so raw counts can be used
    # directly without normalizing them to probabilities first.
    return random.choices(words, weights=weights, k=1)[0]

# Example usage with a given previous word.
# predict_next_word samples at random, so the printed word can differ between runs.
previous_word = "see"
predicted_word = predict_next_word(previous_word, n_gram_counts)
print(f"After '{previous_word}', the next word might be '{predicted_word}'.")


After 'see', the next word might be 'you'.


In [10]:
# Same demo with another context word; the result is sampled at random, so it
# can differ between runs.
previous_word = "black"
predicted_word = predict_next_word(previous_word, n_gram_counts)
print(f"After '{previous_word}', the next word might be '{predicted_word}'.")

After 'black', the next word might be 'cat.'.


In [17]:
# Same demo with another context word; the result is sampled at random, so it
# can differ between runs.
previous_word = "blue"
predicted_word = predict_next_word(previous_word, n_gram_counts)
print(f"After '{previous_word}', the next word might be '{predicted_word}'.")

After 'blue', the next word might be 'suits,'.
