In [None]:
import nltk
from nltk import bigrams
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist, ConditionalFreqDist
from collections import defaultdict

# Download the tokenizer models that word_tokenize() depends on.
# Older NLTK releases load the 'punkt' resource, while recent releases
# (3.8.2 and later) load 'punkt_tab' instead and raise LookupError when only
# 'punkt' is installed. Requesting both keeps this script runnable on either
# side of that change.
# NOTE(review): nltk.download() is expected to report a failed or unknown
# package via its return value rather than by raising -- confirm for the
# NLTK version in use.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Sample text corpus: a short English paragraph about NLP that the rest of
# the script tokenizes and counts bigrams over. The triple-quoted literal
# starts and ends with a newline character.
text_corpus = """
Natural language processing (NLP) is a field of artificial intelligence that deals with the interaction between computers and humans using natural language. The ultimate objective of NLP is to enable computers to understand, interpret, and generate human language in a way that is valuable. NLP is used to apply algorithms to identify and extract the natural language rules such that the unstructured language data is converted into a form that computers can understand.
"""

# Lowercase the corpus before tokenizing so that tokens differing only in
# letter case are counted as the same token.
lowercased_corpus = text_corpus.lower()
tokens = word_tokenize(lowercased_corpus)

# Collect every pair of adjacent tokens. The pairs are stored in a list
# because they are consumed twice below.
bigrams_list = [*bigrams(tokens)]

# Conditional counts: for each first token, how often each second token
# follows it.
cfd = ConditionalFreqDist(bigrams_list)

# Joint counts: how often each (first, second) pair occurs overall.
bigram_freq = FreqDist(bigrams_list)

# Build a nested mapping {first token: {second token: P(second | first)}}.
# Each distinct bigram observed in the corpus contributes one entry; the
# probability is the relative frequency of the second token among all
# tokens that were seen following the first one.
bigram_probabilities = defaultdict(dict)
for context, follower in bigram_freq:
    follower_counts = cfd[context]
    bigram_probabilities[context][follower] = follower_counts.freq(follower)

# Print one line per stored bigram, formatted as P(second | first) with the
# probability rounded to four decimal places. The nested mapping is
# flattened lazily into (first, second, probability) triples first.
probability_rows = (
    (first_word, second_word, prob)
    for first_word, second_word_probs in bigram_probabilities.items()
    for second_word, prob in second_word_probs.items()
)
for first_word, second_word, prob in probability_rows:
    print(f"P({second_word} | {first_word}) = {prob:.4f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


P(language | natural) = 1.0000
P(is | nlp) = 0.6667
P() | nlp) = 0.3333
P(processing | language) = 0.2000
P(. | language) = 0.2000
P(in | language) = 0.2000
P(rules | language) = 0.2000
P(data | language) = 0.2000
P(( | processing) = 1.0000
P(nlp | () = 1.0000
P(is | )) = 1.0000
P(a | is) = 0.2000
P(to | is) = 0.2000
P(valuable | is) = 0.2000
P(used | is) = 0.2000
P(converted | is) = 0.2000
P(field | a) = 0.3333
P(way | a) = 0.3333
P(form | a) = 0.3333
P(of | field) = 1.0000
P(artificial | of) = 0.5000
P(nlp | of) = 0.5000
P(intelligence | artificial) = 1.0000
P(that | intelligence) = 1.0000
P(deals | that) = 0.2500
P(is | that) = 0.2500
P(the | that) = 0.2500
P(computers | that) = 0.2500
P(with | deals) = 1.0000
P(the | with) = 1.0000
P(interaction | the) = 0.2500
P(ultimate | the) = 0.2500
P(natural | the) = 0.2500
P(unstructured | the) = 0.2500
P(between | interaction) = 1.0000
P(computers | between) = 1.0000
P(and | computers) = 0.3333
P(to | computers) = 0.3333
P(can | computers) 