In [34]:
from collections import defaultdict

In [35]:
import operator
def build_conditional_probabilities(corpus):
    """
    The function takes as its input a corpus string (words separated by 
    spaces) and returns a 2D dictionnary of probabilities P(next|current) of
    seeing a word "next" conditionnaly to seeing a word "current". 
    """

# First we parse the string to build a double dimension dictionnary that
# returns the conditional probabilities.

# We parse the string to build a first dictionnary indicating for each
# word, what are the words that follow it in the string. Repeated next
# words are kept so we use a list and not a set. 

    tokenized_string = corpus.split()
    print(tokenized_string)
    previous_word = ""
    dictionnary = defaultdict(list)

    for current_word in tokenized_string:
        if previous_word != "":
            dictionnary[previous_word].append(current_word)
        previous_word = current_word
    print(dictionnary)
# We know parse dictionnary to compute the probability each observed
# next word for each word in the dictionnary. 

    for key in dictionnary.keys():
        next_words = dictionnary[key]
        unique_words = set(next_words) # removes duplicated
        nb_words = len(next_words)
        probabilities_given_key = {}
        for unique_word in unique_words:
            probabilities_given_key[unique_word] =float(next_words.count(unique_word)) / nb_words
            #dictionnary[key] = sorted(key.items(), key=operator.itemgetter(1))
            dictionnary[key] = probabilities_given_key
    return dictionnary


In [36]:

def bigram_next_word_predictor(conditional_probabilities, current, next_candidate):
    """
    The function takes as its input a 2D dictionnary of probabilities 
    P(next|current) of seeing a word "next" conditionnaly to seeing a word 
    "current", the current word being read, and a next candidate word, and
    returns P(next_candidate|current).
    """

# We look for the probability corresponding to the 
# current -> next_candidate pair

    if conditional_probabilities[current]:
        if conditional_probabilities[current][next_candidate]:
            return conditional_probabilities[current][next_candidate]

# If current -> next_candidate pair has not been observed in the corpus,
# the corresponding dictionnary keys will not be defined. We return 
# a probability 0.0


    return 0.0




In [50]:
def bigram_next_word(conditional_probabilities, current):
    """
    The function takes as its input a 2D dictionnary of probabilities 
    P(next|current) of seeing a word "next" conditionnaly to seeing a word 
    "current", the current word being read, and a next candidate word, and
    returns P(next_candidate|current).
    """

# We look for the probability corresponding to the 
# current -> next_candidate pair

    if conditional_probabilities[current]:
        #print(type)
        return max(conditional_probabilities[current].items(), key=operator.itemgetter(1))[0]
        #return conditional_probabilities[current]

# If current -> next_candidate pair has not been observed in the corpus,
# the corresponding dictionnary keys will not be defined. We return 
# a probability 0.0


    return "not avaialble"



In [52]:

# An example corpus to try out the function
corpus = '''Avul Pakir Jainulabdeen Abdul Kalam better known as A. P. J. Abdul Kalam (/ˈæbdʊl kəˈlɑːm/ (About this sound listen); 15 October 1931 – 27 July 2015), was the 11th President of India from 2002 to 2007. A career scientist turned statesman, Kalam was born and raised in Rameswaram, Tamil Nadu, and studied physics and aerospace engineering. He spent the next four decades as a scientist and science administrator, mainly at the Defence Research and Development Organisation (DRDO) and Indian Space Research Organisation (ISRO) and was intimately involved in India's civilian space programme and military missile development efforts.[1] He thus came to be known as the Missile Man of India for his work on the development of ballistic missile and launch vehicle technology.[2][3][4] He also played a pivotal organisational, technical, and political role in India's Pokhran-II nuclear tests in 1998, the first since the original nuclear test by India in 1974.[5]
Kalam was elected as the 11th President of India in 2002 with the support of both the ruling Bharatiya Janata Party and the then-opposition Indian National Congress. Widely referred to as the "People's President,"[6] he returned to his civilian life of education, writing and public service after a single term. He was a recipient of several prestigious awards, including the Bharat Ratna, India's highest civilian honour. While delivering a lecture at the Indian Institute of Management Shillong, Kalam collapsed and died from an apparent cardiac arrest on 27 July 2015, aged 83.[7] Thousands including national-level dignitaries attended the funeral ceremony held in his hometown of Rameshwaram, where he was buried with full state honours.[8]'''
# We call the conditional probability dictionnary builder function
conditional_probabilities = build_conditional_probabilities(corpus)

['Avul', 'Pakir', 'Jainulabdeen', 'Abdul', 'Kalam', 'better', 'known', 'as', 'A.', 'P.', 'J.', 'Abdul', 'Kalam', '(/ˈæbdʊl', 'kəˈlɑːm/', '(About', 'this', 'sound', 'listen);', '15', 'October', '1931', '–', '27', 'July', '2015),', 'was', 'the', '11th', 'President', 'of', 'India', 'from', '2002', 'to', '2007.', 'A', 'career', 'scientist', 'turned', 'statesman,', 'Kalam', 'was', 'born', 'and', 'raised', 'in', 'Rameswaram,', 'Tamil', 'Nadu,', 'and', 'studied', 'physics', 'and', 'aerospace', 'engineering.', 'He', 'spent', 'the', 'next', 'four', 'decades', 'as', 'a', 'scientist', 'and', 'science', 'administrator,', 'mainly', 'at', 'the', 'Defence', 'Research', 'and', 'Development', 'Organisation', '(DRDO)', 'and', 'Indian', 'Space', 'Research', 'Organisation', '(ISRO)', 'and', 'was', 'intimately', 'involved', 'in', "India's", 'civilian', 'space', 'programme', 'and', 'military', 'missile', 'development', 'efforts.[1]', 'He', 'thus', 'came', 'to', 'be', 'known', 'as', 'the', 'Missile', 'Man', 

In [39]:


# Some sample queries to the bigram predictor
print(bigram_next_word_predictor(conditional_probabilities, "the", "cat")) 
print(bigram_next_word_predictor(conditional_probabilities, "is", "red")) 
print(bigram_next_word_predictor(conditional_probabilities, "", "red")) 
 




0.75
0.25
0.0


In [70]:
print(bigram_next_word(conditional_probabilities, "civilian"))

life
