In [1]:
import re
from collections import Counter
import numpy as np
import pandas as pd

<ul>
    <li>Python <a href="https://docs.python.org/3/tutorial/inputoutput.html">input and output</a></li>
    <li>Python <a href="https://docs.python.org/3/library/re.html">'re' documentation</a></li>
</ul>

In [2]:
def process_data(file_name, encoding=None):
    """
    Read a text file and split its contents into lower-case word tokens.

    Input: 
        file_name: path of the text file to read.
        encoding: optional text encoding passed to open(). The default (None)
            keeps open()'s own default, so existing calls behave as before.
    Output: 
        words: a list containing all the words in the corpus (text file you read) in lower case,
            in order of appearance. A word is a maximal run of letters, digits
            and underscores, so punctuation (including apostrophes) splits tokens.
    """
    # Read the whole file into a single string; the context manager closes it.
    with open(file_name, 'r', encoding=encoding) as file:
        text = file.read()

    # Lower-case before tokenizing so every extracted token is lower case.
    # The pattern is a raw string: '\w' inside a plain literal is an invalid
    # escape sequence that newer Python versions warn about.
    words = re.findall(r'\w+', text.lower())

    return words

In [3]:
# Tokenize the corpus file; word_l keeps every token occurrence, in reading order.
word_l = process_data('./data/shakespeare.txt')
vocab = set(word_l)  # distinct tokens only: this will be your new vocabulary
# Sanity check: peek at the first ten tokens and report the vocabulary size.
print(f"The first ten words in the text are: \n{word_l[0:10]}")
print(f"There are {len(vocab)} unique words in the vocabulary.")

The first ten words in the text are: 
['o', 'for', 'a', 'muse', 'of', 'fire', 'that', 'would', 'ascend', 'the']
There are 6116 unique words in the vocabulary.


In [4]:
def get_count(word_l):
    '''
    Tally how many times each word occurs in the corpus.

    Input:
        word_l: the corpus as an iterable of words; repeated words are what get counted.
    Output:
        word_count_dict: The wordcount dictionary where key is the word and value is its frequency.
    '''
    word_count_dict = {}

    for token in word_l:
        # .get supplies 0 the first time a word is encountered
        word_count_dict[token] = word_count_dict.get(token, 0) + 1

    return word_count_dict

For example, given the following sentence: **"I am happy because I am learning"**, your dictionary should contain the following key-value pairs: 
<table style="width:20%">

  <tr>
    <td> <b>Key </b>  </td>
    <td> <b>Value </b> </td> 


  </tr>
  <tr>
    <td> I  </td>
    <td> 2</td> 
 
  </tr>
   
  <tr>
    <td>am</td>
    <td>2</td> 
  </tr>

  <tr>
    <td>happy</td>
    <td>1</td> 
  </tr>
  
   <tr>
    <td>because</td>
    <td>1</td> 
  </tr>
  
   <tr>
    <td>learning</td>
    <td>1</td> 
  </tr>
</table>

In [6]:
# Count occurrences of every token; the result has one key per distinct word.
word_count_dict = get_count(word_l)
print(f"There are {len(word_count_dict)} key values pairs")
# .get with a default of 0 avoids a KeyError if 'thee' is not in the corpus.
print(f"The count for the word 'thee' is {word_count_dict.get('thee',0)}")

There are 6116 key values pairs
The count for the word 'thee' is 240


Given the dictionary of word counts, compute the probability that each word will appear if randomly selected from the corpus of words.

$$P(w_i) = \frac{C(w_i)}{M} \tag{Eqn-2}$$
where 

$C(w_i)$ is the total number of times $w_i$ appears in the corpus.

$M$ is the total number of words in the corpus.

For example, the probability of the word 'am' in the sentence **'I am happy because I am learning'** is:

$$P(am) = \frac{C(am)}{M} = \frac{2}{7} \tag{Eqn-3}$$

**Instructions:** Implement the `get_probs` function, which gives you the probability 
that a word occurs in a sample. It returns a dictionary where the keys are words, and the value for each word is its probability in the corpus of words.

In [7]:
def get_probs(word_count_dict):
    '''
    Turn raw word counts into relative frequencies.

    Input:
        word_count_dict: The wordcount dictionary where key is the word and value is its frequency.
    Output:
        probs: A dictionary where keys are the words and the values are the probability that a word will occur,
               i.e. the word's count divided by the sum of all counts.
    '''
    # M in the formula: the total number of word occurrences in the corpus.
    corpus_size = sum(word_count_dict.values())

    # An empty dictionary never reaches the division and yields {}.
    return {token: freq / corpus_size for token, freq in word_count_dict.items()}

In [8]:
# Convert the counts to probabilities; probs has the same keys as word_count_dict.
probs = get_probs(word_count_dict)
print(f"Length of probs is {len(probs)}")
# Direct indexing raises KeyError if 'thee' is absent; :.4f prints four decimals.
print(f"P('thee') is {probs['thee']:.4f}")

Length of probs is 6116
P('thee') is 0.0045


In [10]:
# NOTE(review): this displays the entire token list as the cell output;
# consider showing a slice such as word_l[:20] to keep the notebook readable.
word_l

['o',
 'for',
 'a',
 'muse',
 'of',
 'fire',
 'that',
 'would',
 'ascend',
 'the',
 'brightest',
 'heaven',
 'of',
 'invention',
 'a',
 'kingdom',
 'for',
 'a',
 'stage',
 'princes',
 'to',
 'act',
 'and',
 'monarchs',
 'to',
 'behold',
 'the',
 'swelling',
 'scene',
 'then',
 'should',
 'the',
 'warlike',
 'harry',
 'like',
 'himself',
 'assume',
 'the',
 'port',
 'of',
 'mars',
 'and',
 'at',
 'his',
 'heels',
 'leash',
 'd',
 'in',
 'like',
 'hounds',
 'should',
 'famine',
 'sword',
 'and',
 'fire',
 'crouch',
 'for',
 'employment',
 'but',
 'pardon',
 'and',
 'gentles',
 'all',
 'the',
 'flat',
 'unraised',
 'spirits',
 'that',
 'have',
 'dared',
 'on',
 'this',
 'unworthy',
 'scaffold',
 'to',
 'bring',
 'forth',
 'so',
 'great',
 'an',
 'object',
 'can',
 'this',
 'cockpit',
 'hold',
 'the',
 'vasty',
 'fields',
 'of',
 'france',
 'or',
 'may',
 'we',
 'cram',
 'within',
 'this',
 'wooden',
 'o',
 'the',
 'very',
 'casques',
 'that',
 'did',
 'affright',
 'the',
 'air',
 'at',
 '