# imports

In [1]:
import re
from collections import Counter
import numpy as np
import pandas as pd

# Data Preprocessing

### process_data and get_count

In [7]:
def process_data(file_name, encoding="utf-8"):
    """Read a text file and return its lower-cased word tokens, in file order.

    Args:
        file_name: path of the text file to read.
        encoding: text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        A list of strings, one per maximal run of word characters (``\\w+``)
        found in the lower-cased file contents. Duplicates are kept.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid in ``encoding``.
    """
    # Pass the encoding explicitly: without it open() falls back to the
    # locale's preferred encoding, which differs between machines.
    with open(file_name, encoding=encoding) as f:
        text = f.read()

    # Lower-case the whole text once, then extract runs of word characters;
    # punctuation and whitespace act as separators (so "it's" -> "it", "s").
    return re.findall(r"\w+", text.lower())

In [8]:
# Tokenise the corpus into a list of lower-cased words.
# NOTE(review): the absolute "/content/..." path looks environment-specific
# (presumably Colab) — confirm / adjust when running elsewhere.
word_l = process_data("/content/shakespeare.txt")
vocab = set(word_l)  # distinct words in the corpus; serves as the vocabulary
# Sanity checks: peek at the first tokens and report the vocabulary size.
print(f"The first ten words in the text are: \n{word_l[0:10]}")
print(f"There are {len(vocab)} unique words in the vocabulary.")

The first ten words in the text are: 
['o', 'for', 'a', 'muse', 'of', 'fire', 'that', 'would', 'ascend', 'the']
There are 6116 unique words in the vocabulary.


In [9]:
# get_count
def get_count(word_l):
    """Count how many times each word occurs in `word_l`.

    Args:
        word_l: an iterable of words (hashable items).

    Returns:
        A `collections.Counter` mapping each distinct word to its number of
        occurrences; an empty input gives an empty Counter.
    """
    # Counter does the tallying in one pass over the input.
    return Counter(word_l)

In [10]:
# Build the word -> occurrence-count mapping for the whole corpus.
word_count_dict = get_count(word_l)
# One key per distinct word, so this length should match the vocabulary size.
print(f"There are {len(word_count_dict)} key values pairs")
# Spot-check a single word; .get(..., 0) keeps the lookup safe if it is absent.
print(f"The count for the word 'thee' is {word_count_dict.get('thee',0)}")

There are 6116 key values pairs
The count for the word 'thee' is 240


### get_probs

In [12]:
def get_probs(word_count_dict):
    """Turn word counts into relative frequencies.

    Args:
        word_count_dict: mapping of word -> number of occurrences.

    Returns:
        A dict with the same keys, where each value is the word's count
        divided by the sum of all counts. An empty mapping gives an empty dict.
    """
    # Denominator: total number of word occurrences across the mapping.
    total_words = sum(word_count_dict.values())

    return {
        word: count / total_words
        for word, count in word_count_dict.items()
    }

In [13]:
# Convert the raw counts into per-word relative frequencies.
probs = get_probs(word_count_dict)
# get_probs keeps one entry per key of word_count_dict, so the sizes match.
print(f"Length of probs is {len(probs)}")
# Spot-check one word, shown to four decimal places.
print(f"P('thee') is {probs['thee']:.4f}")

Length of probs is 6116
P('thee') is 0.0045


# String Manipulations

### delete_letter

In [14]:
def delete_letter(word, verbose=False):
    """Return every string obtained by removing exactly one character of `word`.

    Args:
        word: the input string.
        verbose: when True, print the word, its splits and the results.

    Returns:
        A list with one entry per character position, in left-to-right order.
        Duplicates are kept; an empty word gives an empty list.
    """
    # Each split is [prefix, suffix]; the suffix is never empty since i < len(word).
    split_l = [[word[:i], word[i:]] for i in range(len(word))]

    # Dropping the suffix's first character removes the character at position i.
    delete_l = [prefix + suffix[1:] for prefix, suffix in split_l]

    if verbose:
        print(f"input word {word}, \nsplit_l = {split_l}, \ndelete_l = {delete_l}")

    return delete_l

In [15]:
# Demo: all single-character deletions of "cans", printing the intermediate splits.
delete_word_l = delete_letter(word="cans", verbose=True)

input word cans, 
split_l = [['', 'cans'], ['c', 'ans'], ['ca', 'ns'], ['can', 's']], 
delete_l = ['ans', 'cns', 'cas', 'can']


### switch_letter

In [16]:
def switch_letter(word, verbose=False):
    """Return every string obtained by swapping two adjacent characters of `word`.

    Args:
        word: the input string.
        verbose: when True, print the word, its splits and the results.

    Returns:
        A list with one entry per adjacent pair, in left-to-right order.
        Words shorter than two characters give an empty list.
    """
    # [prefix, suffix] pairs; only needed for the verbose report below.
    split_l = [[word[:i], word[i:]] for i in range(len(word))]

    # Position i swaps with i + 1, so the last character has no partner
    # and the range stops one short of the end.
    switch_l = [
        word[:i] + word[i + 1] + word[i] + word[i + 2:]
        for i in range(len(word) - 1)
    ]

    if verbose:
        print(f"Input word = {word} \nsplit_l = {split_l} \nswitch_l = {switch_l}")

    return switch_l

In [17]:
# Demo: all adjacent-character swaps of "eta", printing the intermediate splits.
switch_word_l = switch_letter(word="eta", verbose=True)

Input word = eta 
split_l = [['', 'eta'], ['e', 'ta'], ['et', 'a']] 
switch_l = ['tea', 'eat']


### replace_letter