In [1]:
import math
from collections import Counter, defaultdict

import nltk
import pandas as pd
from nltk.corpus import brown
from numpy.random import random, choice
from tqdm import tqdm

# Make sure the Brown corpus data is available locally before nltk.corpus.brown is used;
# when the package is already present this only reports that it is up to date.
nltk.download('brown')

[nltk_data] Downloading package brown to /home/zach/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [2]:
# Load the weapon list from the CSV that sits next to the notebook. The rendered frame
# has Weapon, Class and Slot columns plus an "Unnamed: 0" column.
# NOTE(review): "Unnamed: 0" looks like a row index that was written into the file;
# confirm, and consider reading it with index_col=0 if nothing depends on that column.
df = pd.read_csv('./weapons_list.csv')
df  # bare last expression so the notebook displays the frame

Unnamed: 0,Weapon,Class,Slot
0,Scattergun,Scout,Primary
1,Force-A-Nature,Scout,Primary
2,Shortstop,Scout,Primary
3,Soda Popper,Scout,Primary
4,Baby Face's Blaster,Scout,Primary
...,...,...,...
203,Ham Shank,All,Melee
204,Golden Frying Pan,All,Melee
205,Necro Smasher,All,Melee
206,Crossing Guard,All,Melee


In [None]:
class Bigram():
    """Bigram language model estimated from a corpus of tokenised sentences.

    Every sentence is lowercased and wrapped in START / END marker tokens before
    counting, so the markers take part in the unigram and bigram statistics
    (and therefore in ``vocab_total`` and ``total_words``).

    Attributes:
        unigram_counts: Counter mapping word -> number of occurrences.
        unigram_probs:  Counter mapping word -> count / total_words.
        bigram_counts:  defaultdict(Counter) mapping word1 -> {word2: count of "word1 word2"}.
        bigram_probs:   defaultdict(Counter) mapping word1 -> {word2: count(word1, word2) / count(word1)}.
        vocab_total:    number of distinct tokens seen (markers included).
        total_words:    total number of tokens seen (markers included).
    """
    START = '<s>'
    END = '</s>'

    def __init__(self, corpus):
        """Build the model from ``corpus``.

        Arguments:
            corpus: iterable of sentences, each sentence an iterable of word
                    strings (``.lower()`` is called on every word).
        """
        self.unigram_counts = Counter()  # occurrences of each word in the corpus
        self.unigram_probs = Counter()  # probability of each word in the corpus
        self.bigram_counts = defaultdict(Counter)  # occurrences of each bigram (word1, word2)
        self.bigram_probs = defaultdict(Counter)  # probability of word2 following word1
        self.vocab_total = 0  # number of unique words in the corpus
        self.total_words = 0  # total number of words in the corpus

        self.set_counts(corpus)  # fills unigram_counts and bigram_counts
        self.set_probs()  # fills unigram_probs and bigram_probs

    def format_sentence(self, sentence):
        """Return ``sentence`` lowercased and wrapped in START / END tokens.

        Arguments:
            sentence: iterable of word strings.

        Returns:
            A new list: ``[START, word1.lower(), ..., wordN.lower(), END]``.
        """
        return [self.START] + [word.lower() for word in sentence] + [self.END]

    def set_counts(self, sent_list):
        """Add the unigram and bigram occurrences of every sentence to the counters.

        Counts accumulate on top of whatever is already stored; ``vocab_total``
        and ``total_words`` are then recomputed from the unigram counter.

        Arguments:
            sent_list: iterable of sentences, each an iterable of word strings.
        """
        for sentence in sent_list:
            tokens = self.format_sentence(sentence)
            self.unigram_counts.update(tokens)

            # Bigrams must be counted per sentence, inside this loop; pairing each
            # token with its successor never crosses a sentence boundary.
            for first, second in zip(tokens[:-1], tokens[1:]):
                self.bigram_counts[first][second] += 1

        self.vocab_total = len(self.unigram_counts)
        self.total_words = sum(self.unigram_counts.values())

    def set_probs(self):
        """Convert the stored counts into probabilities.

        Unigram probability is ``count(word) / total_words``. Bigram probability
        is the conditional ``count(word1, word2) / count(word1)``.
        """
        self.unigram_probs = Counter({
            word: count / self.total_words
            for word, count in self.unigram_counts.items()
        })

        for word1, followers in self.bigram_counts.items():
            # word1 was counted as a unigram whenever it started a bigram,
            # so this denominator is never zero.
            word1_total = self.unigram_counts[word1]
            for word2, pair_count in followers.items():
                self.bigram_probs[word1][word2] = pair_count / word1_total

    def get_counts(self):
        """Return the bigram count table (word1 -> Counter of following words).

        The live defaultdict is returned, not a copy: indexing it with an unseen
        word1 inserts an empty Counter for that key.
        """
        return self.bigram_counts

    def get_probs(self):
        """Return the bigram probability table (word1 -> {word2: P(word2 | word1)}).

        The live defaultdict is returned, not a copy.
        """
        return self.bigram_probs