In [1]:
import re
import string
from time import time 
import math as m

import torch
import torch.nn as nn
from tqdm import tqdm
from torchsummary import summary

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# will come into use for nltk word_tokenizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### ***TEXT PREPOCESSSING***
- Figuring out all the messages by me only.
- Remove all the emojis.
- Remove all the links.
- Convert all strings to lower.
- Remove all the numbers.
- Tokenize the sentences.

In [2]:
# stack overflow zindabaad
# to remove all the emojis from a string 
def deEmojify(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)

In [3]:
def preprocessing(string):
    string = string.lower()
    # i have to remove links in the message if any
    string = re.sub(r'https\S+', '', string)  # it will remove all of the link as we know there is no whitespace in the link
    # i have to remove emojis if any 
    string = deEmojify(string)
    # i have to remove numerics if any
    string = re.sub(r'[0-9]', '', string)
    # to remove spaces at the start and end of the sentences
    string = string.strip() 
    # now we must toeknize sentence
    return nltk.word_tokenize(string)

In [4]:
# dummy code
def PREPROCESSED_DATA(data):
    data = data.split('\n')
    #data = [data]  # wapas hatana hai
    # keval mere likhne ka pattern hi check karna hai mujhe isliye main keval meri chats hi utha raha hoon, although chats realted
    # hain but main maan kar chal raha hoon ki chats independent hain (matlab pichle message par depend karta hai ki aage kya likhne waala hoon)    
    required_messages = []
    for s in data:
        idx = s.find('-')
        temp = s[idx+1:]
        #print(f"s = {s}\ntemp = {temp}")
        idx = temp.find(':')
        #print(f"temp = {temp}\n\n\n")
        if idx!=-1 and not temp.endswith('>'):  # considering my messages only and ignoring the messages in which media were sent
            temp = temp[idx+1:]
            required_messages.append(temp)  # +14 isliye taaki naam aur ': ' dono avoid ho jaaye aur keval sentence hi mile
        # now that data is ready i have to preprocess it
    
    preprocessed_data = []
    # now we have to preprocess each message
    for message in required_messages:
        temp = preprocessing(message)
        if len(temp)>0:
            preprocessed_data.append(temp)
    return preprocessed_data

In [None]:
# isme keval khud ki chats hi daali hain (as told by binod)
def PREPROCESSED_DATA(data):
    data = data.split('\n')
    # keval mere likhne ka pattern hi check karna hai mujhe isliye main keval meri chats hi utha raha hoon, although chats realted
    # hain but main maan kar chal raha hoon ki chats independent hain (matlab pichle message par depend karta hai ki aage kya likhne waala hoon)    
    required_messages = []
    for s in data:
        temp = s.find('Binod')
        if temp!=-1 and not s.endswith('>'):  # considering my messages only and ignoring the messages in which media were sent
            required_messages.append(s[temp + 7:])  # +14 isliye taaki naam aur ': ' dono avoid ho jaaye aur keval sentence hi mile
        # now that data is ready i have to preprocess it
    
    preprocessed_data = []
    # now we have to preprocess each message
    for message in required_messages:
        temp = preprocessing(message)
        if len(temp)>0:
            preprocessed_data.append(temp)
    return preprocessed_data

In [5]:
path = "WhatsApp Chat with IIT BHU Mnc 3rd year.txt"
# now collecting the data and will try if it work with my way of talking 
with open(path, 'r', encoding='utf-8') as f:
    data = f.read()
preprocessed_data = PREPROCESSED_DATA(data)

In [6]:
for i in range(20):
    print(f'{i+1}: {preprocessed_data[i]}')

1: ['this', 'message', 'was', 'deleted']
2: ['st', 'msg']
3: ['welcome', 'gumys']
4: ['we', 'are', 'here', 'the', 'raise', 'the', 'voice', 'against', 'the', 'repression']
5: ['welcome', 'to', 'mnc']
6: ['++']
7: ['this', 'message', 'was', 'deleted']
8: ['++']
9: ['++']
10: ['this', 'message', 'was', 'deleted']
11: ['this', 'message', 'was', 'deleted']
12: ['this', 'message', 'was', 'deleted']
13: ['this', 'message', 'was', 'deleted']
14: ['this', 'message', 'was', 'deleted']
15: ['this', 'message', 'was', 'deleted']
16: ['smashing', 'nepotism']
17: ['smashing', 'racism']
18: ['smashing', 'misogyny']
19: ['smashing', 'autocracy']
20: ['smashing', 'masochism']


### ***Splitting the dataset***

In [7]:
# now that preprocessing is done, we have to split the text into training and test sets assuming that my text in independent of the order  
def splits(preprocesed_data, train_size=0.95):
    np.random.seed(0)  # taaki har baar ek hi split aaye
    np.random.shuffle(preprocesed_data)
    index = int(len(preprocesed_data)*train_size)
    train_data = preprocesed_data[:index]
    test_data = preprocesed_data[index:]
    return train_data, test_data

In [8]:
train_size = 1.0 # data kam hai abhi
train_data, test_data = splits(preprocessed_data, train_size=train_size)
print(len(train_data), len(test_data))

15001 0


### ***Vocab Building***
- We know that there can be many words in the testing phase which are not in vocabulary and we have to make our model robust of it, so we will add a minimum frequencey barrier in the vocabulary built by training data through which only words having higher frequency will be selected.
- Then we will replace words in preprocessed_data by '<unk>' which are not present in the closed vocabulary

In [9]:
# now that splits are done, we have to first form the vocabulary of words
# then set a frequencey of words under which no words will be in vocabulary
# replace our preprocessed_sets with <unk>
def build_vocab(preprocessed_data, min_freq=2):
    vocab = {}
    closed_vocab = {}
    for message in preprocessed_data:
        for word in message:
            if word in vocab.keys():
                vocab[word]+=1
            else:
                vocab[word] = 1
    # now we have to filter out the words in the vocabulary
    for word, count in vocab.items():
        if count>min_freq:
            closed_vocab[word] = count
    
    return vocab, closed_vocab

In [10]:
min_freq = 1 
vocab, closed_vocab = build_vocab(train_data, min_freq=min_freq)
# this is a not a good dataset as as almost half of the words are just appearing for 1 time just because of different forms
#(singular or plural, gender (acccha ya achchi))
len(vocab), len(closed_vocab)

(7181, 3453)

In [11]:
def unk_preprocessing(train_data, closed_vocab, unknown_token = '<unk>'):
    # have to do preprocessing again, replace many of the words by <unk>
    train_data_unk = []
    # first for training data
    for message in train_data:
        temp = []
        for word in message:
            if word in closed_vocab.keys():
                temp.append(word)
            else:
                temp.append(unknown_token)
        train_data_unk.append(temp)
    
    return train_data_unk

In [12]:
# clearly need much more data (itne saare unks hain kisi kisi statement main), par experiment ke liye abhi yahi dataset se
# aage badh raha hoon
train_data_unk = unk_preprocessing(train_data, closed_vocab)
test_data_unk = unk_preprocessing(test_data, closed_vocab)

### ***N-grams***
 - Here, we will have the markov assumption that the next word of the model will depend upon last n words words.
 - We will implement the sliding windows and build the n_gram_vocab where the key will be a tuple and its value will be number of counts of that phrase.
 - Then we have to implement a function for probability of a word occuring given the last n_gram (here we will be implementing a bigram model, i.e., the upcoming word will depend on the last two words).
 - We have to also consider k-smoothing so that, there is atleast some probability of every word occuring next.

In [13]:
def n_gram(train_data_unk, window_len=2):
    counts = {}
    for message in train_data_unk:
        temp =  ['<s>']*(window_len-1)  + message + ['<e>']
        for i in range(0, len(temp)-window_len+1):
            phrase = tuple(temp[i:i+window_len]) # as list is unhashable type
            if phrase in counts.keys():
                counts[phrase]+=1
            else:
                counts[phrase] = 1
    return counts

In [14]:
n = 2  # we are working with biagrams, i.e., the next word will be predicted just on the just the previous word only 
n_gram_counts = n_gram(train_data_unk, window_len=n-1)
n_plus_one_gram_counts = n_gram(train_data_unk, window_len=n)
print(f"{len(n_gram_counts)}, {len(n_plus_one_gram_counts)}")

3455, 29878


In [15]:
def calc_prob(previous_n_gram, word, n_gram_counts=n_gram_counts, 
               n_plus_one_gram_counts = n_plus_one_gram_counts, k = 1.0, vocab_size=len(closed_vocab)): 
    prev = tuple(previous_n_gram)
    c1 = n_gram_counts[prev] if prev in n_gram_counts.keys() else 0
    prev = prev + (word, )
    c2 = n_plus_one_gram_counts[prev] if prev in n_plus_one_gram_counts.keys() else 0
    prob = (c2 + k)/(c1 + k*vocab_size)
    return prob

In [16]:
# "mujhe pata" is more likely than "mujhe main", let us see if it is reflected in the same
a = calc_prob(['mujhe'], 'pata')
b = calc_prob(['mujhe'], 'main')
c = calc_prob(['mujhe'], 'nahin')
print(f"Probability of phrase a occuring is {a:.6f}\nProbability of phrase b occuring is {b:.6f}\nProbability of phrase c occuring os {c:.6f}")

Probability of phrase a occuring is 0.000850
Probability of phrase b occuring is 0.000283
Probability of phrase c occuring os 0.000283


In [17]:
#  now we have to find the most probable word for a previous_n_gram_token
def max_prob(previous_n_gram, 
             vocab = closed_vocab, n_gram_counts=n_gram_counts, n_plus_one_gram_counts=n_plus_one_gram_counts,
             vocab_size=len(closed_vocab), k=1.0, best_n_words = 5):
    best_words = {}
    all_words = list(vocab.keys()) + ['<unk>', '<e>']
    probs = [calc_prob(previous_n_gram, word) for word in all_words] 
    indices = np.argsort(probs)
    indices = indices[::-1]
    # best n words
    for i in range(best_n_words):
        best_words[all_words[indices[i]]] = probs[indices[i]]
    
    return best_words

In [18]:
best_words = max_prob(['mujhe'])
best_words

{'bhi': 0.0031179138321995466,
 '<e>': 0.002551020408163265,
 'to': 0.0022675736961451248,
 'toh': 0.0017006802721088435,
 'laga': 0.0011337868480725624}

In [19]:
def predicted_sentence(initial_line, n = 2, length = 5):  # length means that we have to extend the intial sequence upto this more words
    # preprocessing the statement first
    prep_line = preprocessing(initial_line) # will return list of tokens
    prep_line = [prep_line] # have to do this as the structure of unk_preprocessing demands this
    prep_line = unk_preprocessing(prep_line, closed_vocab)
    prep_line = prep_line[0]
    prep_line = ['<s>']*(n-1) + prep_line # not appending the <e> token
   # print(f"prep_line = {prep_line}\n")
    
    # now i have to apply the n-gram predictive model
    prev_n_gram = tuple(prep_line[-n+1:])  # picking up the last n-1 words 
    # as we know that in python our dictionary remains in the order the keys were inserted, so dictionary is already sorted
    temp = initial_line
    for i in range(length):
        best_words = max_prob(prev_n_gram, best_n_words=3)
        #print(f"prev_n_gram = {prev_n_gram}\nbest_words = {best_words}\n\n\n")
        for word, prob in best_words.items():
            if i==length-1:  # it is the last word, then we can end out at anyth
                temp+= " "  + word
                prev_n_gram = prev_n_gram[1:] + (word, )
                break
            elif word!='<unk>' and word!='<e>':
                #print("word = ", word)
                temp+= " " + word
                prev_n_gram = prev_n_gram[1:] + (word, )
                break
                
    return temp   


In [36]:
closed_vocab['lawde']

13

In [55]:
predicted_sentence("bsdk", length=6)

'bsdk tune bataya nhi hai ? <e>'

### ***Log Perplexity Score***
- It is a metric which have a meaning that how likely is the following sentence to occur.
- Lower the value of the metric implies that great our model is doing.


In [56]:
def log_perplexity(string, n=2):
    prep_line = preprocessing(string) # will return list of tokens
    prep_line = [prep_line] # have to do this as the structure of unk_preprocessing demands this
    prep_line = unk_preprocessing(prep_line, closed_vocab)
    prep_line = prep_line[0]
    prep_line = ['<s>']*(n-1) + prep_line # not appending the <e> token
    
    score = 0
    for i in range(0, len(prep_line)-n+1):
        score+= m.log(calc_prob(prep_line[i:i+n-1], prep_line[i+n-1]))
    score*= (-1/len(prep_line))
    return score

In [57]:
# the second sentence should have the higher perplexity 
print(log_perplexity('yr ye song sun na'))
print(log_perplexity('mujhe main sun song na'))

6.389004167882511
6.2422945639222185
