From Scracth Implementation 

In [36]:
import numpy as np
import pandas as pd
import os
import re
import string
import random

In [37]:
data_cwe = []
data_sw = []

with open("cwe-test.txt","r") as fileobj:
    for line in fileobj:  
       for i in line: 
           data_cwe+=i

with open("sw-test.txt","r") as fileobj:
    for line in fileobj:  
       for i in line: 
           data_sw+=i

In [38]:
def split_list(a_list):
    half = len(a_list)//2
    return a_list[:half], a_list[half:]

In [39]:
train_cwe, test_cwe = split_list(data_cwe)
train_sw, test_sw = split_list(data_sw)

In [40]:
print("length of training sets", len(train_cwe), len(train_sw))
print("length of test sets", len(test_cwe), len(test_sw))

length of training sets 30858 1725691
length of test sets 30859 1725692


In [41]:
import numpy as np
characters_cwe = np.append(np.unique(data_cwe),"<UNK>")
characters_sw = np.append(np.unique(data_sw),"<UNK>")

In [42]:
print(len(characters_cwe))
print(len(characters_sw))

33
50


In [43]:
def make_markov_model(data, n_gram):
    markov_model = {}
    for i in range(len(data)-n_gram-1):
        curr_state, next_state = "", ""
        for j in range(n_gram):
            curr_state += data[i+j] + ""
            next_state += data[i+j]+ data[j+n_gram] + ""
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1
    
    # calculating transition probabilities
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count/total
        
    return markov_model

In [44]:
train_bigram_model_cwe = make_markov_model(train_cwe,n_gram=2)
print("number of states in bigram model for CWE = ", len(train_bigram_model_cwe.keys()))

train_bigram_model_sw = make_markov_model(train_sw,n_gram=2)
print("number of states in bigram model for SW = ", len(train_bigram_model_sw.keys()))

number of states in bigram model for CWE =  31
number of states in bigram model for SW =  48


In [45]:
train_trigram_model_cwe = make_markov_model(train_cwe,n_gram=3)
print("number of states in trigram model for CWE = ", len(train_trigram_model_cwe.keys()))

train_trigram_model_sw = make_markov_model(train_sw,n_gram=3)
print("number of states in trigram model for SW = ", len(train_trigram_model_sw.keys()))

number of states in trigram model for CWE =  321
number of states in trigram model for SW =  1638


Probability function

In [46]:
def prob(model, word, phrase):
    if phrase in model[word]:
        return model[word][phrase]
    else:
        return 0

In [47]:
print(prob(train_bigram_model_cwe, 'y', 'yno'))
print(prob(train_bigram_model_sw, 'y', 'yia'))
print(prob(train_trigram_model_cwe, 'yu', 'ygu'))
print(prob(train_trigram_model_sw, 'yu', 'y uil'))

0.21888888888888888
0.5784071199067361
0
0.10305343511450382


In [48]:
train_bigram_model_cwe['s']['sna']

0.27358490566037735

Prediction function

In [49]:
def get_prediction(model, phrase):
    return max(model[phrase], key=model[phrase].get)

In [50]:
print(get_prediction(train_bigram_model_cwe, 's'))
print(get_prediction(train_bigram_model_sw, 's'))
print(get_prediction(train_trigram_model_cwe, 'yu'))
print(get_prediction(train_trigram_model_sw, 'yu'))

sna
sih
ygu' 
y uim


Testing data

In [51]:
test_bigram_model_cwe = make_markov_model(train_cwe,n_gram=2)
print("number of states in bigram model for CWE = ", len(test_bigram_model_cwe.keys()))

test_bigram_model_sw = make_markov_model(train_sw,n_gram=2)
print("number of states in bigram model for SW = ", len(test_bigram_model_sw.keys()))

number of states in bigram model for CWE =  31
number of states in bigram model for SW =  48


In [52]:
test_trigram_model_cwe = make_markov_model(train_cwe,n_gram=3)
print("number of states in bigram model for CWE = ", len(test_trigram_model_cwe.keys()))

test_trigram_model_sw = make_markov_model(train_sw,n_gram=3)
print("number of states in bigram model for SW = ", len(test_trigram_model_sw.keys()))

number of states in bigram model for CWE =  321
number of states in bigram model for SW =  1638


Cross entropy

In [78]:
from math import log2

def get_entropy(model):
    entropy = 0
    for a in model:
        entropy = entropy -  (1/(len(model[a]))) * log2(model[a][max(model[a], key=model[a].get)])
    return entropy



In [79]:
get_entropy(test_bigram_model_cwe)

5.2302390076925365

In [80]:
get_entropy(test_bigram_model_sw)

2.085781017037768

In [81]:
get_entropy(test_trigram_model_cwe)

50.62268218907994

In [82]:
get_entropy(test_trigram_model_sw)

233.7896078707257