From Scracth Implementation 

In [1]:
import numpy as np
import pandas as pd
import os
import re
import string
import random

In [2]:
data_cwe = []
data_sw = []

with open("cwe-test.txt","r") as fileobj:
    for line in fileobj:  
       for i in line: 
           data_cwe+=i

with open("sw-test.txt","r") as fileobj:
    for line in fileobj:  
       for i in line: 
           data_sw+=i

In [3]:
def split_list(a_list):
    half = len(a_list)//2
    return a_list[:half], a_list[half:]

In [4]:
train_cwe, test_cwe = split_list(data_cwe)
train_sw, test_sw = split_list(data_sw)

In [5]:
print("length of training sets", len(train_cwe), len(train_sw))
print("length of test sets", len(test_cwe), len(test_sw))

length of training sets 30858 1725691
length of test sets 30859 1725692


In [6]:
import numpy as np
characters_cwe = np.append(np.unique(data_cwe),"<UNK>")
characters_sw = np.append(np.unique(data_sw),"<UNK>")

In [7]:
print(len(characters_cwe))
print(len(characters_sw))

33
50


In [8]:
def make_markov_model(data, n_gram):
    markov_model = {}
    for i in range(len(data)-n_gram-1):
        curr_state, next_state = "", ""
        for j in range(n_gram):
            curr_state += data[i+j] + ""
            next_state += data[i+j]+ data[j+n_gram] + ""
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1
    
    # calculating transition probabilities
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count/total
        
    return markov_model

In [9]:
train_bigram_model_cwe = make_markov_model(train_cwe,n_gram=2)
print("number of states in bigram model for CWE = ", len(train_bigram_model_cwe.keys()))

train_bigram_model_sw = make_markov_model(train_sw,n_gram=2)
print("number of states in bigram model for SW = ", len(train_bigram_model_sw.keys()))

number of states in bigram model for CWE =  31
number of states in bigram model for SW =  48


In [10]:
train_trigram_model_cwe = make_markov_model(train_cwe,n_gram=3)
print("number of states in trigram model for CWE = ", len(train_trigram_model_cwe.keys()))

train_trigram_model_sw = make_markov_model(train_sw,n_gram=3)
print("number of states in trigram model for SW = ", len(train_trigram_model_sw.keys()))

number of states in trigram model for CWE =  321
number of states in trigram model for SW =  1638


Probability function

In [11]:
def prob(model, word, phrase):
    if phrase in model[word]:
        return model[word][phrase]
    else:
        return 0

In [12]:
print(prob(train_bigram_model_cwe, 'y', 'yno'))
print(prob(train_bigram_model_sw, 'y', 'yia'))
print(prob(train_trigram_model_cwe, 'yu', 'ygu'))
print(prob(train_trigram_model_sw, 'yu', 'y uil'))

0.21888888888888888
0.5784071199067361
0
0.10305343511450382


In [18]:
train_bigram_model_cwe['s']['sna']

0.27358490566037735

Prediction function

In [13]:
def get_prediction(model, phrase):
    return max(model[phrase], key=model[phrase].get)

In [14]:
print(get_prediction(train_bigram_model_cwe, 's'))
print(get_prediction(train_bigram_model_sw, 's'))
print(get_prediction(train_trigram_model_cwe, 'yu'))
print(get_prediction(train_trigram_model_sw, 'yu'))

sna
sih
ygu' 
y uim


Testing data

In [15]:
test_bigram_model_cwe = make_markov_model(train_cwe,n_gram=2)
print("number of states in bigram model for CWE = ", len(test_bigram_model_cwe.keys()))

test_bigram_model_sw = make_markov_model(train_sw,n_gram=2)
print("number of states in bigram model for SW = ", len(test_bigram_model_sw.keys()))

number of states in bigram model for CWE =  31
number of states in bigram model for SW =  48


In [16]:
test_trigram_model_cwe = make_markov_model(train_cwe,n_gram=3)
print("number of states in bigram model for CWE = ", len(test_trigram_model_cwe.keys()))

test_trigram_model_sw = make_markov_model(train_sw,n_gram=3)
print("number of states in bigram model for SW = ", len(test_trigram_model_sw.keys()))

number of states in bigram model for CWE =  321
number of states in bigram model for SW =  1638


Cross entropy

In [40]:
def get_entropy(model,word):
    from math import log2
    entropy = 0
    entropy = entropy -  (1/(len(model[word]))) * log2(model[word][max(model[word], key=model[word].get)])
    return entropy

In [47]:
for a in test_bigram_model_cwe:
  print("entropy for character ", a , " is ", get_entropy(test_bigram_model_cwe,a))

entropy for character  s  is  0.2671342084908039
entropy for character  a  is  0.04288441804326887
entropy for character  n  is  0.16545138973796358
entropy for character  g  is  0.2692457278453316
entropy for character  '  is  0.0444541286458747
entropy for character  h  is  0.2931653167493675
entropy for character  i  is  0.08823618128957335
entropy for character  l  is  0.2832248605361031
entropy for character  e  is  0.07099449489127796
entropy for character     is  0.10562381930888498
entropy for character  v  is  0.10317839250690342
entropy for character  o  is  0.06772834919640801
entropy for character  c  is  0.0
entropy for character  z  is  0.24250892707602018
entropy for character  m  is  0.07763665294583462
entropy for character  w  is  0.17316117654997756
entropy for character  y  is  0.15463745145599614
entropy for character  ,  is  0.004069685898405805
entropy for character  k  is  0.2787251168589978
entropy for character  u  is  0.08891455869433039
entropy for character

In [48]:
for a in test_bigram_model_sw:
  print("entropy for character ", a , " is ", get_entropy(test_bigram_model_sw,a))

entropy for character  h  is  0.029120282094166113
entropy for character  i  is  0.04588161833413501
entropy for character     is  0.05460868785062529
entropy for character  l  is  0.02852534816652356
entropy for character  m  is  0.040824554138834145
entropy for character  a  is  0.029372821333132734
entropy for character  n  is  0.038446934728809695
entropy for character  s  is  0.048523040452652745
entropy for character  k  is  0.04426551494009477
entropy for character  w  is  0.010067149041992286
entropy for character  b  is  0.03441877306951064
entropy for character  u  is  0.062356643632097745
entropy for character  r  is  0.04267572554922145
entropy for character  t  is  0.04328691118032263
entropy for character  e  is  0.04856397733983977
entropy for character  f  is  0.04583938368215769
entropy for character  c  is  0.012486761415604857
entropy for character  y  is  0.022566936674963258
entropy for character  o  is  0.03485702316505044
entropy for character  ,  is  0.004055651

In [49]:
for a in test_trigram_model_cwe:
  print("entropy for character ", a , " is ", get_entropy(test_trigram_model_cwe,a))

entropy for character  sa  is  0.09968416183086115
entropy for character  an  is  0.19772799234999167
entropy for character  ng  is  0.24123423920116974
entropy for character  g'  is  0.03874366499408522
entropy for character  'h  is  0.1854975000431264
entropy for character  ha  is  0.10187373527093316
entropy for character  ni  is  0.07654866571518548
entropy for character  il  is  0.25578154903998873
entropy for character  le  is  0.058656917657945354
entropy for character  en  is  0.19492842607675231
entropy for character  i   is  0.13750108734958263
entropy for character   v  is  0.08945315764647344
entropy for character  vi  is  0.08473956419917537
entropy for character  in  is  0.16011028549208034
entropy for character  no  is  0.07934129218707668
entropy for character  og  is  0.29425579106049615
entropy for character  gi  is  0.10659533505585048
entropy for character  e   is  0.12618160214624402
entropy for character   c  is  0.0
entropy for character  ch  is  0.12109638649239

In [50]:
for a in test_trigram_model_sw:
  print("entropy for character ", a , " is ", get_entropy(test_trigram_model_sw,a))

entropy for character  hi  is  0.05929577185142063
entropy for character  ii  is  0.022359765140621515
entropy for character  i   is  0.05806227814617977
entropy for character   i  is  0.05039287917340695
entropy for character  il  is  0.025383751956993057
entropy for character  li  is  0.05382755740733817
entropy for character  im  is  0.07003730720146484
entropy for character  ma  is  0.047447132767763064
entropy for character  aa  is  0.05923654928867682
entropy for character  an  is  0.05180541796292349
entropy for character  ni  is  0.018489572784543076
entropy for character  is  is  0.03428720359649864
entropy for character  sh  is  0.037778623789862895
entropy for character  ha  is  0.05616259056917049
entropy for character  a   is  0.050114191962309255
entropy for character   k  is  0.04929383804624905
entropy for character  kw  is  0.03008602939149964
entropy for character  wa  is  0.02589821569950909
entropy for character  am  is  0.05521989911025653
entropy for character  mb