In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.append('./preprocessing')
sys.path.append('./seq2seq')

In [4]:
from processor import Code_Intent_Pairs
from data import get_train_loader, get_test_loader

### Load Data

In [5]:
code_intent_pair = Code_Intent_Pairs()

In [6]:
path = 'vocab/'
code_intent_pair.load_dict(path)
special_symbols = code_intent_pair.get_special_symbols()
word_size = code_intent_pair.get_word_size()
code_size = code_intent_pair.get_code_size()

In [7]:
train_path = 'processed_corpus/train.json'
train_entries = code_intent_pair.load_entries(train_path)

In [8]:
word_size = code_intent_pair.get_word_size()
code_size = code_intent_pair.get_code_size()

In [9]:
class TrainSet():
    def __init__(self, code_intent_pair):
        self.code_intent_pair = code_intent_pair

    def __len__(self):
        return len(self.code_intent_pair)

    def __getitem__(self, idx):
        intent_idx = self.code_intent_pair[idx]['intent_indx']
        code_idx = self.code_intent_pair[idx]['code_indx_nocopy']
        return (intent_idx, code_idx)

In [10]:
trainset = TrainSet(train_entries)

### Get PMI

In [11]:
import math

In [12]:
pair_p = [[0] * code_size for i in range(word_size)]
word_p = [0] * word_size
code_p = [0] * code_size
unit_p = 1.0 / len(trainset)

In [13]:
for i, (intent, code) in enumerate(trainset):
    intent = set(intent)
    code = set(code)
    for word in intent:
        word_p[word] += unit_p
    for token in code:
        code_p[token] += unit_p
        
    for word in intent:
        for token in code:
            pair_p[word][token] += unit_p

In [62]:
pmi = [[0] * code_size for i in range(word_size)]
for i in range(word_size - 3):
    for j in range(code_size - 3):
        pmi[i][j] = math.log((pair_p[i][j] + 0.01 * unit_p) / (word_p[i] * code_p[j]))
        if pmi[i][j] < 1.5:
            pmi[i][j] = 0.0

### Test

In [63]:
word2num = code_intent_pair.word2num
code2num = code_intent_pair.code2num

In [64]:
pmi[word2num['numpy']][code2num['np']]

2.86213230714977

In [65]:
pmi[word2num['bash']][code2num['subprocess']]

3.5978404967809947

In [66]:
pmi[word2num['numpy']][code2num['os']]

0.0

In [67]:
pmi[word2num['url']][code2num['pd']]

0.0

### Store Results

In [68]:
import pickle
with open('pmi_matrix', 'wb') as f:
    pickle.dump(pmi, f)

In [1]:
with open('pmi_matrix', 'rb') as f:
    pmi = pickle.load(f)

NameError: name 'pickle' is not defined

In [213]:
new_pmi[word2num['url']][code2num['url']]

3.15892745460529