In [1]:
import torch
import torch.nn.functional as F
import heapq
import torchtext
import scipy
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from collections import Counter
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 4)

import time
import random

from device import dev
from path import Path, Log_Path
from dataset import dataset
from model import RNN
from preprocess import get_transitions, add_transitions

from util import get_matrices
from util import blank_filling, identical_filling, empirical_filling, weighted_filling, near_filling,uniform_filling
from util import none_regularization, linear_regularization, strong_linear_regularization

from synonym import get_synonym

from evaluation import evaluation

if __name__ == '__main__':
    # End-to-end workflow: load a trained RNN and its dataset, record the RNN's
    # test-set predictions, build state-transition counts from the training set,
    # evaluate the resulting transition matrices, then repeat the evaluation
    # after each round of synonym-based augmentation of the training data.
    # The names bound here (transition_matrices, state_weightes, kmeans,
    # all_prediction_container, train_dataset, ...) are read by the later cells.

    # select the dataset. options: 'news', 'toxic'
    DATASET = 'news'

    # select the clusters number
    CLUSTER = 40

    # select the completion and regularization tactics
    # (every completion/regularization pair from these two lists is evaluated)
    COMPLETION = [weighted_filling]
    REGULARIZATION = [linear_regularization]

    # NUM_EPOCHS: number of augmentation rounds run over the training set.
    # REPLACE_RATE: per-word probability of attempting a synonym replacement.
    # DROPOUT: probability of overwriting a word with 0 when the replacement
    #          branch is not taken.
    NUM_EPOCHS = 2
    REPLACE_RATE = 0.4
    DROPOUT = 0.2

    start_time = time.time()
    # load model and dataset
    train_dataset = dataset(DATASET, True)
    test_dataset = dataset(DATASET, False)
    # NOTE(review): torch.load unpickles the file, so only load trusted checkpoints.
    model = torch.load(Path+DATASET+'_model.pth')
    model.eval()
    vocab_num = len(train_dataset.vocab)
    # One state more than the number of clusters; the later cells skip index 0
    # of the state axis — presumably a start/initial state, TODO confirm.
    state_num = CLUSTER + 1
    print(f'vocab: {vocab_num}')
    print(f'data number: {len(train_dataset.int_data)}')
    print(f'Model and dataset ready. Use time:{time.time()-start_time:.1f}')

    current_time = time.time()
    # get rnn prediction in test set
    rnn_prediction_container = []
    for idx, data in enumerate(test_dataset.int_data):
        # remove 0 at the end (presumably padding), always keeping at least one token
        while len(data) > 1 and data[-1] == 0:
            data = data[0:len(data)-1]
        # column layout: one token per row
        data = data.reshape(-1, 1)

        model.clear_output_sequence()
        _ = model(data)
        runtime_predict = model.runtime_predict()
        runtime_data = []
        for step_data in runtime_predict:
            # turn each step's output into a probability vector
            step_data = step_data.flatten().detach()
            runtime_prediction = F.softmax(step_data,dim=0)
            runtime_data.append(runtime_prediction.reshape(1, -1))
        runtime_data = torch.concat(runtime_data, dim=0)
        # the RNN's label for the sample is the argmax of the last step
        rnn_prediction = torch.argmax(runtime_data[-1])
        rnn_prediction_container.append(rnn_prediction)

    transition_count, kmeans, state_weightes, all_prediction_container = get_transitions(model, train_dataset, CLUSTER)
    print(f'Transitions ready. Use time:{time.time()-current_time:.1f}')

    # generate state distance
    # Despite the name, the final value is exp(-||w_p - w_q||^2): 1 on the
    # diagonal and decreasing as the two state weight vectors move apart.
    state_distance = torch.zeros((state_num, state_num),device=dev())
    for p in range(state_num):
        for q in range(state_num):
            diff = state_weightes[p] - state_weightes[q]
            state_distance[p, q] = (diff * diff).sum()
    state_distance = torch.exp(state_distance)
    state_distance = 1 / state_distance
    

    # Baseline evaluation (before augmentation): one table cell per
    # completion/regularization pair, stored as a percentage with 2 decimals.
    result = np.zeros((len(COMPLETION), len(REGULARIZATION)))
    completion_names = [c.__name__ for c in COMPLETION]
    regularization_names = [r.__name__ for r in REGULARIZATION]
    for i, completion in enumerate(COMPLETION):
        for j, regularization in enumerate(REGULARIZATION):
            current_time = time.time()

            transition_matrices = get_matrices(transition_count, state_distance, completion, regularization)
            correct_rate = evaluation(test_dataset, transition_matrices, state_weightes, rnn_prediction_container)
            result[i,j] = round(correct_rate*100, 2)
            print(f'{completion.__name__} & {regularization.__name__} : {round(correct_rate*100, 2)}%, {time.time() - current_time:.0f}s')
            
    result = pd.DataFrame(result, columns=regularization_names, index=completion_names)
    print(result)
    print(f'Evaluation done.')
    
    all_synonym = torch.load(Path+DATASET+'_synonym.pth')
    '''
    all_synonym is a tensor with size (vocab_num, m),
    where m is the number of synonym for each word.
    The [i,j]-th item of all_synonym indicates the j-th synonym of i-th word.
    If some word doe NOT have synonym, the i-th row will be filled with -1.
    '''
    current_time = time.time()
    for epoch in range(NUM_EPOCHS):
        current_time = time.time()
        all_data = []
        for id, data in enumerate(train_dataset.int_data):
            # remove 0 at the end
            while len(data) > 1 and data[-1] == 0:
                data = data[0:len(data)-1]
            # NOTE(review): if int_data rows are tensors/arrays, `data` is a view
            # of the stored row, so the writes below also modify
            # train_dataset.int_data and later epochs start from already-augmented
            # text. Confirm this is intended (see the disabled clone below).
            #ori_data = data.clone()
            for idx, word in enumerate(data):
                # Replacement is only attempted for word indices in the first
                # fifth of the vocabulary — presumably the most frequent words,
                # TODO confirm against how the vocabulary is ordered.
                if random.random() < REPLACE_RATE and word < (vocab_num/5):
                    # pick one of synonym columns 1..4 (randint is inclusive);
                    # assumes all_synonym has at least 5 columns — TODO confirm
                    i = random.randint(1, 4)
                    # only column 0 is checked for the "no synonym" marker (-1);
                    # the chosen column itself is not checked
                    if all_synonym[word, 0].item() != -1:
                        data[idx] = all_synonym[word, i].item()
                # a second, independent draw decides whether to blank the word
                elif random.random() < DROPOUT:
                    data[idx] = 0
            all_data.append(data)
        # fold the augmented sequences into the running transition counts
        transition_count = add_transitions(model,all_data,transition_count,kmeans)
        print(f'new transition count ready. Use time:{time.time()-current_time:.1f}')
        # `% 1` is always 0, so the evaluation runs after every epoch
        if (epoch+1) % 1 == 0:
            result = np.zeros((len(COMPLETION), len(REGULARIZATION)))
            for i, completion in enumerate(COMPLETION):
                for j, regularization in enumerate(REGULARIZATION):
                    current_time = time.time()

                    transition_matrices = get_matrices(transition_count, state_distance, completion, regularization)
                    correct_rate = evaluation(test_dataset, transition_matrices, state_weightes, rnn_prediction_container)
                    result[i,j] = round(correct_rate*100, 2)  
                    print(f'{completion.__name__} & {regularization.__name__} : {round(correct_rate*100, 2)}%, {time.time() - current_time:.0f}s')

            result = pd.DataFrame(result, columns=regularization_names, index=completion_names)
            print('-'*100)          
            print(f'epoch {epoch+1}: ')
            print(result)
            current_time = time.time()
    
    print(f'Workflow done. Use time:{time.time()-start_time:.1f}')

vocab: 20321
data number: 22808
Model and dataset ready. Use time:6.8
Transitions ready. Use time:164.4
weighted_filling & linear_regularization : 81.68%, 280s
                  linear_regularization
weighted_filling                  81.68
Evaluation done.
new transition count ready. Use time:115.6
weighted_filling & linear_regularization : 82.08%, 277s
----------------------------------------------------------------------------------------------------
epoch 1: 
                  linear_regularization
weighted_filling                  82.08
new transition count ready. Use time:113.2
weighted_filling & linear_regularization : 82.13%, 253s
----------------------------------------------------------------------------------------------------
epoch 2: 
                  linear_regularization
weighted_filling                  82.13
Workflow done. Use time:1210.4


In [3]:
# Relative frequency of every k-means cluster over the collected predictions.
kmeans_prediction = kmeans.predict(all_prediction_container)
# bincount with minlength keeps a slot (frequency 0) for clusters that received
# no sample, so position c of weight_km always refers to cluster c. Counting only
# the labels that occur would shorten the vector and shift later clusters.
cluster_counts = np.bincount(kmeans_prediction, minlength=CLUSTER)
total = int(cluster_counts.sum())
# cluster id -> relative frequency, in ascending cluster order
frequencies = {cluster: count / total for cluster, count in enumerate(cluster_counts.tolist())}
weight_km = torch.tensor(list(frequencies.values())) # Frequency of each cluster

In [12]:
#Utils for part I
def word2index(word): # input word in str format. return index.
    """Return the vocabulary index of ``word``.

    The lookup goes through ``train_dataset.vocab`` directly rather than calling
    the unbound ``torchtext.vocab.Vocab.get_stoi`` with the dataset as ``self``,
    which only worked because that method forwards to ``self.vocab``.

    Raises ``KeyError`` when ``word`` is not in the token-to-index mapping.
    """
    return train_dataset.vocab.get_stoi()[word]
    
def index2word(index): # reverse the operation above.
    """Return the token stored at position ``index`` of the vocabulary.

    The lookup goes through ``train_dataset.vocab`` directly rather than calling
    the unbound ``torchtext.vocab.Vocab.get_itos`` with the dataset as ``self``,
    which only worked because that method forwards to ``self.vocab``.
    """
    return train_dataset.vocab.get_itos()[index]
    
def influence(word): # input an index
    """Return the influence vector of the word with vocabulary index ``word``.

    Starting from the cluster frequencies ``weight_km``, one step through the
    word's transition matrix gives a new state distribution. The result is the
    sum of the state weight vectors, each scaled by how much the word changed
    that state's mass (after-step value minus ``weight_km``). Index 0 of the
    state axis is skipped everywhere (presumably the extra non-cluster state;
    TODO confirm).

    Reads the globals ``transition_matrices``, ``state_weightes`` and
    ``weight_km``; none of them is modified. The returned tensor lives on the
    device of ``state_weightes``.
    """
    start_dist = weight_km.to('cpu')
    # rows 1.. : transitions leaving each of the clustered states
    mat_word = transition_matrices[word][1:].to('cpu')
    state_w = state_weightes[1:]
    # distribution after one step, again without state 0
    out_state = torch.matmul(start_dist, mat_word)[1:]
    # per-state change in mass, shaped to broadcast over the trailing axes of state_w
    shift = (out_state - start_dist).to(device=state_w.device, dtype=state_w.dtype)
    shift = shift.reshape(-1, *([1] * (state_w.dim() - 1)))
    return torch.sum(state_w * shift, dim=0) # Influence vector.

def topk_influence(goal_class = 0,k = 10): # Return top_k influence word list on goal_class.
    """Return the ``k`` words whose influence on ``goal_class`` is largest.

    Every index of ``transition_matrices`` is scored with
    ``influence(index)[goal_class]``; the words are returned in order of
    decreasing score.
    """
    scores = [influence(word_idx)[goal_class] for word_idx in range(len(transition_matrices))]
    best_indices = heapq.nlargest(k, range(len(scores)), scores.__getitem__)
    return [index2word(word_idx) for word_idx in best_indices]

# Test: with the defaults this prints the 10 words with the largest influence on class 0.
print(topk_influence())

['players', 'basketball', 'celtics', 'knicks', 'bruins', 'lakers', 'yankees', 'soccer', 'coach', 'mets']


In [30]:
#Utils for part II
def diff_word(w1,w2,mode = '2-norm'): # mode chosen from 'kl' and '2-norm'
    """Return a dissimilarity between the words with indices ``w1`` and ``w2``.

    mode == '2-norm': sum of squared differences between the two words'
        transition matrices (returned as a 0-dim tensor).
    mode == 'kl': each word is mapped to a vector by starting from the cluster
        frequencies ``weight_km``, taking one step through the word's transition
        matrix and mixing the state weight vectors with the resulting state
        masses (index 0 of the state axis is skipped). The value returned is
        ``scipy.stats.entropy`` of the first vector relative to the second,
        i.e. their KL divergence after scipy normalises both.

    Raises ``ValueError`` for any other ``mode`` instead of returning ``None``.
    Reads the globals ``transition_matrices``, ``state_weightes`` and
    ``weight_km`` without modifying them.
    """
    if mode == '2-norm':
        return torch.sum((transition_matrices[w1] - transition_matrices[w2])**2)
    if mode == 'kl':
        start_dist = weight_km.to('cpu')
        state_w = state_weightes[1:]

        def class_vector(word_idx):
            # state masses after one step through the word's matrix, without state 0
            mass = torch.matmul(start_dist, transition_matrices[word_idx].to('cpu')[1:])[1:]
            mass = mass.to(device=state_w.device, dtype=state_w.dtype)
            mass = mass.reshape(-1, *([1] * (state_w.dim() - 1)))
            return torch.sum(state_w * mass, dim=0).cpu().numpy()

        return scipy.stats.entropy(class_vector(w1), class_vector(w2))
    raise ValueError(f"unknown mode {mode!r}; expected '2-norm' or 'kl'")
    
def get_nearest(word,mode = '2-norm',search_size = 2000):
    """Return the word closest to ``word`` according to ``diff_word``.

    word: the query word as a string.
    mode: forwarded to ``diff_word`` ('2-norm' or 'kl').
    search_size: only vocabulary indices ``0 .. search_size-1`` are considered.
        The default keeps the original limit of 2000; it is clamped to the
        number of available transition matrices so a small vocabulary cannot
        cause an out-of-range index.

    The query word itself is never returned. If no candidate has a distance
    below the initial bound of 1e6, the word at index 0 is returned.
    """
    goal_w = word2index(word)
    idx = 0
    dist = 1e6
    for candidate in range(min(search_size, len(transition_matrices))):
        if candidate == goal_w:
            continue
        # evaluate the distance once per candidate
        candidate_dist = diff_word(goal_w,candidate,mode)
        if candidate_dist < dist:
            idx = candidate
            dist = candidate_dist
    return index2word(idx)

# Test: print the word nearest to 'basketball' under the 'kl' distance.
print(get_nearest('basketball','kl'))

soccer


In [64]:
# word2vec process
from sklearn.decomposition import PCA
def word2vec(tensor_in):
    """Reduce every entry of ``tensor_in`` to a 2-component PCA embedding.

    A CPU copy of the input is taken, each entry along the first axis is
    flattened to a vector, and a 2-component PCA is fitted on those vectors.
    Returns the transformed coordinates as a tensor with one row per entry.
    """
    cpu_copy = torch.clone(tensor_in).to('cpu')
    flat_rows = [torch.flatten(entry).numpy() for entry in cpu_copy]
    reducer = PCA(n_components=2)
    return torch.from_numpy(reducer.fit_transform(flat_rows))

def diff_word_w2v(w1,w2,mode = '2-norm'): # only '2-norm' is implemented
    """Return the squared Euclidean distance between two rows of ``w2v``.

    w1, w2: row indices into the global embedding table ``w2v``.
    mode: kept for signature parity with ``diff_word``; only '2-norm' is
        supported here.

    Raises ``ValueError`` for any other ``mode`` instead of silently returning
    ``None``, which would only fail later when the result is compared.
    """
    if mode != '2-norm':
        raise ValueError(f"unsupported mode {mode!r}; diff_word_w2v only implements '2-norm'")
    return torch.sum((w2v[w1] - w2v[w2])**2)

    
def get_nearest_w2v(word,mode = '2-norm',search_size = 2000):
    """Return the word closest to ``word`` in the ``w2v`` embedding table.

    word: the query word as a string.
    mode: forwarded to ``diff_word_w2v``.
    search_size: only vocabulary indices ``0 .. search_size-1`` are considered.
        The default keeps the original limit of 2000; it is clamped to the
        number of rows in ``w2v``.

    The original body called the undefined name ``diff_word_`` and raised
    ``NameError`` on first use; the distance now comes from ``diff_word_w2v``.
    The query word itself is never returned. If no candidate has a distance
    below the initial bound of 1e6, the word at index 0 is returned.
    """
    goal_w = word2index(word)
    idx = 0
    dist = 1e6
    for candidate in range(min(search_size, len(w2v))):
        if candidate == goal_w:
            continue
        # evaluate the distance once per candidate
        candidate_dist = diff_word_w2v(goal_w,candidate,mode)
        if candidate_dist < dist:
            idx = candidate
            dist = candidate_dist
    return index2word(idx)

# Test:
# w2v = word2vec(transition_matrices)
# print(get_nearest_w2v('nfl'))

yankees
