In [19]:
import json
import pickle
import numpy as np
import pandas as pd
import os
from multiprocessing import Process
import kenlm

# KenLM Baseline

## Prepare Data For KenLM

In [4]:
# Root folders of the two dataset variants used throughout this notebook.
path550 = "/data/rali7/Tmp/solimanz/data/datasets/top550/"
path7k = "/data/rali7/Tmp/solimanz/data/datasets/reduced7000/"

# Each variant keeps its data dictionary at <root>/jobid/data.json.
with open(os.path.join(path550, "jobid", "data.json"), mode="r") as f:
    data550 = json.loads(f.read())

with open(os.path.join(path7k, "jobid", "data.json"), mode="r") as f:
    data7k = json.loads(f.read())

In [5]:
def prepare_KenLM(train_data, dataset):
    """Write the training sequences to a plain-text corpus file.

    Every record of ``train_data`` contributes one line: the items of
    ``record[1]`` converted with ``str`` and separated by single spaces.
    Lines are joined with ``"\\n"`` (no trailing newline) and written in one
    go to ``.../datasets/ngrams/train_<dataset>.txt``.

    Args:
        train_data: iterable of indexable records whose element ``[1]`` is an
            iterable of tokens/ids.
        dataset: suffix used in the output file name (e.g. ``'7k'``).
    """
    sentences = []
    for record in train_data:
        sentences.append(" ".join(map(str, record[1])))
    corpus = "\n".join(sentences)

    with open(f"/data/rali7/Tmp/solimanz/data/datasets/ngrams/train_{dataset}.txt", "w") as out:
        out.write(corpus)

In [6]:
# Export the training corpus as text for n-gram model building.
# Only the 7k corpus is written on a run of this cell; the top-550 export below is disabled.
# NOTE(review): the *550 binaries used later were presumably built from an earlier export — confirm.
#prepare_KenLM(data550['train_data'], '550')
prepare_KenLM(data7k['train_data'], '7k')

In [12]:
# Number of candidate labels per dataset, taken as the size of each dataset's
# 'title_to_id' entry; passed as `n_labels` to run_KenLM_multi below.
n_7k = len(data7k['title_to_id'])
n_550 = len(data550['title_to_id'])

In [14]:
def kenlm_predict(test, n_labels, n=2, model_binary='bigram550', top_k=10):
    """Rank every candidate label as the next token of each test sequence.

    For each sequence, the last ``n - 1`` tokens are kept as context, each
    label id in ``range(n_labels)`` is appended in turn (as a string), and the
    resulting space-separated sentence is scored with the KenLM model loaded
    from ``../../kenlm/build/<model_binary>.binary``.

    Args:
        test: sequence of token sequences; tokens must already be strings.
        n_labels: number of candidate labels; candidates are ``"0"`` ..
            ``str(n_labels - 1)``.
        n: n-gram order used to size the context window (``n - 1`` tokens).
            ``n == 1`` means no context at all.
        model_binary: file stem of the KenLM binary to load.
        top_k: how many best-scoring labels to keep per sequence (default 10,
            the previously hard-coded value).

    Returns:
        Integer ``numpy.ndarray`` of shape ``(len(test), min(top_k, n_labels))``
        whose rows list label ids ordered by decreasing score.

    Raises:
        ValueError: if ``n < 1`` or ``top_k < 0``.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")
    if top_k < 0:
        raise ValueError(f"top_k must be >= 0, got {top_k}")

    model = kenlm.Model(f"../../kenlm/build/{model_binary}.binary")
    model_preds = np.zeros((len(test), n_labels))

    # The candidate strings do not depend on the sequence, so build them once.
    candidates = [str(label) for label in range(n_labels)]
    context_len = n - 1

    for j, seq in enumerate(test):
        # seq[-0:] would return the *whole* sequence, so the unigram case
        # (context_len == 0) must be handled explicitly.
        context = list(seq[-context_len:]) if context_len else []
        for i, candidate in enumerate(candidates):
            # NOTE(review): score() is called with its default arguments; in the
            # kenlm binding these presumably add sentence-boundary tokens —
            # confirm that is the intended scoring for next-token ranking.
            model_preds[j, i] = model.score(" ".join(context + [candidate]))

    # Negate so that argsort (ascending) yields the highest scores first.
    return (-model_preds).argsort(axis=1)[:, :top_k]

In [15]:
def worker(test_data, n_labels, n, model):
    """Run one KenLM model over ``test_data`` and save its ranked predictions.

    Forwards all arguments to ``kenlm_predict`` (``model`` is used as the
    binary name) and stores the returned array with ``np.save`` under
    ``.../ngram_preds/<model>.npy``. Returns nothing.
    """
    ranking = kenlm_predict(test_data, n_labels, n, model)
    destination = f'/data/rali7/Tmp/solimanz/data/ngram_preds/{model}.npy'
    np.save(destination, ranking)

In [16]:
def run_KenLM_multi(models, data, n_labels, wait=False):
    """Start one ``worker`` process per KenLM model.

    The test inputs are built from ``data["test_data"]``: for every record,
    all items of ``record[1]`` except the last are converted to strings.
    The same list is handed to every process.

    Args:
        models: iterable of ``(n, model_name)`` pairs, forwarded to ``worker``.
        data: dictionary with a ``"test_data"`` entry of indexable records.
        n_labels: number of candidate labels, forwarded to ``worker``.
        wait: when True, block until every started process has finished.
            Defaults to False, which keeps the original fire-and-forget
            behaviour.

    Returns:
        The list of started ``Process`` objects, in the order of ``models``,
        so callers can ``join()`` them or inspect ``exitcode``. Previously the
        handles were collected and then dropped.
    """
    test_data = [[str(tok) for tok in record[1][:-1]] for record in data["test_data"]]

    jobs = []
    for n, model in models:
        proc = Process(target=worker, args=(test_data, n_labels, n, model))
        proc.start()
        jobs.append(proc)

    if wait:
        for proc in jobs:
            proc.join()

    return jobs

In [17]:
# (n-gram order, binary-name prefix) pairs shared by both datasets; the dataset
# suffix ('550' or '7k') is appended to obtain the KenLM binary name.
_NGRAM_PREFIXES = ((2, 'bigram'), (3, 'trigram'), (4, '4gram'), (5, '5gram'))

models_550 = [(order, f"{prefix}550") for order, prefix in _NGRAM_PREFIXES]

models_7k = [(order, f"{prefix}7k") for order, prefix in _NGRAM_PREFIXES]

In [None]:
# Start one process per top-550 model; each one saves its ranked predictions
# to .../ngram_preds/<model>.npy via `worker`.
run_KenLM_multi(models_550, data550, n_550)

In [20]:
# Start one process per 7k model; each one saves its ranked predictions
# to .../ngram_preds/<model>.npy via `worker`.
run_KenLM_multi(models_7k, data7k, n_7k)

### Save Models

In [None]:
# Two independent, initially empty dictionaries; these are the objects pickled
# below (nothing in the visible cells populates them).
res550, res7k = {}, {}

In [None]:
# Serialize the top-550 dictionary to a binary pickle file.
with open('/data/rali7/Tmp/solimanz/data/pickles/res550.pkl', 'wb') as f:
    pickle.dump(res550, f)

In [None]:
# Serialize the 7k dictionary to a binary pickle file.
with open('/data/rali7/Tmp/solimanz/data/pickles/res7k.pkl', 'wb') as f:
    pickle.dump(res7k, f)