In [37]:
import os
import sys
import json
import random
import pickle
import argparse
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pprint import pprint

sys.path.append("../")

from utils.parse_arxiv import *
from keyword_extractor import *
from workloads.query_gen import *

In [6]:
# Settings for the single-paper sampling demo in the following cells.
save_path = None
k = 1
paper_num = 1  # number of rows (papers) to sample
num_queries_per_paper = 10  # queries to generate for each sampled paper

# Load the pickled table. The context manager guarantees the handle is closed
# even if unpickling raises.
# NOTE(security): pickle.load can execute arbitrary code; only load this file
# if it comes from a trusted source.
with open("../data/filtered_data.pickle", "rb") as file:
    data = pickle.load(file)
# Replace the stored index with 0..n-1 and drop the old one.
data.reset_index(drop=True, inplace=True)

# Sample rows from the data; no random_state is set, so each run differs.
samples = data.sample(paper_num)
samples

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed,cat_freq,journal_freq,date_freq
110,709.184,Francoise Heres-Renzetti,"F. Schreuder, J.-G. Bij De Vaate",Localized Lna Cooling in Vacuum,Submitted on behalf of TIMA Editions\n (http:...,Dans Proceedings of 12th International Worksho...,,,cond-mat.mtrl-sci,,In the Square Kilometre Array (SKA) telescop...,"[{'version': 'v1', 'created': 'Wed, 12 Sep 200...",2007-09-13,"[[Schreuder, F., ], [De Vaate, J. -G. Bij, ]]",37716,37.0,104


In [9]:
# Generate queries for every sampled paper, keeping a parallel list of the
# paper id each query was generated from.
# NOTE(review): `infor_prob` is not assigned in any cell above this one;
# confirm it is in scope (star import or an earlier-run cell) so the notebook
# survives Restart & Run All.

# One plain dict per sampled row.
sample_dict_list = [dict(row) for _, row in samples.iterrows()]

paper_id_list = []
queries_list = []
for d in sample_dict_list:
    query_template = QueryTemplate(infor_prob)
    # Parse the paper of THIS iteration. Passing the variable left over from
    # the row-conversion loop made every paper produce queries for the last
    # sampled row while being labelled with d["id"].
    query_template.parse_info(d)
    queries = list(query_template.generate_queries(num=num_queries_per_paper))
    queries_list.extend(queries)
    # Repeat the id once per query actually returned so both lists stay
    # aligned even if fewer than `num_queries_per_paper` come back.
    paper_id_list.extend([d["id"]] * len(queries))

20


In [10]:
# Print one line per generated query: "<query number>. <paper id>: <query>".
# The number is right-aligned in 8 columns and the paper id padded to 10.
line_format = "{:8d}. {:10}: {}"
for query_number, query in enumerate(queries_list, start=1):
    paper_id = paper_id_list[query_number - 1]
    print(line_format.format(query_number, paper_id, query))

       1.    709.184: find papers written by < J.-G.BijDeVaate and F.Schreuder > about < 6 bondwires and optimal thermal isolation and whole system > published at < Dans Proceedings of 12th International Workshop on Thermal investigations of ICs - THERMINIC 2006, Nice : France (2006) >
       2.    709.184: find papers written by < J.-G.BijDeVaate and F.Schreuder > on < cond-mat.mtrl-sci > about < necessary power since and optimal thermal isolation and whole system and necessary active area >
       3.    709.184: find papers written by < F.Schreuder > on < cond-mat.mtrl-sci > about < 61 mw and needs 6 bondwires > published at < Dans Proceedings of 12th International Workshop on Thermal investigations of ICs - THERMINIC 2006, Nice : France (2006) >
       4.    709.184: find papers from year 2007 about < noise temperature and lna locally would significantly decrease > published at < Dans Proceedings of 12th International Workshop on Thermal investigations of ICs - THERMINIC 2006, Nice 

# Workloads
- different probability configs (`cfgs/prob_<p>.json`)
    - heavy vs. light use of information from the abstract (keyword probability 0.1, 0.3, 0.5, 1.0)
- different data coverage
    - fraction of papers covered (5%, 10%, 30%, 50%, 70%)

In [23]:
# Write one probability config per keyword probability to cfgs/prob_<p>.json.
# Every field except "keywords" is fixed at 0.5; "keywords" takes the value p.

# Create the output directory first so the cell also works on a fresh
# checkout where cfgs/ does not exist yet.
os.makedirs("cfgs", exist_ok=True)

for p in [0.1, 0.3, 0.5, 1.0]:
    file_path = "cfgs/prob_{}.json".format(p)
    infor_prob = {
        "author": 0.5,
        "year": 0.5,
        "categories": 0.5,
        "keywords": p,
        "journal": 0.5,
    }

    # save prob to a json file
    with open(file_path, "w") as f:
        json.dump(infor_prob, f)

In [None]:
# Generate every workload CSV by running query_gen.py once per
# (coverage, keyword probability) combination.
# example: python query_gen.py -pn 20 -n 2 -s workload.csv

import subprocess

save_root = "../data/workloads/"
# Make sure the directory the CSV paths point into exists.
os.makedirs(save_root, exist_ok=True)

# bash script header
# NOTE(review): this header looks like a leftover from a version that only
# printed the command lines; the loop below executes the commands instead.
print("#!/bin/bash")
# cv = fraction of the papers to cover, num = queries per paper (-n).
for cv, num in zip([0.05, 0.1, 0.3, 0.5, 0.7], [20, 10, 5, 3, 2]):
    # Number of papers to draw for this coverage level (-pn).
    paper_count = int(len(data) * cv)
    for p in [0.1, 0.3, 0.5, 1.0]:
        cfg_file_path = "cfgs/prob_{}.json".format(p)
        save_csv_path = "cv{}_num{}_prob{}.csv".format(
            str(cv).replace(".", "_"), int(num), str(p).replace(".", "_")
        )
        full_save_csv_path = os.path.join(save_root, save_csv_path)
        # Pass an argument list without a shell so paths need no quoting, and
        # use the kernel's own interpreter rather than whichever `python` is
        # first on PATH.
        command = [
            sys.executable,
            "query_gen.py",
            "-pn",
            str(paper_count),
            "-n",
            str(num),
            "-s",
            full_save_csv_path,
            "--prob",
            cfg_file_path,
        ]
        out = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        print(out.stdout.decode("utf-8"))
        # Surface failures instead of silently discarding stderr, but keep
        # going so one bad combination does not abort the whole sweep.
        if out.returncode != 0:
            print(
                "query_gen.py failed (exit code {}) for {}:\n{}".format(
                    out.returncode,
                    full_save_csv_path,
                    out.stderr.decode("utf-8", errors="replace"),
                ),
                file=sys.stderr,
            )

#!/bin/bash
python query_gen.py -pn 646 -n 20 -s ../data/workloads/cv0_05_num20_prob0_1.csv --prob cfgs/prob_0.1.json
python query_gen.py -pn 646 -n 20 -s ../data/workloads/cv0_05_num20_prob0_3.csv --prob cfgs/prob_0.3.json
python query_gen.py -pn 646 -n 20 -s ../data/workloads/cv0_05_num20_prob0_5.csv --prob cfgs/prob_0.5.json
python query_gen.py -pn 646 -n 20 -s ../data/workloads/cv0_05_num20_prob1_0.csv --prob cfgs/prob_1.0.json
python query_gen.py -pn 1292 -n 10 -s ../data/workloads/cv0_1_num10_prob0_1.csv --prob cfgs/prob_0.1.json
python query_gen.py -pn 1292 -n 10 -s ../data/workloads/cv0_1_num10_prob0_3.csv --prob cfgs/prob_0.3.json
python query_gen.py -pn 1292 -n 10 -s ../data/workloads/cv0_1_num10_prob0_5.csv --prob cfgs/prob_0.5.json
python query_gen.py -pn 1292 -n 10 -s ../data/workloads/cv0_1_num10_prob1_0.csv --prob cfgs/prob_1.0.json
python query_gen.py -pn 3877 -n 5 -s ../data/workloads/cv0_3_num5_prob0_1.csv --prob cfgs/prob_0.1.json
python query_gen.py -pn 3877 -n 5 -s