In [29]:
import os
import json
import pandas as pd

from semantic_search import *

In [17]:
# Directory holding the source files read by the make_* functions below
# ('clause-list.txt' and the 'query-clause' sub-directory).
editable_data_dir = os.path.join('data', 'editable-data')
# Directory where the generated 'clauses.json' and 'query-to-clause.jsonl'
# are written by the make_* functions and read back by the load_* functions.
dataset_dir = os.path.join('data', 'dataset')

In [18]:
def make_clauses_dict(input_dir=editable_data_dir, output_dir=dataset_dir):
    """Build the clause-id -> clause-text mapping and save it as JSON.

    Reads ``clause-list.txt`` from ``input_dir``. The file content is split
    into records on blank lines; each record must consist of exactly two
    lines: an integer clause id followed by the clause text.

    Args:
        input_dir: Directory containing ``clause-list.txt``.
        output_dir: Directory that receives ``clauses.json``. It is created
            (including parents) when it does not exist yet.

    Returns:
        dict mapping ``int`` clause ids to stripped clause strings.

    Raises:
        ValueError: If a record does not have exactly two lines, or if its
            first line cannot be parsed as an integer.

    Note:
        JSON object keys are always strings, so the ids stored in
        ``clauses.json`` are strings even though the returned dict uses ints.
    """
    with open(os.path.join(input_dir, 'clause-list.txt'), 'r') as f:
        formal_list_content = f.read()

    clause_id_dict = dict()
    for record in formal_list_content.strip().split('\n\n'):
        # Two-element unpacking doubles as a format check for the record.
        clause_id, clause = record.strip().split('\n')
        clause_id_dict[int(clause_id.strip())] = clause.strip()

    # Create the directory that is actually written to: the caller-supplied
    # output_dir, not the module-level default.
    os.makedirs(output_dir, exist_ok=True)

    with open(os.path.join(output_dir, 'clauses.json'), 'w') as f:
        json.dump(clause_id_dict, f)

    return clause_id_dict

In [19]:
def make_query_clause_dataset(clause_id_dict, input_dir=editable_data_dir, output_dir=dataset_dir):
    """Build the query/clause pair records and save them as JSON Lines.

    Every file in ``<input_dir>/query-clause`` is read. After blank lines are
    dropped, the first line is parsed as the integer clause id, the second
    line is ignored, and every remaining line is treated as one query for
    that clause.

    Args:
        clause_id_dict: Mapping from ``int`` clause id to clause text, as
            returned by ``make_clauses_dict``.
        input_dir: Directory containing the ``query-clause`` sub-directory.
            File names must be an integer followed by an extension
            (e.g. ``12.txt``); files are processed in numeric order.
        output_dir: Directory that receives ``query-to-clause.jsonl``. It is
            created (including parents) when it does not exist yet.

    Returns:
        list of dicts with the keys ``id``, ``clause_id``, ``query`` and
        ``clause``, in the order they were written to the file.

    Raises:
        ValueError: If a file name stem or a file's first non-blank line is
            not an integer.
        IndexError: If a file contains no non-blank lines.
        KeyError: If a clause id is missing from ``clause_id_dict``.
    """
    query_clause_dir = os.path.join(input_dir, 'query-clause')
    # Sort numerically on the file-name stem so that '10' follows '9'.
    files = sorted(
        os.listdir(query_clause_dir),
        key=lambda name: int(os.path.splitext(name)[0]),
    )
    query_clause_dicts = list()

    for file in files:
        with open(os.path.join(query_clause_dir, file), 'r') as f:
            query_clause_content = f.read().strip()

        stripped_lines = (line.strip() for line in query_clause_content.split('\n'))
        valid_splits = [line for line in stripped_lines if line]

        clause_id = int(valid_splits[0])
        # valid_splits[1] is deliberately unused; the clause text is looked
        # up in clause_id_dict instead.
        # NOTE(review): presumably that line repeats the clause for the
        # person editing the file - confirm against the data files.
        queries = valid_splits[2:]

        for i, query in enumerate(queries):
            query_clause_dicts.append({
                # Unique only while a clause has fewer than 100 queries.
                'id': clause_id * 100 + i,
                'clause_id': clause_id,
                'query': query,
                'clause': clause_id_dict[clause_id],
            })

    # Make the function usable on its own, without relying on
    # make_clauses_dict having created the directory beforehand.
    os.makedirs(output_dir, exist_ok=True)

    with open(os.path.join(output_dir, 'query-to-clause.jsonl'), 'w') as f:
        for query_clause_dict in query_clause_dicts:
            f.write(json.dumps(query_clause_dict) + '\n')

    return query_clause_dicts

In [20]:
def make_dataset(input_dir=editable_data_dir, output_dir=dataset_dir):
    """Generate both dataset files from the editable sources.

    Runs ``make_clauses_dict`` first and feeds its result into
    ``make_query_clause_dataset``, using the same directories for both.

    Args:
        input_dir: Directory passed to both builders as their input.
        output_dir: Directory passed to both builders as their output.

    Returns:
        Tuple ``(clause_id_dict, query_clause_dicts)`` holding the two
        builders' return values.
    """
    clauses = make_clauses_dict(input_dir, output_dir)
    pairs = make_query_clause_dataset(clauses, input_dir, output_dir)
    return clauses, pairs

In [21]:
def load_clauses_dict(dataset_dir=dataset_dir):
    """Load the clause mapping stored in ``clauses.json``.

    Args:
        dataset_dir: Directory containing ``clauses.json``.

    Returns:
        The decoded JSON content. For a JSON object the keys are strings,
        because JSON has no integer keys.
    """
    clauses_path = os.path.join(dataset_dir, 'clauses.json')
    with open(clauses_path, 'r') as clauses_file:
        return json.load(clauses_file)

In [22]:
def load_query_clause_dataset(dataset_dir=dataset_dir, as_pandas=True):
    """Load the query/clause pairs stored in ``query-to-clause.jsonl``.

    Args:
        dataset_dir: Directory containing ``query-to-clause.jsonl``.
        as_pandas: When true, return a ``pandas.DataFrame`` with one row per
            record; otherwise return a list with one decoded object per
            non-blank line.

    Returns:
        ``pandas.DataFrame`` or ``list``, depending on ``as_pandas``.
    """
    with open(os.path.join(dataset_dir, 'query-to-clause.jsonl'), 'r') as f:
        if as_pandas:
            query_clauses = pd.read_json(f, lines=True)
        else:
            # The file is JSON Lines (one document per line). json.load()
            # expects a single document and raises "Extra data" on the second
            # record, so each line is decoded separately.
            query_clauses = [json.loads(line) for line in f if line.strip()]

    return query_clauses

In [23]:
def load_dataset(dataset_dir=dataset_dir, query_clause_as_pandas=True):
    """Load both dataset files from ``dataset_dir``.

    Args:
        dataset_dir: Directory handed to ``load_clauses_dict`` and
            ``load_query_clause_dataset``.
        query_clause_as_pandas: Forwarded to ``load_query_clause_dataset``
            as its ``as_pandas`` argument.

    Returns:
        Tuple ``(clauses_dict, query_clauses)`` holding the two loaders'
        return values.
    """
    return (
        load_clauses_dict(dataset_dir),
        load_query_clause_dataset(dataset_dir, as_pandas=query_clause_as_pandas),
    )

In [27]:
# Regenerate the dataset files with the default directories, then display
# how many clauses and how many query/clause pairs were produced.
clauses_dict, query_clauses = make_dataset()
len(clauses_dict), len(query_clauses)

(45, 49)

In [28]:
# Read the generated files back from disk. With the default arguments
# query_clauses is a pandas DataFrame, replacing the list bound to the same
# name by the previous cell.
clauses_dict, query_clauses = load_dataset()
# Bare last expression so the notebook renders the DataFrame.
query_clauses

Unnamed: 0,id,clause_id,query,clause
0,100,1,Remove any major changes to house before leaving.,The tenant shall at the termination of this ag...
1,200,2,Take license if you're carrying out business.,The tenant shall himself obtain the license fo...
2,201,2,Make sure you're having a license if doing som...,The tenant shall himself obtain the license fo...
3,300,3,I won't provide any insurance or security cover.,All kinds of security arrangements insurances ...
4,400,4,Stay good with neighbors.,The tenant shall keep good relationship with n...
5,500,5,"The house is in good condition, as you agree w...",The tenant acknowledges that the premises are ...
6,600,6,"Keep the house clean, take care of all the gad...","The tenant shall, at his own expense, and at a..."
7,700,7,Please see to it that no damage is caused by y...,The tenant shall be responsible for damages ca...
8,800,8,Please keep moving and cleaning the lawns.,"The tenant shall mow, irrigate and maintain an..."
9,900,9,"You'll have to have a landline for yourself, a...",The tenant shall obtain a home telephone and m...
