In [1]:
import json
import sqlite3
from os import listdir, makedirs

import numpy as np
import pandas as pd
from datasets import load_dataset

In [3]:
srcdir = 'rsp_txt'
dstdir = 'rsp_json'

makedirs(dstdir, exist_ok=True)


def _extract_payload(text, marker='[/INST]'):
    """Return the part of a raw response that should contain the JSON answer.

    Everything up to and including the first `marker` is discarded, then an
    optional leading "```", an optional leading "json" tag and an optional
    trailing "```" are stripped.

    When `marker` is absent the whole text is used.  (`str.find` returns -1 in
    that case, and slicing at `-1 + 7` used to silently drop the first six
    characters of the response.)
    """
    _, found, tail = text.partition(marker)
    data = (tail if found else text).strip()
    if data.startswith('```'):
        data = data[3:].strip()
    if data.startswith('json'):
        data = data[len('json'):].strip()
    if data.endswith('```'):
        data = data[:-3].strip()
    return data


ids = []
ext = []

# sorted(): listdir() returns entries in arbitrary order; sorting makes the
# resulting Series reproducible from one run to the next.
for fname in sorted(listdir(srcdir)):
    # Read and close the source file before doing anything else with it.
    with open(f'{srcdir}/{fname}', encoding='utf8') as src:
        data = _extract_payload(src.read())

    try:
        parsed = json.loads(data)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        parsed = None

    ids.append(fname)
    # Only a JSON *object* counts as a success: `as_appstate` later adds a
    # 'document' key to the parsed value, which requires a dict.
    if isinstance(parsed, dict):
        with open(f'{dstdir}/{fname}.json', 'w', encoding='utf8') as dst:
            print(data, file=dst)
        ext.append(data)
    else:
        ext.append(None)

# dtype=object keeps failed entries as literal None (which `as_appstate` tests
# with `is None`) whatever dtype pandas would otherwise infer for the strings.
labels = pd.Series(ext, index=ids, dtype=object)
succ = labels.count()
fail = labels.isnull().sum()
tot  = succ + fail
print(f'ok : {succ}, fail: {fail}, tot: {tot}')

ok : 196, fail: 223, tot: 419


In [4]:
# Give the Series a name: it becomes the column name when merged into a frame.
labels = labels.rename('documentdata')
labels

000.txt                                                 None
001.txt                                                 None
002.txt                                                 None
003.txt                                                 None
004.txt    {\n  "doctype": "REMISSION",\n  "act_date": "J...
                                 ...                        
414.txt                                                 None
415.txt    {\n  "doctype": "REMISSION",\n  "act_date": "U...
661.txt    {\n  "doctype": "REMISSION",\n  "act_date": "x...
691.txt    {\n  "doctype": "REMISSION",\n  "act_date": "1...
875.txt    {\n  "doctype": "REMISSION",\n  "act_date": "1...
Name: documentdata, Length: 419, dtype: object

In [5]:
# Fetch the three splits of the 'doc_by_doc' configuration and turn each one
# into a DataFrame tagged with the split it came from.
ds = load_dataset('arch-be/brabant-xvii', name='doc_by_doc')
train = ds['train'].to_pandas().assign(subset='train')
test = ds['test'].to_pandas().assign(subset='test')
valid = ds['valid'].to_pandas().assign(subset='valid')

# Stack the splits into a single frame with a fresh 0..n-1 index.
ds = pd.concat([train, test, valid], axis='index', ignore_index=True)

# Utility column used later on to carry the labeling out.
ds['validated'] = False

# Name the index and pin the column dtypes in one go.
ds.index.name = 'id'
ds = ds.astype({
    'subset': 'category',
    'project': 'category',
    'file_id': str,
    'text': str,
})
ds

Unnamed: 0_level_0,project,file_id,text,subset,validated
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,pardons,299.txt,kaerle etc doen te wetene allen iegewoirdich e...,train,False
1,pardons,228.txt,remissie voer\ndiericken gheerits\nkaerle etc....,train,False
2,pardons,118.txt,philips etc. allen etc. saluit alsoe als tonse...,train,False
3,pardons,126.txt,philips byder gratien goidts coninck van casti...,train,False
4,pardons,373.txt,remissie van dootslage\nphilips byder gracien ...,train,False
...,...,...,...,...,...
414,pardons,012.txt,maximilian par la grace de dieu esleu empereur...,valid,False
415,pardons,099.txt,remissie\nalbert ende isabel clara eugenia etc...,valid,False
416,pardons,315.txt,kaerle etc. doen te wetene allen iegewoirdigen...,valid,False
417,pardons,300.txt,kaerle etc. doen te wetene allen iegewoirdige ...,valid,False


In [6]:
# Attach each document's extracted label: `ds.file_id` is matched against the
# index of `labels`.  'inner' is the default join and is only spelled out here.
db = ds.merge(labels, how='inner', left_on='file_id', right_index=True)
db

Unnamed: 0_level_0,project,file_id,text,subset,validated,documentdata
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,pardons,299.txt,kaerle etc doen te wetene allen iegewoirdich e...,train,False,"{\n ""doctype"": ""REMISSION"",\n ""act_date"": ""U..."
1,pardons,228.txt,remissie voer\ndiericken gheerits\nkaerle etc....,train,False,"{\n ""doctype"": ""REMISSION"",\n ""act_date"": ""U..."
2,pardons,118.txt,philips etc. allen etc. saluit alsoe als tonse...,train,False,"{\n ""doctype"": ""REMISSION"",\n ""act_date"": ""U..."
3,pardons,126.txt,philips byder gratien goidts coninck van casti...,train,False,"{\n ""doctype"": ""REMISSION"",\n ""act_date"": ""d..."
4,pardons,373.txt,remissie van dootslage\nphilips byder gracien ...,train,False,
...,...,...,...,...,...,...
414,pardons,012.txt,maximilian par la grace de dieu esleu empereur...,valid,False,
415,pardons,099.txt,remissie\nalbert ende isabel clara eugenia etc...,valid,False,"{\n ""doctype"": ""REMISSION"",\n ""act_date"": ""U..."
416,pardons,315.txt,kaerle etc. doen te wetene allen iegewoirdigen...,valid,False,
417,pardons,300.txt,kaerle etc. doen te wetene allen iegewoirdige ...,valid,False,


In [9]:
# Load the prompt text that `as_appstate` uses as the 'system' message.
with open('../prompt.txt', mode='r', encoding='utf8') as prompt_file:
    prompt = prompt_file.read()
    
def as_appstate(row, system_prompt=None):
    """Serialize one dataset row into the JSON string stored as its labeling.

    Parameters
    ----------
    row : pandas.Series (or any mapping-like row)
        Must provide `id`, `text` and `documentdata`.  `documentdata` is the
        JSON text of the extracted labels, or a missing value (None / NaN)
        when no labels are available for the row.
    system_prompt : str, optional
        Content of the 'system' message.  Defaults to the module-level
        `prompt`, which is what the original implementation always used.

    Returns
    -------
    str or None
        None when `documentdata` is missing; otherwise a JSON document with
        the keys 'id', 'document_data' (the parsed labels plus the source
        text under 'document') and 'conversation' (system / user / assistant
        messages).

    Raises
    ------
    ValueError
        If `documentdata` is present but is not valid JSON.
    """
    raw_labels = row['documentdata']
    # pd.isna also catches NaN / pd.NA: a missing value does not always
    # survive pandas operations as a literal None.
    if pd.isna(raw_labels):
        return None

    if system_prompt is None:
        system_prompt = prompt

    row_id = row['id']
    # json.dumps rejects NumPy scalars (e.g. np.int64); unwrap them first.
    if isinstance(row_id, np.generic):
        row_id = row_id.item()

    convers = [
        {'role': 'system',    'content': system_prompt},
        {'role': 'user',      'content': row['text']},
        {'role': 'assistant', 'content': raw_labels}
    ]
    docdata = json.loads(raw_labels)
    docdata['document'] = row['text']
    return json.dumps({
        'id': row_id,
        'document_data': docdata,
        'conversation': convers
    })

In [10]:
# Build the serialized labeling state for every row.
# The `id` column is added with `assign` rather than `reset_index()` so that the
# resulting Series keeps `db`'s own index.  `reset_index()` yields a fresh
# 0..n-1 index, and assigning such a Series back aligns by label: values would
# be shifted or dropped as soon as `db`'s index is not exactly 0..n-1.
labeling = db.assign(id=db.index).apply(as_appstate, axis=1)

# The raw label text now lives inside `labeling`; drop the intermediate column.
db = db.assign(labeling=labeling).drop(columns='documentdata')
db

Unnamed: 0_level_0,project,file_id,text,subset,validated,labeling
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,pardons,299.txt,kaerle etc doen te wetene allen iegewoirdich e...,train,False,"{""id"": 0, ""document_data"": {""doctype"": ""REMISS..."
1,pardons,228.txt,remissie voer\ndiericken gheerits\nkaerle etc....,train,False,"{""id"": 1, ""document_data"": {""doctype"": ""REMISS..."
2,pardons,118.txt,philips etc. allen etc. saluit alsoe als tonse...,train,False,"{""id"": 2, ""document_data"": {""doctype"": ""REMISS..."
3,pardons,126.txt,philips byder gratien goidts coninck van casti...,train,False,"{""id"": 3, ""document_data"": {""doctype"": ""REMISS..."
4,pardons,373.txt,remissie van dootslage\nphilips byder gracien ...,train,False,
...,...,...,...,...,...,...
414,pardons,012.txt,maximilian par la grace de dieu esleu empereur...,valid,False,
415,pardons,099.txt,remissie\nalbert ende isabel clara eugenia etc...,valid,False,"{""id"": 415, ""document_data"": {""doctype"": ""REMI..."
416,pardons,315.txt,kaerle etc. doen te wetene allen iegewoirdigen...,valid,False,
417,pardons,300.txt,kaerle etc. doen te wetene allen iegewoirdige ...,valid,False,


In [11]:
# A sqlite3 connection used as a context manager only wraps a transaction
# (commit on success, rollback on error); it does not close the connection.
# Close it explicitly so the file handle is released.
conn = sqlite3.connect('dataset.db')
try:
    with conn:
        # if_exists='fail' is the pandas default, spelled out on purpose: a
        # re-run must not silently overwrite an existing 'dataset' table.
        db.to_sql('dataset', conn, if_exists='fail')
finally:
    conn.close()