In [1]:
import json
import sqlite3
from contextlib import closing
from os import listdir, makedirs

import numpy as np
import pandas as pd
from datasets import load_dataset

In [3]:
srcdir = 'rsp_txt'
dstdir = 'rsp_json'

makedirs(dstdir, exist_ok=True)


def _extract_payload(text, marker="[/INST]"):
    """Return the part of `text` after the first `marker`, without its code fence.

    A leading ``` (optionally followed by the word 'json') and a trailing ```
    are removed. When `marker` does not occur, the whole text is used:
    `str.find` would return -1 in that case and slicing from `-1 + len(marker)`
    would silently drop the first six characters of the response.
    """
    _, found, tail = text.partition(marker)
    data = (tail if found else text).strip()
    if data.startswith('```'):
        data = data[3:].strip()
    if data.startswith('json'):
        data = data[len('json'):].strip()
    if data.endswith('```'):
        data = data[:-3].strip()
    return data


ids = []
ext = []

# sorted() makes the processing order (and therefore the order of `labels`)
# independent of the order in which the OS happens to list the directory.
for fname in sorted(listdir(srcdir)):
    with open(f'{srcdir}/{fname}', encoding='utf8') as src:
        data = _extract_payload(src.read())
    try:
        # only used as a validity check: the raw text is what gets stored
        json.loads(data)
    except ValueError:
        # not parseable as JSON: keep the file in the index, without a label
        data = None
    else:
        # distinct handle name: the source handle is no longer shadowed
        with open(f'{dstdir}/{fname}.json', 'w', encoding='utf8') as dst:
            print(data, file=dst)
    ids.append(fname)
    ext.append(data)

# one entry per source file: the cleaned JSON text, or None when parsing failed
labels = pd.Series(ext, index=ids)
succ = labels.count()
fail = labels.isnull().sum()
tot  = succ + fail
print(f'ok : {succ}, fail: {fail}, tot: {tot}')

ok : 196, fail: 223, tot: 419


In [4]:
# Name the series so that it yields a 'documentdata' column once merged into a frame.
labels = labels.rename('documentdata')
labels

000.txt                                                 None
001.txt                                                 None
002.txt                                                 None
003.txt                                                 None
004.txt    {\n  "doctype": "REMISSION",\n  "act_date": "J...
                                 ...                        
414.txt                                                 None
415.txt    {\n  "doctype": "REMISSION",\n  "act_date": "U...
661.txt    {\n  "doctype": "REMISSION",\n  "act_date": "x...
691.txt    {\n  "doctype": "REMISSION",\n  "act_date": "1...
875.txt    {\n  "doctype": "REMISSION",\n  "act_date": "1...
Name: documentdata, Length: 419, dtype: object

In [5]:
# Load the 'doc_by_doc' configuration of the corpus; everything below depends on it.
ds = load_dataset('arch-be/brabant-xvii', name='doc_by_doc')

# One dataframe per split, each tagged with the split it comes from.
train = ds['train'].to_pandas().assign(subset='train')
test = ds['test'].to_pandas().assign(subset='test')
valid = ds['valid'].to_pandas().assign(subset='valid')

# Stack the three splits into one frame with a fresh 0..n-1 index.
ds = pd.concat([train, test, valid], axis='index', ignore_index=True)
# Utility column, False for every document at this stage.
ds['validated'] = False
# Name the index and pin the column dtypes explicitly.
ds.index.name = 'id'
ds = ds.astype({
    'subset': 'category',
    'project': 'category',
    'file_id': str,
    'text': str,
})
ds

Unnamed: 0_level_0,project,file_id,text,subset,validated
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,pardons,299.txt,kaerle etc doen te wetene allen iegewoirdich e...,train,False
1,pardons,228.txt,remissie voer\ndiericken gheerits\nkaerle etc....,train,False
2,pardons,118.txt,philips etc. allen etc. saluit alsoe als tonse...,train,False
3,pardons,126.txt,philips byder gratien goidts coninck van casti...,train,False
4,pardons,373.txt,remissie van dootslage\nphilips byder gracien ...,train,False
...,...,...,...,...,...
414,pardons,012.txt,maximilian par la grace de dieu esleu empereur...,valid,False
415,pardons,099.txt,remissie\nalbert ende isabel clara eugenia etc...,valid,False
416,pardons,315.txt,kaerle etc. doen te wetene allen iegewoirdigen...,valid,False
417,pardons,300.txt,kaerle etc. doen te wetene allen iegewoirdige ...,valid,False


In [6]:
# Attach each document's label by matching its 'file_id' column against the
# index of `labels` (the response file names).
db = ds.merge(labels, left_on='file_id', right_index=True)
db

Unnamed: 0_level_0,project,file_id,text,subset,validated,documentdata
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,pardons,299.txt,kaerle etc doen te wetene allen iegewoirdich e...,train,False,"{\n ""doctype"": ""REMISSION"",\n ""act_date"": ""U..."
1,pardons,228.txt,remissie voer\ndiericken gheerits\nkaerle etc....,train,False,"{\n ""doctype"": ""REMISSION"",\n ""act_date"": ""U..."
2,pardons,118.txt,philips etc. allen etc. saluit alsoe als tonse...,train,False,"{\n ""doctype"": ""REMISSION"",\n ""act_date"": ""U..."
3,pardons,126.txt,philips byder gratien goidts coninck van casti...,train,False,"{\n ""doctype"": ""REMISSION"",\n ""act_date"": ""d..."
4,pardons,373.txt,remissie van dootslage\nphilips byder gracien ...,train,False,
...,...,...,...,...,...,...
414,pardons,012.txt,maximilian par la grace de dieu esleu empereur...,valid,False,
415,pardons,099.txt,remissie\nalbert ende isabel clara eugenia etc...,valid,False,"{\n ""doctype"": ""REMISSION"",\n ""act_date"": ""U..."
416,pardons,315.txt,kaerle etc. doen te wetene allen iegewoirdigen...,valid,False,
417,pardons,300.txt,kaerle etc. doen te wetene allen iegewoirdige ...,valid,False,


In [9]:
# Read the whole prompt file into memory as a single string.
with open('../prompt.txt', encoding='utf8') as prompt_file:
    prompt = prompt_file.read()
    
def as_appstate(row, system_prompt=None):
    """Serialise one dataset row into the JSON string describing its labeling state.

    Parameters
    ----------
    row : object exposing ``id``, ``text`` and ``documentdata`` attributes
        ``documentdata`` is the JSON text of the label, or a missing value.
    system_prompt : str, optional
        Content of the 'system' message. Defaults to the notebook-level
        ``prompt`` variable when not given.

    Returns
    -------
    str or None
        ``None`` when ``documentdata`` is missing (None, NaN or pd.NA).
        Otherwise a JSON object with the keys 'id', 'document_data' (the parsed
        ``documentdata`` plus a 'document' entry holding ``row.text``) and
        'conversation' (the system / user / assistant messages).

    Raises
    ------
    ValueError
        If ``documentdata`` is present but is not valid JSON.
    """
    raw_label = row.documentdata
    # pd.isna also catches NaN / pd.NA, which `is None` lets through and which
    # would then crash json.loads with a TypeError.
    if pd.isna(raw_label):
        return None

    if system_prompt is None:
        system_prompt = prompt

    # numpy scalars (e.g. np.int64) are not JSON serialisable: unwrap them
    doc_id = row.id
    if hasattr(doc_id, 'item'):
        doc_id = doc_id.item()

    convers = [
        {'role': 'system',    'content': system_prompt},
        {'role': 'user',      'content': row.text},
        {'role': 'assistant', 'content': raw_label}
    ]
    docdata = json.loads(raw_label)
    docdata['document'] = row.text
    return json.dumps({
        'id': doc_id,
        'document_data': docdata,
        'conversation': convers
    })

In [10]:
# as_appstate reads the document id from a regular 'id' column, hence reset_index().
# reset_index() relabels the rows 0..n-1, so the result is assigned by position
# (to_numpy) instead of by index label: label alignment would shift or drop
# values whenever db's index is not exactly 0..n-1 (e.g. rows lost in the merge).
db['labeling'] = db.reset_index().apply(as_appstate, axis=1).to_numpy()
# the raw label text is now embedded in 'labeling'
del db['documentdata']
db

Unnamed: 0_level_0,project,file_id,text,subset,validated,labeling
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,pardons,299.txt,kaerle etc doen te wetene allen iegewoirdich e...,train,False,"{""id"": 0, ""document_data"": {""doctype"": ""REMISS..."
1,pardons,228.txt,remissie voer\ndiericken gheerits\nkaerle etc....,train,False,"{""id"": 1, ""document_data"": {""doctype"": ""REMISS..."
2,pardons,118.txt,philips etc. allen etc. saluit alsoe als tonse...,train,False,"{""id"": 2, ""document_data"": {""doctype"": ""REMISS..."
3,pardons,126.txt,philips byder gratien goidts coninck van casti...,train,False,"{""id"": 3, ""document_data"": {""doctype"": ""REMISS..."
4,pardons,373.txt,remissie van dootslage\nphilips byder gracien ...,train,False,
...,...,...,...,...,...,...
414,pardons,012.txt,maximilian par la grace de dieu esleu empereur...,valid,False,
415,pardons,099.txt,remissie\nalbert ende isabel clara eugenia etc...,valid,False,"{""id"": 415, ""document_data"": {""doctype"": ""REMI..."
416,pardons,315.txt,kaerle etc. doen te wetene allen iegewoirdigen...,valid,False,
417,pardons,300.txt,kaerle etc. doen te wetene allen iegewoirdige ...,valid,False,


In [11]:
# A sqlite3 connection used as a context manager only commits / rolls back the
# transaction; it does not close the connection. closing() releases the handle.
with closing(sqlite3.connect('dataset.db')) as conn:
    with conn:
        # if_exists='fail' is the pandas default, spelled out on purpose: an
        # existing 'dataset' table is never overwritten by re-running this cell.
        db.to_sql('dataset', conn, if_exists='fail')

In [17]:
# closing() makes sure the connection is released: sqlite3's own context
# manager only handles the transaction and leaves the connection open.
with closing(sqlite3.connect('dataset.db')) as conn:
    df = pd.read_sql("select * from dataset", conn)

df

Unnamed: 0,id,project,file_id,text,subset,validated,labeling
0,0,pardons,299.txt,kaerle etc doen te wetene allen iegewoirdich e...,train,0,"{""id"": 0, ""document_data"": {""doctype"": ""REMISS..."
1,1,pardons,228.txt,remissie voer\ndiericken gheerits\nkaerle etc....,train,0,"{""id"": 1, ""document_data"": {""doctype"": ""REMISS..."
2,2,pardons,118.txt,philips etc. allen etc. saluit alsoe als tonse...,train,0,"{""id"": 2, ""document_data"": {""doctype"": ""REMISS..."
3,3,pardons,126.txt,philips byder gratien goidts coninck van casti...,train,0,"{""id"": 3, ""document_data"": {""doctype"": ""REMISS..."
4,4,pardons,373.txt,remissie van dootslage\nphilips byder gracien ...,train,0,
...,...,...,...,...,...,...,...
414,414,pardons,012.txt,maximilian par la grace de dieu esleu empereur...,valid,0,
415,415,pardons,099.txt,remissie\nalbert ende isabel clara eugenia etc...,valid,0,"{""id"": 415, ""document_data"": {""doctype"": ""REMI..."
416,416,pardons,315.txt,kaerle etc. doen te wetene allen iegewoirdigen...,valid,0,
417,417,pardons,300.txt,kaerle etc. doen te wetene allen iegewoirdige ...,valid,0,


In [18]:
# Rows that have no labeling payload yet.
todo = df[df['labeling'].isna()]
# One placeholder string "coucou <id>" per such row, named after the column it targets.
ll = pd.Series(
    [f"coucou {doc_id}" for doc_id in todo['id']],
    index=todo.index,
    name='labeling',
)

# DataFrame.update aligns on the index and modifies df in place.
df.update(ll)
df

Unnamed: 0,id,project,file_id,text,subset,validated,labeling
0,0,pardons,299.txt,kaerle etc doen te wetene allen iegewoirdich e...,train,0,"{""id"": 0, ""document_data"": {""doctype"": ""REMISS..."
1,1,pardons,228.txt,remissie voer\ndiericken gheerits\nkaerle etc....,train,0,"{""id"": 1, ""document_data"": {""doctype"": ""REMISS..."
2,2,pardons,118.txt,philips etc. allen etc. saluit alsoe als tonse...,train,0,"{""id"": 2, ""document_data"": {""doctype"": ""REMISS..."
3,3,pardons,126.txt,philips byder gratien goidts coninck van casti...,train,0,"{""id"": 3, ""document_data"": {""doctype"": ""REMISS..."
4,4,pardons,373.txt,remissie van dootslage\nphilips byder gracien ...,train,0,coucou 4
...,...,...,...,...,...,...,...
414,414,pardons,012.txt,maximilian par la grace de dieu esleu empereur...,valid,0,coucou 414
415,415,pardons,099.txt,remissie\nalbert ende isabel clara eugenia etc...,valid,0,"{""id"": 415, ""document_data"": {""doctype"": ""REMI..."
416,416,pardons,315.txt,kaerle etc. doen te wetene allen iegewoirdigen...,valid,0,coucou 416
417,417,pardons,300.txt,kaerle etc. doen te wetene allen iegewoirdige ...,valid,0,coucou 417


In [19]:
# closing() releases the connection; sqlite3's own context manager would only
# commit / roll back and leave it open.
with closing(sqlite3.connect('dataset.db')) as conn:
    df = pd.read_sql("select * from dataset", conn)
# the export does not need the database any more
df.to_csv('dataset.csv')

In [21]:
# Read the CSV export back with pandas' default parsing options.
d2 = pd.read_csv(filepath_or_buffer="dataset.csv")

In [25]:
# df was re-read from dataset.db in In [19], so the in-memory 'coucou'
# placeholders written by df.update in In [18] are no longer present.
df

Unnamed: 0,id,project,file_id,text,subset,validated,labeling
0,0,pardons,299.txt,kaerle etc doen te wetene allen iegewoirdich e...,train,0,"{""id"": 0, ""document_data"": {""doctype"": ""REMISS..."
1,1,pardons,228.txt,remissie voer\ndiericken gheerits\nkaerle etc....,train,0,"{""id"": 1, ""document_data"": {""doctype"": ""REMISS..."
2,2,pardons,118.txt,philips etc. allen etc. saluit alsoe als tonse...,train,0,"{""id"": 2, ""document_data"": {""doctype"": ""REMISS..."
3,3,pardons,126.txt,philips byder gratien goidts coninck van casti...,train,0,"{""id"": 3, ""document_data"": {""doctype"": ""REMISS..."
4,4,pardons,373.txt,remissie van dootslage\nphilips byder gracien ...,train,0,
...,...,...,...,...,...,...,...
414,414,pardons,012.txt,maximilian par la grace de dieu esleu empereur...,valid,0,
415,415,pardons,099.txt,remissie\nalbert ende isabel clara eugenia etc...,valid,0,"{""id"": 415, ""document_data"": {""doctype"": ""REMI..."
416,416,pardons,315.txt,kaerle etc. doen te wetene allen iegewoirdigen...,valid,0,
417,417,pardons,300.txt,kaerle etc. doen te wetene allen iegewoirdige ...,valid,0,


In [34]:
# Re-export with 'id' as the index column and missing values written as 'null'.
df_by_id = df.set_index('id')
df_by_id.to_csv("dataset.csv", na_rep='null')

In [51]:
# Reload the export, indexed by 'id', parsing the listed markers as missing values.
d2 = pd.read_csv(
    "dataset.csv",
    index_col='id',
    na_values=['nan', 'null', 'none'],
)
# Show the rows that still have no labeling.
d2.loc[d2['labeling'].isna()]

Unnamed: 0_level_0,project,file_id,text,subset,validated,labeling
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4,pardons,373.txt,remissie van dootslage\nphilips byder gracien ...,train,0,
6,pardons,187.txt,remissie van dootslage voer hanneken de huecke...,train,0,
8,pardons,280.txt,kaerle etc. doen te wete allen iegewoirdich en...,train,0,
10,pardons,067.txt,kaerle etc allen den ghenen er een hebben ontf...,train,0,
13,pardons,357.txt,kaerle etc doen te weten allen tegenwoirdich e...,train,0,
...,...,...,...,...,...,...
411,pardons,399.txt,kaerle etc. doen te weeten allen tegenwoirdich...,valid,0,
412,pardons,125.txt,kaerle etc doen conde allen luden nu ende nama...,valid,0,
414,pardons,012.txt,maximilian par la grace de dieu esleu empereur...,valid,0,
416,pardons,315.txt,kaerle etc. doen te wetene allen iegewoirdigen...,valid,0,


In [45]:
# IPython automagic: lists the working directory (the captured output is a Windows `dir` listing).
ls

 Le volume dans le lecteur C s'appelle OS
 Le numéro de série du volume est 5264-0A5B

 Répertoire de C:\Users\xavier.gillard\Documents\REPO\ai-rchivist\prepa-db

04-07-24  11:51    <DIR>          .
03-07-24  17:30    <DIR>          ..
03-07-24  22:38    <DIR>          .ipynb_checkpoints
04-07-24  10:22         8 462 023 dataset.csv
03-07-24  22:33         8 720 384 dataset.db
04-07-24  11:51         8 461 603 dataset_up.csv
04-07-24  10:26            60 015 prepa-db.ipynb
03-07-24  18:04    <DIR>          rsp_json
03-07-24  17:33    <DIR>          rsp_txt
               4 fichier(s)       25 704 025 octets
               5 Rép(s)  107 627 024 384 octets libres


In [2]:
# NOTE(review): dataset_up.csv is not written by any cell of this notebook —
# presumably an updated export produced elsewhere; confirm its provenance.
up = pd.read_csv(
    "dataset_up.csv",
    index_col="id",
    na_values="null",
)

In [63]:
# Summary statistics of the character length of the non-missing labeling strings.
up['labeling'].str.len().describe()

count       199.000000
mean      25768.773869
std       29158.072900
min       11728.000000
25%       20561.500000
50%       22734.000000
75%       24979.000000
max      408911.000000
Name: labeling, dtype: float64

In [5]:
# Print the text of document 416, looked up among the rows without labeling.
unlabeled = up[up['labeling'].isna()]
print(unlabeled.loc[416, 'text'])

kaerle etc. doen te wetene allen iegewoirdigen ende
toecommende dat wy ontfangen hebben die oetmoedige
supplicatie van gories peeters woenende tot tinnen in
onsen voirs. lande van namen vuerende aldaer die peerden
ende labuererende die lande in knaepscape inhoudende hoe
dat nu in septembri lestleden oft daerontrint een jaer geleden
is dat tot cumptich kermisse wesende de suppliant ende een
geheeten quinten vander werden zyn naeden noen maeltyt
tsamen gegaen ten vrient huyse te wetene ten huyse van henric
van lanen aldaer die suppliant ende die voirs. quinten met
meer andere geselscape hebben sitten drincken zonder dat die
suppliant metten voirs. quinten eenige woirden gehadt hadde
dat die suppliant sprekende metten voirs. quinten eenigen
woirden gehadt hadde dat die suppliant sprekende met den voirs.
quinten over tafele heeft geseyt dat die voirs. quinten beter
gevaren hadde dan hy suppliant aengemerct dat hy quinten
als ruytere oft voetknecht ontfangen hadde twee philips gulden
ende d

In [68]:
# Summary statistics of the text length, restricted to the rows that have a labeling.
up.loc[up['labeling'].notna(), 'text'].str.len().describe()

count       199.000000
mean       8512.934673
std       14207.604037
min        2129.000000
25%        6526.500000
50%        7199.000000
75%        7990.000000
max      196197.000000
Name: text, dtype: float64

In [None]:
# Display the frame loaded from dataset_up.csv.
up