In [4]:
import wandb

import json
from tqdm.notebook import tqdm
from collections import defaultdict
import zipfile
from pathlib import Path

In [40]:
# Fully qualified W&B artifact reference: entity/project/artifact_name:alias.
# NOTE: the "latest" alias is not pinned; the recorded listing output shows a v33 directory.
RAW_AT = "parambharat/wandb_docs_bot_dev/wandbot_vectorindex:latest"

In [2]:
# Look up the vector-index artifact through the W&B API client and download it.
api = wandb.Api()
artifact = api.artifact(RAW_AT, type='vectorindex')
# The value returned by download() is used below as the local directory of the files.
artifact_dir = artifact.download()

[34m[1mwandb[0m: Downloading large artifact wandbot_vectorindex:latest, 500.62MB. 12 files... 
[34m[1mwandb[0m:   12 of 12 files downloaded.  
Done. 0:0:1.4


In [3]:
# Wrap the download location in a Path and display the entries it contains.
path = Path(artifact_dir)
[entry for entry in path.iterdir()]

[PosixPath('artifacts/wandbot_vectorindex:v33/hyde_prompt.txt'),
 PosixPath('artifacts/wandbot_vectorindex:v33/datastore.json'),
 PosixPath('artifacts/wandbot_vectorindex:v33/config.json'),
 PosixPath('artifacts/wandbot_vectorindex:v33/sparse_retriever'),
 PosixPath('artifacts/wandbot_vectorindex:v33/metadata.json'),
 PosixPath('artifacts/wandbot_vectorindex:v33/dense_retriever')]

## Load data

In [5]:
# Read the document store shipped with the artifact. The encoding is pinned to
# UTF-8 so the result does not depend on the platform's default text encoding.
with open(path/'datastore.json', mode="r", encoding="utf-8") as f:
    docstore = json.load(f)

In [6]:
# Number of entries stored under the "docs" key of the document store.
len(docstore["docs"])

10868

In [7]:
# Keep only the document records; the keys of docstore["docs"] are not needed below.
docs = docstore["docs"].values()

Goal: build a map `source -> text`.

We will store it as a JSONL file, one record per line:
```
{"source":"wandb.ai/intro", "text": "The text of the webpage...."}
{"source":"wandb.ai/launch", "text": "The text of the launch...."}
...
```


Then we can load it with Hugging Face `datasets`.

In [15]:
# Group the text of every document under the source it came from.
d = defaultdict(list)
for doc in tqdm(docs, total=len(docs)):
    source = doc["extra_info"]["source"]
    text = doc["text"]
    d[source].append(text)

  0%|          | 0/10868 [00:00<?, ?it/s]

Each source now maps to a list of text chunks, so we need to concatenate them into a single string per source:

In [21]:
# Collapse each source's list of text chunks into one newline-separated string.
# The isinstance guard keeps the cell idempotent: joining an already-joined
# string again would insert a newline between every character.
for k, chunks in list(d.items()):
    if not isinstance(chunks, str):
        d[k] = "\n".join(chunks)

In [33]:
# Take the first (key, value) item of d without building the full item list.
one = next(iter(d.items()))

Each item is a `(source, text)` tuple:

In [38]:
# Show the item's type together with its first element (the source key).
type(one), one[0]

(tuple, 'https://docs.wandb.ai/quickstart')

In [32]:
from itertools import islice

def take(n, iterable):
    """Collect at most ``n`` leading items of ``iterable`` into a new list.

    Fewer than ``n`` items are returned when the iterable is exhausted first.
    """
    head = islice(iterable, n)
    return [item for item in head]

# Character count of the concatenated text for every source; preview the first ten.
lens = {source: len(d[source]) for source in d}
take(10, lens.items())

[('https://docs.wandb.ai/quickstart', 4524),
 ('https://docs.wandb.ai/tutorials/tensorflow', 5808),
 ('https://docs.wandb.ai/tutorials/huggingface', 4354),
 ('https://docs.wandb.ai/tutorials_to_tutorials', 751),
 ('https://docs.wandb.ai/tutorials/xgboost_sweeps', 7120),
 ('https://docs.wandb.ai/tutorials/tables', 7302),
 ('https://docs.wandb.ai/tutorials/lightgbm', 5923),
 ('https://docs.wandb.ai/tutorials/volcano', 5885),
 ('https://docs.wandb.ai/tutorials/pytorch', 11089),
 ('https://docs.wandb.ai/tutorials/prompts', 4873)]

Let's dump this into a JSONL file

In [47]:
# One record per source, shaped like the lines of the target JSONL file.
ds_as_list = [
    {"source": source, "text": text}
    for source, text in d.items()
]

In [48]:
# wandb.Table takes each row as a sequence of cell values in column order.
# The records are dicts, and iterating a dict yields its keys, so passing them
# directly would fill the rows with the key names instead of the values.
# Convert every record to a [source, text] list and use the `data` argument.
table_columns = ["source", "text"]
table = wandb.Table(
    data=[[row[col] for col in table_columns] for row in ds_as_list],
    columns=table_columns,
)

In [50]:
# Destination of the JSONL dump, relative to the notebook's working directory.
out_file = "data/dataset.jsonl"

In [51]:
# Write one JSON object per line (JSONL). The parent directory is created first
# so the cell also works when it does not exist yet, and the encoding is pinned
# so the file does not depend on the platform default.
Path(out_file).parent.mkdir(parents=True, exist_ok=True)
with open(out_file, 'w', encoding="utf-8") as outfile:
    for row in ds_as_list:
        outfile.write(json.dumps(row) + '\n')

In [None]:
# Record the preprocessing step as a W&B run, using the run handle returned by init.
with wandb.init(project="wizard", job_type="preprocessing") as run:
    # Declare the raw vector-index artifact as an input of this run.
    run.use_artifact(RAW_AT)

    # Log the table built from the dataset records.
    run.log({"dataset_table": table})

    # Package the JSONL file as a dataset artifact and log it.
    at = wandb.Artifact("lm_dataset", type="dataset")
    at.add_file(out_file)
    run.log_artifact(at)

[34m[1mwandb[0m: Currently logged in as: [33mcapecape[0m. Use [1m`wandb login --relogin`[0m to force relogin
