## This notebook produces a listing of items requested online between March and July 2023 which are also available to view online. The listing is exported for use as a lookup table for a monthly graph of the percentage of requested items that are online (requested items online / total items requested), to investigate the hypothesis that digitisation encourages researchers to request the physical items.

## First we download a snapshot of the catalogue API

In [None]:
import requests
import json
from pathlib import Path
from tqdm.auto import tqdm
import gzip
import io

# URL of the gzip-compressed works snapshot; the download cell fetches it
# with requests.get and then decompresses it with gzip.
snapshot_url = "https://data.wellcomecollection.org/catalogue/v2/works.json.gz"

In [None]:
# Download the works snapshot into ./data and decompress it, skipping any
# step whose output already exists. Defines data_dir, file_name, zipped_path
# and unzipped_path for the cells that follow.
data_dir = Path("./data").resolve()
data_dir.mkdir(exist_ok=True)

file_name = Path(snapshot_url).parts[-1]
zipped_path = data_dir / file_name
unzipped_path = zipped_path.with_suffix("")

if not unzipped_path.exists():
    if not zipped_path.exists():
        # Download under a temporary name and rename once complete, so an
        # interrupted download is never mistaken for a finished archive.
        partial_download = zipped_path.with_name(zipped_path.name + ".part")
        with requests.get(snapshot_url, stream=True, timeout=60) as r:
            # Fail loudly instead of saving an HTTP error page as the archive.
            r.raise_for_status()
            # The header is optional; tqdm accepts total=None (unknown size).
            content_length = r.headers.get("Content-Length")
            with open(partial_download, "wb") as f:
                with tqdm(
                    unit="B",
                    total=int(content_length) if content_length else None,
                    desc=f"downloading {file_name}",
                ) as download_progress_bar:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)
                            download_progress_bar.update(len(chunk))
        partial_download.replace(zipped_path)

    # Same temporary-name approach for the decompressed output: the outer
    # `unzipped_path.exists()` check must only ever see a complete file.
    partial_unzip = unzipped_path.with_name(unzipped_path.name + ".part")
    with open(zipped_path, "rb") as raw, open(partial_unzip, "wb") as f_out:
        with gzip.GzipFile(fileobj=raw) as f_in:
            # Progress is measured in compressed bytes consumed. Seeking to
            # the end of a gzip stream to learn the uncompressed size would
            # decompress the whole archive a second time.
            with tqdm(
                unit="B",
                total=zipped_path.stat().st_size,
                desc=f"unzipping {file_name}",
            ) as unzip_progress_bar:
                compressed_bytes_seen = 0
                for chunk in iter(lambda: f_in.read(1024 * 1024), b""):
                    f_out.write(chunk)
                    position = raw.tell()
                    unzip_progress_bar.update(position - compressed_bytes_seen)
                    compressed_bytes_seen = position
    partial_unzip.replace(unzipped_path)
    zipped_path.unlink()

## Either import the ids of the requested items from a CSV file

In [None]:
#import pandas as pd
#data=pd.read_csv("data.csv")
#known_ids=data["id"].tolist()

## Or to test code without importing, here are 8 ids: 4 online, 4 not online

In [None]:
#known_ids=['p8nhsgje', 'y5wp8u7m', 'bxb3fadt', 'u3w8qbrt', 'kt2maez6', 'n4jt2jvn', 'rubmryr7', 's9fa8dnw']

In [None]:
def iterate_dataset():
    """Yield each work in the unzipped snapshot as a parsed JSON object.

    The file at ``unzipped_path`` is read one line at a time, with one JSON
    document per line, so the whole snapshot is never held in memory.

    Yields:
        The result of ``json.loads`` for every non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON is UTF-8; do not depend on the platform's default encoding.
    with open(unzipped_path, "r", encoding="utf-8") as dataset:
        for line in dataset:
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            if line.strip():
                yield json.loads(line)

# Collect every work in the snapshot whose id is one of the requested ids.
# The ids are copied into a set once so each membership test is O(1); testing
# against the list would rescan it for every work in the snapshot.
known_id_set = set(known_ids)
works = [work for work in iterate_dataset() if work["id"] in known_id_set]

In [None]:
# pandas is otherwise only imported by the optional (commented-out) CSV cell,
# so import it here to keep this cell working when the test ids are used.
import pandas as pd

# One row per matched work, one column per top-level key of the work.
df = pd.DataFrame(works)
df = df.fillna(" ")  # replace NaN values with a single space

## "availabilities" is a list of dictionaries, so it has to be unpacked before we can filter on it

In [None]:
def _has_entries(availabilities):
    """Return True if the value is a non-empty list of availability entries."""
    return isinstance(availabilities, list) and len(availabilities) > 0


def _is_online(availabilities):
    """Return True if any availability entry has the id "online"."""
    return any(
        isinstance(entry, dict) and entry.get("id") == "online"
        for entry in availabilities
    )


# Keep only works that have at least one availability entry.
df3 = df.loc[df["availabilities"].apply(_has_entries), ["id", "availabilities"]].copy()

# Look at every entry, not just the first: a work can be listed in more than
# one place, and "online" is not guaranteed to be the first entry.
df3["is_it_digitised?"] = df3["availabilities"].apply(_is_online)
df4 = df3[["id", "is_it_digitised?"]]

In [None]:
# Report how many of the requested ids were found in the snapshot.
requested_total = len(df)
print("The number of unique items requested is", requested_total)

In [None]:
# Count the rows flagged as digitised and report the total.
is_digitised = df4["is_it_digitised?"].eq(True)
digitised_total = int(is_digitised.sum())
print("The number of unique digitised items requested is", digitised_total)

In [None]:
# Write the id / digitised lookup table to a CSV file in the working directory.
lookup_csv_name = "df4.csv"
df4.to_csv(lookup_csv_name)

## [Here is a link to the graph of the monthly data.](https://wellcomecloud.sharepoint.com/:x:/r/sites/wcdigitalexperience/Shared%20Documents/Analytics/Q4%20202223/10113%20test%20request%20data.xlsx?d=w9de5dddcfdd745a18aa7e4755193e9ee&csf=1&web=1&e=ogn3eT)