In [None]:
# Install the python-terrier package into the kernel's environment.
# No version is pinned here, so the release that gets installed depends on
# when this cell is run.
%pip install python-terrier

In [None]:
import ssl

import pyterrier as pt

# Start PyTerrier only when it has not been started yet, so re-running this
# cell in a live kernel does not call init() a second time.
if not pt.started():
    pt.init()

# Point the ssl module's HTTPS context factory back at the standard
# ssl.create_default_context. This runs after PyTerrier start-up, matching the
# original statement order.
ssl._create_default_https_context = ssl.create_default_context


In [None]:
import pandas as pd
import os
import json
import numpy as np

# Build the document collection: merge the crawled JSON files into one
# DataFrame (`docs_df`) and export copies of it as JSON and CSV under ./index.

# Directory with the crawled JSON files, resolved relative to the current
# working directory.
data_dir = os.path.abspath("../../../../chocolate_crawler/Crawled/")

# JSON files to merge into a single collection.
json_files = ['laderach.json', 'spruengli.json', 'maxchocolatier.json']

# Records gathered from every file that could be loaded.
all_data = []

for json_file in json_files:
    json_path = os.path.join(data_dir, json_file)

    print(f"Loading data from {json_path}")

    try:
        # Read as UTF-8 explicitly instead of relying on the platform's
        # default text encoding.
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        # A missing file is reported and skipped; the remaining files still load.
        print(f"File not found: {json_path}")
        continue
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        print(f"Error decoding JSON in {json_path}: {e}")
        continue

    # list.extend() on a dict would silently add its keys as "records" and
    # break the field extraction below, so only accept a JSON array here.
    if not isinstance(data, list):
        print(f"Skipping {json_path}: expected a JSON array, got {type(data).__name__}")
        continue

    all_data.extend(data)


# Document identifiers: d1, d2, ... in load order.
idx = ['d' + str(i + 1) for i in range(len(all_data))]


# Pull each field out of the records; a missing key becomes an empty string.
titles = [item.get('title', '') for item in all_data]
site = [item.get('page_link', '') for item in all_data]
image = [item.get('img_link', '') for item in all_data]
descriptions = [item.get('description', '') for item in all_data]
ingredients = [item.get('ingredients', '') for item in all_data]
allergens = [item.get('allergens', '') for item in all_data]
prices = [item.get('price', '') for item in all_data]

docs_df = pd.DataFrame({
    'docno': idx,
    'title': titles,
    'site': site,
    'img_link': image,
    'description': descriptions,
    'ingredients': ingredients,
    'allergens': allergens,
    'price': prices
})


# Same content as a list of per-document dictionaries.
data_list = docs_df.to_dict(orient='records')

# Export the collection. The target directory is created first so this cell
# also works on a fresh checkout where ./index does not exist yet.
json_file_path = './index/chocolate.json'
os.makedirs(os.path.dirname(json_file_path), exist_ok=True)

# A separate handle name keeps the `json_file` loop variable intact.
with open(json_file_path, 'w', encoding='utf-8') as out_file:
    json.dump(data_list, out_file, indent=2)

docs_df.to_csv("./index/chocolate.csv", index=False)



In [None]:
# Create a DataFrame indexer that writes to ./index; overwrite=True is passed
# so the cell can be re-run against an existing index location.
indexer = pt.DFIndexer("./index", overwrite=True)
# Index the "description" column, passing the "docno" and "title" columns
# alongside it.
# NOTE(review): presumably the extra columns are stored as document metadata;
# confirm against the PyTerrier DFIndexer documentation.
index_ref = indexer.index(docs_df["description"], docs_df["docno"], docs_df["title"])
# Show the string form of the returned index reference as the cell output.
index_ref.toString()

In [None]:
# List the contents of the index/ directory with human-readable file sizes.
!ls -lh index/

In [18]:
# Obtain an index object from the reference returned by the indexer.
index = pt.IndexFactory.of(index_ref)

# Display the Python type of the index object as the cell output.
type(index)

jnius.reflect.org.terrier.structures.Index

In [None]:
# Print the string form of the index's collection statistics.
print(index.getCollectionStatistics().toString())

In [None]:
# Walk the lexicon and print one line per entry: the key, an arrow, and the
# string form of the entry's value.
for entry in index.getLexicon():
    term = entry.getKey()
    stats_text = entry.getValue().toString()
    print("%s  -> %s " % (term, stats_text))

In [None]:
# Example single-term lookups, left disabled:
# print(index.getLexicon()["document"].toString())
# print(index.getLexicon()["first"].toString())
# print(index.getLexicon()["topic"].toString())
# print(index.getLexicon()["unknown"].toString())

# Print every lexicon key followed by the string form of its entry and a
# separator line. The entry is taken from the iterated key/value pair itself,
# so the lexicon is fetched once instead of being re-fetched and looked up by
# key on every iteration.
for kv in index.getLexicon():
  print(kv.getKey())
  print(kv.getValue().toString())
  print('**************************************************')

In [None]:
# Create a retriever over the index using the BM25 weighting model.
br = pt.BatchRetrieve(index, wmodel="BM25")
# Run a single query; the result is displayed as the cell output.
br.search('white chocolate')