In [None]:
import lmdb
import json
from datetime import datetime
import numpy as np
import os
import pandas as pd
import plotly.express as px
from sentence_transformers import SentenceTransformer
import weaviate
from dotenv import load_dotenv
load_dotenv()


In [None]:
# Weaviate schema for the "Event" class. Vectors are supplied by the client
# when objects are imported, and cosine distance is used for similarity.
class_name = "Event"
class_obj = {
    "class": class_name,
    "description": "Events",
    "vectorIndexConfig": {
        "distance": "cosine",
    },
    "properties": [
        {"name": "event_id", "dataType": ["string"]},
        {"name": "created_at", "dataType": ["int"]},
        {"name": "pubkey", "dataType": ["string"]},
        {"name": "kind", "dataType": ["int"]},
        {"name": "content", "dataType": ["string"]},
        # Not stored: sig, tags
    ],
    "vectorizer": "text2vec-transformers",
}
client = weaviate.Client(
    url="http://localhost:8080",
)

# Create the class only when it is missing. This keeps the cell safe to re-run
# (create_class fails on an existing class) while guaranteeing that a fresh
# Weaviate instance gets this schema instead of an auto-generated one.
existing_classes = {
    cls["class"] for cls in (client.schema.get().get("classes") or [])
}
if class_name not in existing_classes:
    client.schema.create_class(class_obj)

In [None]:
# client.schema.delete_class('Event')

In [None]:
# Import text events from the strfry LMDB database into Weaviate, embedding
# each event's content client-side.
#
# Abridged sample of a decoded event payload:
#  'id': 'ffff726c90d177b58b7b88add1301573469b09032f1a05f51b36c761e4b3a3d8',
#  'kind': 7,
#  'pubkey': 'c90e809dedf85e516b8d350b8d8d28088388e1133b7641a45a83e513db105e35',
#  'sig': '1fcaf7a55dbb33d290061509036e216c38b63e02abaedf4e2823bd0b96698dc3bac3c06492cbb7efac784c65929d9828316d71f320b79c42d3b9d3378bae6077',
#  'tags': [['e',
#    '5198baed543e8ce75c744d3fc8676c2797f649cdde5e072885e56a662cc888d0'],
#   ['p', 'c90e809dedf85e516b8d350b8d8d28088388e1133b7641a45a83e513db105e35']]}
start_time = datetime.now()
print("Start time:", start_time.strftime("%Y-%m-%d %H:%M:%S"))

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

record_count = 0  # number of events queued for import
# Only events of these kinds are embedded and imported.
# NOTE(review): presumably the text-bearing kinds of interest; confirm the list.
text_kinds = [1, 31922, 31923]
client.batch.configure(batch_size=100)

env = lmdb.open(path=os.getenv("STRFRY_DB_FOLDER"), max_dbs=10)
try:
    payload_db = env.open_db(b"rasgueadb_defaultDb__EventPayload")
    id_db = env.open_db(b"rasgueadb_defaultDb__Event__id")

    # Leaving the batch context flushes any objects still queued.
    with client.batch as batch:
        # One read transaction covers both the id index and the payload table.
        with env.begin(db=id_db) as txn:
            for key, value in txn.cursor(db=id_db):
                pl = txn.get(value, db=payload_db)
                if pl is None:
                    # Index entry without a payload: report it and move on.
                    print(key, value)
                    continue
                # The first payload byte is skipped before JSON decoding.
                # NOTE(review): presumably a strfry payload-type prefix; confirm.
                event_json = json.loads(pl[1:].decode("utf-8"))
                if event_json["kind"] not in text_kinds:
                    continue
                emb = embedding_model.encode(event_json["content"])
                properties = {
                    "event_id": event_json["id"],
                    "created_at": event_json["created_at"],
                    "pubkey": event_json["pubkey"],
                    "kind": event_json["kind"],
                    "content": event_json["content"],
                }
                batch.add_data_object(
                    data_object=properties,
                    class_name=class_name,
                    vector=emb,
                )
                record_count += 1
finally:
    # Always release the LMDB environment so later cells can reopen it.
    env.close()

end_time = datetime.now()
print("End time:", end_time.strftime("%Y-%m-%d %H:%M:%S"))
print("Events queued for import:", record_count)

# Calculate and print the duration
duration = end_time - start_time
print("Duration:", duration)


In [None]:
# Sanity check: fetch a single stored Event together with its object id.
sample_props = ["event_id", "pubkey", "kind", "content"]
sample_query = client.query.get("Event", sample_props)
sample_query = sample_query.with_limit(1)
# To inspect the stored embedding as well, request "vector" as an additional field.
sample_query = sample_query.with_additional(["distance", "id"])
response = sample_query.do()
print(json.dumps(response, indent=4))

In [None]:
# Report how many Event objects are currently stored.
count_query = client.query.aggregate("Event").with_meta_count()
response = count_query.do()

print(json.dumps(response, indent=2))

In [None]:
# Semantic search: the single Event closest to the concept "economics".
# Optional nearText settings that are currently disabled:
#   "distance": 0.6
#   "moveAwayFrom": {"concepts": ["finance"], "force": 0.45}
#   "moveTo": {"concepts": ["haute couture"], "force": 0.85}
# NOTE(review): nearText embeds the concept on the Weaviate side, while the
# stored objects were imported with client-computed vectors; confirm both use
# the same embedding model.
near_text_config = {"concepts": ["economics"]}

semantic_query = client.query.get("Event", ["event_id", "content"])
semantic_query = semantic_query.with_near_text(near_text_config)
semantic_query = semantic_query.with_limit(1)
semantic_query = semantic_query.with_additional(["distance", "id"])
response = semantic_query.do()

print(json.dumps(response, indent=4))

In [None]:
# Fetch up to 10000 Event objects written by one pubkey, vectors included.
pubkey_value = "eab0e756d32b80bcd464f3d844b8040303075a13eabc3599a762c9ac7ab91f4f"

# Raw GraphQL template; the single %s placeholder receives the pubkey.
query_template = """
{
  Get {
    Event(
      where: {
        path: ["pubkey"]
        operator: Equal
        valueString: "%s"
      }
      limit: 10000
    ) {
      event_id
      created_at
      pubkey
      kind
      content
      _additional {
        vector
      }
    }
  }
}
"""
query = query_template % (pubkey_value,)

result = client.query.raw(query)
get_payload = result['data']['Get']
events = get_payload['Event']

In [None]:
# Average the embeddings of the fetched events into a single vector.
# An empty result would otherwise produce a NaN scalar (with a RuntimeWarning)
# that breaks later slicing, so fail early with a clear message instead.
if not events:
    raise ValueError("No events were returned; cannot compute a mean vector.")

vectors = [event['_additional']['vector'] for event in events]
vectors_array = np.array(vectors)
mean_vector = np.mean(vectors_array, axis=0)

# Optional: a DataFrame view of the vectors, indexed by event id.
# event_ids = [event['event_id'] for event in events]
# df_vec = pd.DataFrame(vectors, index=event_ids, columns=[f'feature_{i}' for i in range(len(vectors[0]))])


In [None]:
# Peek at the first five components of the averaged embedding.
mean_vector[:5]

In [None]:
# Fetch Event objects using properties that exist in the Event schema.
# The previous property name, "question", is not part of that schema, so the
# query could only come back with a GraphQL error.
response = (
    client.query
    .get("Event", ["event_id", "content"])
    .do()
)

In [None]:
def process_month(event_json):
    """Return the event's creation month as a ``YYYY-MM`` string.

    ``event_json['created_at']`` is read as a Unix timestamp and converted
    with ``datetime.fromtimestamp``, i.e. in the local time zone.
    """
    return datetime.fromtimestamp(event_json['created_at']).strftime("%Y-%m")

def process_date(event_json):
    """Return the event's creation day as an ISO ``YYYY-MM-DD`` string.

    ``event_json['created_at']`` is read as a Unix timestamp and converted
    with ``datetime.fromtimestamp``, i.e. in the local time zone.

    The ISO layout is used because its lexicographic order equals
    chronological order, so sorting the resulting strings (for example with
    ``Series.sort_index``) stays correct across month and year boundaries.
    """
    date = (
        datetime.fromtimestamp(event_json['created_at'])
        .strftime("%Y-%m-%d")
    )
    return date

def process_kind(event_json):
    """Return the value stored under the event's ``kind`` key, unchanged."""
    kind = event_json['kind']
    return kind

def process_db(process_fn):
    """Apply ``process_fn`` to every event in the strfry LMDB database.

    The database folder is taken from the ``STRFRY_DB_FOLDER`` environment
    variable. Each entry of the event-id index is resolved to its payload,
    which is JSON-decoded and passed to ``process_fn``.

    Args:
        process_fn: Callable that takes one decoded event dict and returns a
            value to collect.

    Returns:
        A list with the result of ``process_fn`` for each event, in index
        iteration order. Index entries without a payload are printed and
        skipped.
    """
    env = lmdb.open(path=os.getenv("STRFRY_DB_FOLDER"), max_dbs=10)
    try:
        payload_db = env.open_db(b"rasgueadb_defaultDb__EventPayload")
        id_db = env.open_db(b"rasgueadb_defaultDb__Event__id")
        output_list = []

        # One read transaction covers both the id index and the payload table.
        with env.begin(db=id_db) as txn:
            for key, value in txn.cursor(db=id_db):
                pl = txn.get(value, db=payload_db)
                if pl is None:
                    print(key, value)
                    continue
                # The first payload byte is skipped before JSON decoding.
                # NOTE(review): presumably a strfry payload-type prefix; confirm.
                event_json = json.loads(pl[1:].decode("utf-8"))
                output_list.append(process_fn(event_json))
        return output_list
    finally:
        # Release the environment on every call; the function opens a new one
        # each time, so leaving it open leaks handles and can block reopening.
        env.close()

In [None]:
# Bar chart: number of events created in each month.
month_list = process_db(process_month)
event_counts = pd.Series(month_list).value_counts().sort_index()

fig = px.bar(
    event_counts,
    x=event_counts.index,
    y=event_counts.values,
    labels={'x': 'Month', 'y': 'Number of Events'},
)
fig.update_layout(
    title='Number of Events by Month',
    xaxis_title='Month',
    yaxis_title='Number of Events',
)
fig.show()

In [None]:
# Bar chart: number of events created on each date.
date_list = process_db(process_date)
event_counts = pd.Series(date_list).value_counts().sort_index()

fig = px.bar(
    event_counts,
    x=event_counts.index,
    y=event_counts.values,
    labels={'x': 'Date', 'y': 'Number of Events'},
)
fig.update_layout(
    title='Number of Events by Date',
    xaxis_title='Date',
    yaxis_title='Number of Events',
)
fig.show()

In [None]:
# Bar chart: number of events per kind. Kinds are turned into strings for the
# x values passed to the chart.
kind_list = process_db(process_kind)
counts = pd.Series(kind_list).value_counts().sort_index()
kind_labels = [str(kind) for kind in counts.index]

fig = px.bar(
    counts,
    x=kind_labels,
    y=counts.values,
    labels={'x': 'kind', 'y': 'Number of Events'},
)
fig.update_layout(
    title='Number of Events by kind',
    xaxis_title='kind',
    yaxis_title='Number of Events',
)
fig.show()
# 1, 30023


In [None]:
# Timing scaffold: place the code to be measured between the two timestamps.
timestamp_format = "%Y-%m-%d %H:%M:%S"

start_time = datetime.now()
print("Start time:", start_time.strftime(timestamp_format))

end_time = datetime.now()
print("End time:", end_time.strftime(timestamp_format))

# Elapsed wall-clock time between the two timestamps.
duration = end_time - start_time
print("Duration:", duration)

In [None]:
# Count how many stored events each pubkey has authored.
pubkey_counts = {}

env = lmdb.open(path=os.getenv("STRFRY_DB_FOLDER"), max_dbs=10)
try:
    payload_db = env.open_db(b"rasgueadb_defaultDb__EventPayload")
    id_db = env.open_db(b"rasgueadb_defaultDb__Event__id")

    # A single read transaction serves both tables, so the id index and the
    # payloads are read from the same snapshot.
    with env.begin(db=id_db) as txn:
        for key, value in txn.cursor(db=id_db):
            pl = txn.get(value, db=payload_db)
            if pl is None:
                # An index entry without a payload is treated as fatal here.
                raise Exception("db corrupt!?")
            # The first payload byte is skipped before JSON decoding.
            # NOTE(review): presumably a strfry payload-type prefix; confirm.
            event_json = json.loads(pl[1:].decode("utf-8"))  # event json
            pubkey = event_json['pubkey']
            pubkey_counts[pubkey] = pubkey_counts.get(pubkey, 0) + 1
finally:
    # Always release the LMDB environment, even when the scan fails.
    env.close()

In [None]:
# List the five pubkeys with the most events, highest count first.
ranked_pubkeys = sorted(
    pubkey_counts.items(),
    key=lambda item: item[1],
    reverse=True,
)
top_pubkeys = [author for author, _ in ranked_pubkeys[:5]]

for pubkey in top_pubkeys:
    print(f"PubKey: {pubkey}, Count: {pubkey_counts[pubkey]}")