In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import lmdb
import json
from datetime import datetime, timezone
import numpy as np
import os
import pandas as pd
import plotly.express as px
from sentence_transformers import SentenceTransformer
import weaviate
from dotenv import load_dotenv
load_dotenv()

# Connect to the Weaviate instance at http://localhost:8080. `client` is the
# handle the cells below use for schema, batch and query calls.
client = weaviate.Client(
    url="http://localhost:8080",
)


In [None]:
os.getenv("STRFRY_DB_FOLDER")

In [48]:
from purple_py.strfry import read_strfy_db, get_content_for_embeddings, query_db_for_record, create_weaviate_record

In [49]:
# Step 1 (candidate for a future `vectorize_content` helper):
# run read_strfy_db with get_content_for_embeddings as the per-record callback,
# starting from empty id/content lists, then embed the collected contents locally.
# NOTE(review): presumably read_strfy_db fills and returns the dict passed as
# `process_output` -- confirm against purple_py.strfry.
process_output = read_strfy_db(
    client,
    process_fn=get_content_for_embeddings,
    process_output={"event_id_list": [], "content_list": []},
)

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
process_output["embedding_list"] = embedding_model.encode(
    process_output["content_list"]
)

In [50]:
# Step 2: start with an empty pubkey registry, then pass the accumulated
# process_output to query_db_for_record with create_weaviate_record as callback.
process_output["pubkey_dict"] = dict()
process_output = query_db_for_record(
    process_input=process_output,
    process_fn=create_weaviate_record,
    client=client,
)

In [None]:
# def load_data_into_weaviate

In [None]:
# Sanity check: ids, contents and embeddings should line up one-to-one.
# The bare names event_id_list / content_list / embedding_list are only created
# by later cells, so referencing them here fails on a fresh kernel; the cells
# above store these lists inside `process_output`.
# NOTE(review): assumes query_db_for_record returns a mapping that still carries
# these three keys -- confirm against purple_py.strfry.
(
    len(process_output["event_id_list"]),
    len(process_output["content_list"]),
    len(process_output["embedding_list"]),
)

In [46]:

# Schema for the "User" class: one object per pubkey, cross-referenced to the
# Event objects it created.
# NOTE(review): `hasCreated` points at the "Event" class, which this notebook
# creates in a later cell -- confirm the server accepts the forward reference,
# or create Event first / add this property afterwards.
user_class_obj = {
    "class": "User",
    "description": "Users",
    "vectorIndexType": "hnsw",
    "vectorIndexConfig": {
        "skip": True,  # don't need to vector index users
    },
    "properties": [
        { "name": "pubkey", "dataType": ["text"]},
        { "name": "name", "dataType": ["text"]},
        { "name": "hasCreated", "dataType": ["Event"]}  # cross-reference
    ],
    # Vectorization is disabled with the string "none". Python None is sent as
    # JSON null, which leaves the field unset so the server applies its default
    # vectorizer module instead of turning vectorization off.
    "vectorizer": "none",
}
# client.schema.delete_class('User')
client.schema.create_class(user_class_obj)

In [47]:
def _event_property(name, data_type, vectorize):
    """Build one Event property entry with its text2vec-transformers settings.

    `vectorize` decides whether the property value is fed to the vectorizer;
    the property name itself is never vectorized.
    """
    return {
        "name": name,
        "dataType": [data_type],
        "moduleConfig": {
            "text2vec-transformers": {
                "skip": not vectorize,
                "vectorizePropertyName": False
            }
        }
    }


# Schema for the "Event" class: cosine-distance HNSW index, English stopwords,
# and only `content` contributing to the text2vec-transformers vector.
event_class_obj = {
    "class": "Event",
    "description": "Events",
    "vectorIndexType": "hnsw",
    "vectorIndexConfig": {"distance": "cosine"},
    "invertedIndexConfig": {"stopwords": {"preset": "en"}},
    "properties": [
        _event_property("event_id", "text", vectorize=False),
        _event_property("created_at", "date", vectorize=False),
        _event_property("pubkey", "text", vectorize=False),
        _event_property("kind", "int", vectorize=False),
        _event_property("content", "text", vectorize=True),
        # tags
    ],
    "vectorizer": "text2vec-transformers",
    'moduleConfig': {
        'text2vec-transformers': {'vectorizeClassName': False}
    }
}
# client.schema.delete_class('Event')
client.schema.create_class(event_class_obj)

In [None]:
# Fetch the Event object(s) whose `event_id` property equals `event_id`.
# NOTE: `event_id` is not assigned in this cell; it has to be left over from
# another cell (hidden kernel state).
event_query = {
    "where": dict(operator="Equal", path=["event_id"], valueString=event_id),
}

event_response = client.query.get(
    "Event", ["created_at", "pubkey", "kind"]
).with_where(event_query["where"]).do()

event = event_response['data']['Get']['Event']

In [61]:
# Scratch cell exploring how to get all events for one pubkey.
# As written, only `created_response` applies the pubkey filter; the other two
# queries have their filter commented out and list objects without any filter.
pubkey = "17538dc2a62769d09443f18c37cbe358fab5bbf981173542aa7c5ff171ed77c4"  # elsat

# Equality filter on the `pubkey` property, passed to .with_where() below.
pubkey_query = {
  "where": {
    "operator": "Equal",
    "path": ["pubkey"],
    "valueString": pubkey
  }
}

# Unfiltered: returns the `pubkey` of User objects (the where clause and the
# hasCreated cross-reference selection are disabled below).
user_response = (
    client.query
    .get("User", ["pubkey"])
    # .get("User", [
    #     "hasCreated { ... on Event { event_id created_at pubkey kind content } }"
    # ])
    # .with_where(pubkey_query["where"])
    .do()
)

# Unfiltered: returns the `event_id` of Event objects.
event_response = (
    client.query
    .get("Event", ["event_id"])
    # .get("User", [
    #     "hasCreated { ... on Event { event_id created_at pubkey kind content } }"
    # ])
    # .with_where(pubkey_query["where"])
    .do()
)

# Filtered: User objects whose pubkey equals `pubkey`.
# NOTE: this result is not read anywhere else in this cell.
created_response = (
    client.query
    .get("User", ["pubkey"])
    .with_where(pubkey_query["where"])
    .do()
)

# Despite its name, `pubkey_events` holds the unfiltered User rows (pubkeys),
# and `events_output` holds the unfiltered Event rows (event ids).
pubkey_events = user_response['data']['Get']['User']
events_output = event_response['data']['Get']['Event']
# for user in events:
#     if 'hasCreated' in user and user['hasCreated'] is not None:
#         for event in user['hasCreated']:
#             print(event)


In [62]:
events_output

[{'event_id': '5ff7d82f23e955d99ba2af8437581b40702e0f9d2032c99e61e31e60cc744405'},
 {'event_id': '5278a6e4985a347292b8a7205e138717bf96076ad9648abbce39250fa0022b80'},
 {'event_id': 'bd869ae8d06aeb9f77b29e455943ed26c4b64371265a15300d293db9a9159570'},
 {'event_id': '0c1ba6625193064d45b9230ec86a3ad3f36df279ccf23516096b904466585a3e'},
 {'event_id': '0a04e4db3914a9e620c727d37f6650310c37bbf1c45089134c3c892205bd0909'},
 {'event_id': 'fbfbb399f00d311995181943a51d63dbb7fc1af1b304c56ce714538df54af4b8'},
 {'event_id': 'a4869cc73fd6c23323489bb3cb2b3aee90179a08c42c81779d7825ca68ffc07c'},
 {'event_id': '4e9967d462e6000bc08867fe8d1dfc87fda08e2a3db131efaa817fa53e9c8656'},
 {'event_id': '2e692ef1a5f7b404d946ad549e909c57c1b5e300f653e0cfafd5837830c0ea48'},
 {'event_id': 'b734b4ddd66c2cc1ba785dceb68623a7706bb6f63c0b6b5252d92ab3bf631fb6'},
 {'event_id': '5d573e4f97ae6b28748d25889c9a281fb294f58918b7c16cf9ffedd6f34564e1'},
 {'event_id': '1ca89f65cdc9314560c343684aa2bf5f04f5ae0546efcfe0e00498bad514a7f6'},
 {'e

In [54]:
pubkey_events

[{'pubkey': 'f3f5992cdb39e6108768d543fbd384a11efc3713085617ee28932ebb1614e07c'},
 {'pubkey': '7dc1677112f05eaf49547806543b1c006ce3257278e52b1c9abff63270ed704f'},
 {'pubkey': 'fcf70a45cfa817eaa813b9ba8a375d713d3169f4a27f3dcac3d49112df67d37e'},
 {'pubkey': '6b0a60cff3eca5a2b2505ccb3f7133d8422045cbef40f3d2c6189fb0b952e7d4'},
 {'pubkey': 'fb7d9edb022881ac80da6369832f67e300f06d8524a2a55d1aa88aed51b481ba'},
 {'pubkey': '6e75f7972397ca3295e0f4ca0fbc6eb9cc79be85bafdd56bd378220ca8eee74e'},
 {'pubkey': 'a96a35a224402b8075c4da20f0477896afcc3395b6fad63e30a648a8222a6a69'},
 {'pubkey': '7d33ba57d8a6e8869a1f1d5215254597594ac0dbfeb01b690def8c461b82db35'},
 {'pubkey': 'eb119234c467ac9d2ffea5b7284f3a74bd04287a12cfd58a22d19626434cddf2'},
 {'pubkey': 'bf95e1a45bd5ec5ca0d420f4f55cd01645c7849f57fdd13b8c5afec4ebdcc848'},
 {'pubkey': '2edbcea694d164629854a52583458fd6d965b161e3c48b57d3aff01940558884'},
 {'pubkey': 'fe200a4937fbf805c079fe967bc80799f0321822a048a7150ddeae12f3c49a48'},
 {'pubkey': 'fea400befeb5bf1

In [None]:
# Runtime settings read from the environment.
strfry_path = os.getenv("STRFRY_DB_FOLDER")

# An unset or empty variable falls back to the default on the right.
batch_size = int(os.getenv("WEAVIATE_CLIENT_BATCH_SIZE") or 1000)
min_content_length = int(os.getenv("MIN_CONTENT_LENGTH") or 50)
min_num_events = int(os.getenv("MIN_NUM_EVENTS") or 5)

# Page size used by the offset-paginated Weaviate queries.
page_limit = 1000

In [None]:
# 1st pass: scan the strfry LMDB and collect the id and content of every
# text-kind event whose content has at least `min_content_length` characters.
# Nothing is written to Weaviate here; the lists feed the embedding step.
start_time = datetime.now()
print("Start time:", start_time.strftime("%Y-%m-%d %H:%M:%S"))

text_kinds = [1, 31922, 31923]
content_list = []
event_id_list = []
# Set used only for the duplicate check: `x in list` is O(n) per event, which
# makes the whole scan quadratic on a large database.
seen_event_ids = set()

# Kept from the original cell so the batch size is configured before the
# loading cells run; this pass itself does not open a batch.
client.batch.configure(batch_size=batch_size)

env = lmdb.open(path=strfry_path, max_dbs=10)
try:
    payload_db = env.open_db(b"rasgueadb_defaultDb__EventPayload")
    id_db = env.open_db(b"rasgueadb_defaultDb__Event__id")
    with env.begin(db=id_db) as txn:
        for key, value in txn.cursor(db=id_db):
            # The id-index value is the key of the payload record.
            pl = txn.get(value, db=payload_db)
            if pl is None:
                print(key, value)
                continue
            # The first payload byte is skipped before JSON decoding
            # (presumably a strfry encoding marker -- TODO confirm).
            event_json = json.loads(pl[1:].decode("utf-8"))
            content = event_json.get("content")
            if content is None or len(content) < min_content_length:
                continue
            if event_json["kind"] not in text_kinds:
                continue
            event_id = event_json["id"]
            if event_id in seen_event_ids:
                continue
            seen_event_ids.add(event_id)
            content_list.append(content)
            event_id_list.append(event_id)
finally:
    # Other cells open the same path again; the environment must not stay open
    # twice in one process, so always release it.
    env.close()

end_time = datetime.now()
print("End time:", end_time.strftime("%Y-%m-%d %H:%M:%S"))

# Calculate and print the duration
duration = end_time - start_time
print("Duration:", duration)


In [None]:
len(event_id_list), len(content_list)

In [None]:
len(event_id_list), len(content_list)

In [None]:
# Load the 'all-MiniLM-L6-v2' SentenceTransformer model and encode every string
# in `content_list`; `embedding_list` holds whatever encode() returns.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_list = embedding_model.encode(content_list)

In [None]:
# 2nd pass: look up every id in `event_id_list` in the strfry LMDB and load the
# matching Event (and, once per pubkey, a User) into Weaviate.
start_time = datetime.now()
print("Start time:", start_time.strftime("%Y-%m-%d %H:%M:%S"))

record_count = 0  # number of Event objects queued in this run
text_kinds = [1, 31922, 31923]
pubkey_dict = {}  # pubkeys for which a User object was already queued
client.batch.configure(batch_size=batch_size)

env = lmdb.open(path=strfry_path, max_dbs=10)
try:
    payload_db = env.open_db(b"rasgueadb_defaultDb__EventPayload")
    id_db = env.open_db(b"rasgueadb_defaultDb__Event__id")

    with client.batch as batch:
        with env.begin(db=id_db) as txn:
            cur = txn.cursor(db=id_db)
            for event_id in event_id_list:
                db_key = bytes.fromhex(event_id)
                # The scan cells read the id as key.hex()[:64], i.e. only the
                # first 32 bytes of the index key, so an exact get() on the bare
                # id cannot be relied on. Seek to the first key >= id and check
                # the 32-byte prefix (bytes against bytes, not against the hex str).
                if not cur.set_range(db_key) or cur.key()[:32] != db_key:
                    print("Event ID not found:", event_id)
                    continue
                pl = txn.get(cur.value(), db=payload_db)
                if pl is None:
                    print("No payload for event ID:", event_id)
                    continue

                event_json = json.loads(pl[1:].decode("utf-8"))
                pubkey = event_json["pubkey"]
                # The Event schema declares created_at as a date, so send an
                # RFC 3339 timestamp in UTC rather than the raw epoch seconds.
                dt_utc = datetime.fromtimestamp(event_json["created_at"], timezone.utc)
                event_properties = {
                    "event_id": event_id,
                    "created_at": dt_utc.isoformat(),
                    "pubkey": pubkey,
                    "kind": event_json["kind"],
                    "content": event_json["content"],
                }
                batch.add_data_object(
                    data_object=event_properties,
                    class_name="Event",
                    # vector=embedding,
                )
                record_count += 1
                if pubkey not in pubkey_dict:
                    pubkey_dict[pubkey] = None
                    user_properties = {"pubkey": pubkey}
                    batch.add_data_object(
                        data_object=user_properties,
                        class_name="User",
                    )
                # The former debugging `break` here stopped the load after the
                # first event; it is removed so every collected id is processed.
finally:
    # Release the environment so other cells can open the same path again.
    env.close()

end_time = datetime.now()
duration = end_time - start_time
print("End time:", end_time.strftime("%Y-%m-%d %H:%M:%S"))
print("Duration:", duration)
print("Events queued:", record_count)


In [None]:
db_key

In [None]:
# 1st pass (Weaviate variant): page through all Event objects and keep the id
# and content of those whose content is longer than `min_content_length`.
content_list = []
event_id_list = []
offset = 0
while True:
    events_response = client.query.get(
        "Event",
        ["event_id", "content"]
        # ["event_id", "content", "_additional { id }"]
    ).with_limit(page_limit).with_offset(offset).do()

    events = events_response["data"]["Get"]["Event"]
    if not events:
        break
    for event in events:
        # A null content would make len() raise; treat it as empty so the
        # object is simply skipped.
        content = event['content'] or ""
        if len(content) > min_content_length:
            content_list.append(content)
            event_id_list.append(event['event_id'])
    # A short page means the last page has been read.
    if len(events) < page_limit:
        break
    offset += page_limit

In [None]:
# 2nd pass (full-scan variant): walk the strfry LMDB once and load every
# qualifying text event, plus one User per distinct pubkey, into Weaviate.
start_time = datetime.now()
print("Start time:", start_time.strftime("%Y-%m-%d %H:%M:%S"))

record_count = 0
text_kinds = [1, 31922, 31923]
pubkey_dict = {}  # pubkeys for which a User object was already queued
event_dict = {}
content_list = []
event_id_list = []
# Set used only for the duplicate check; list membership is O(n) per event.
seen_event_ids = set()
client.batch.configure(batch_size=batch_size)

env = lmdb.open(path=strfry_path, max_dbs=10)
try:
    payload_db = env.open_db(b"rasgueadb_defaultDb__EventPayload")
    id_db = env.open_db(b"rasgueadb_defaultDb__Event__id")

    with client.batch as batch:
        with env.begin(db=id_db) as txn:
            for key, value in txn.cursor(db=id_db):
                pl = txn.get(value, db=payload_db)
                if pl is None:
                    print(key, value)
                    continue
                event_json = json.loads(pl[1:].decode("utf-8"))
                content = event_json.get("content")
                if content is None or len(content) < min_content_length:
                    continue
                if event_json["kind"] not in text_kinds:
                    continue

                event_id = event_json["id"]
                if event_id in seen_event_ids:
                    continue
                seen_event_ids.add(event_id)
                content_list.append(content)
                event_id_list.append(event_id)

                pubkey = event_json["pubkey"]
                # The Event schema declares created_at as a date; the raw value
                # is epoch seconds, so convert it to an RFC 3339 UTC timestamp.
                created_at = datetime.fromtimestamp(
                    event_json["created_at"], timezone.utc
                ).isoformat()
                event_properties = {
                    "event_id": event_id,
                    "created_at": created_at,
                    "pubkey": pubkey,
                    "kind": event_json["kind"],
                    "content": content,
                }
                batch.add_data_object(
                    data_object=event_properties,
                    class_name="Event",
                    #vector=emb,
                )
                record_count += 1
                if pubkey not in pubkey_dict:
                    pubkey_dict[pubkey] = None
                    user_properties = {
                        "pubkey": pubkey,
                    }
                    batch.add_data_object(
                        data_object=user_properties,
                        class_name="User",
                    )
finally:
    # Release the environment so other cells can open the same path again.
    env.close()
# embeddings = embedding_model.encode(content_list)


end_time = datetime.now()
print("End time:", end_time.strftime("%Y-%m-%d %H:%M:%S"))

# Calculate and print the duration
duration = end_time - start_time
print("Duration:", duration)


In [None]:
# Create the User -> Event `hasCreated` cross-references.
# Pages through every Event, resolves the UUID of the User with the same pubkey
# (cached per pubkey once found) and queues one reference per event.

start_time = datetime.now()
print("Start time:", start_time.strftime("%Y-%m-%d %H:%M:%S"))

offset = 0
user_uuids_dict = {}
with client.batch as batch:
    while True:
        events_response = (
            client.query
            .get("Event", ["event_id", "content", "pubkey", "_additional { id }"])
            .with_limit(page_limit)
            .with_offset(offset)
            .do()
        )
        events = events_response["data"]["Get"]["Event"]
        if not events:
            break

        for event in events:
            pubkey = event["pubkey"]
            user_uuid = user_uuids_dict.get(pubkey)

            if not user_uuid:
                # Cache miss: ask Weaviate for the User carrying this pubkey.
                pubkey_query = {
                    "where": dict(operator="Equal", path=["pubkey"], valueString=pubkey)
                }
                user_response = client.query.get(
                    "User", ["_additional { id }"]
                ).with_where(pubkey_query["where"]).do()

                _matched_users = user_response["data"]["Get"]["User"]
                if _matched_users:
                    user_uuid = _matched_users[0]["_additional"]["id"]
                    user_uuids_dict[pubkey] = user_uuid

            if not user_uuid:
                # No User object for this pubkey: nothing to link.
                continue

            batch.add_reference(
                from_object_uuid=user_uuid,
                from_object_class_name="User",
                from_property_name="hasCreated",
                to_object_uuid=event["_additional"]["id"],
                to_object_class_name="Event"
            )

        # A short page means the last page has been read.
        if len(events) < page_limit:
            break
        offset += page_limit

end_time = datetime.now()
print("End time:", end_time.strftime("%Y-%m-%d %H:%M:%S"))

# Calculate and print the duration
duration = end_time - start_time
print("Duration:", duration)


In [None]:
# List users together with the events they reference through `hasCreated`.
# A cross-reference property cannot be requested as a bare field in a Get
# query; it needs an inline fragment naming the target class and the
# properties to return.
users = client.query.get(
    "User",
    ["name", "pubkey", "hasCreated { ... on Event { event_id } }"],
).do()
for user in users["data"]["Get"]["User"]:
    print(f"User: {user['name']} has created events: {user['hasCreated']}")


In [None]:
users

In [None]:
event["_additional"]["id"]

In [None]:
user_response["data"]["Get"]["User"][:5]

In [None]:
events

In [None]:
# Encode every string in `content_list` with the already-loaded `embedding_model`.
embeddings = embedding_model.encode(content_list)

In [None]:
embeddings[0]

In [None]:
len(embeddings)

In [None]:
len(content_list), len(event_id_list)

In [None]:
# Find pubkeys with at least `min_num_events` events whose content is longer
# than `min_content_length`, by paging through all Event objects in Weaviate.
offset = 0
event_counts = {}
while True:
    events_response = (
        client.query
        .get("Event", ["event_id", "content", "pubkey", "_additional { id }"])
        .with_limit(page_limit)
        .with_offset(offset)
        .do()
    )
    events = events_response["data"]["Get"]["Event"]
    if events is None:
        break

    for event in events:
        if len(event['content']) <= min_content_length:
            continue
        pubkey = event['pubkey']
        if pubkey in event_counts:
            event_counts[pubkey] += 1
        else:
            event_counts[pubkey] = 1

    # A short page means the last page has been read.
    if len(events) < page_limit:
        break
    offset += page_limit

pubkeys_with_events = list(
    filter(lambda pk: event_counts[pk] >= min_num_events, event_counts)
)

print(pubkeys_with_events)


In [None]:
events = events_response["data"]["Get"]["Event"]
events[0]

In [None]:
# Peek at a single Event object, including its Weaviate id.
# (Swap in .with_additional("vector") to inspect the stored vector instead.)
response = client.query.get(
    "Event", ["event_id", "pubkey", "kind", "content"]
).with_limit(1).with_additional(["distance", "id"]).do()

print(json.dumps(response, indent=4))

In [None]:
# Ask Weaviate for the total number of Event objects.
response = client.query.aggregate("Event").with_meta_count().do()

print(json.dumps(response, indent=2))

In [None]:
# Semantic search: up to 100 Events closest to the given concept, with the
# distance and Weaviate id of each hit.
near_text_config = dict(concepts=["hair typo"])

response = client.query.get(
    "Event", ["event_id", "content"]
).with_near_text(near_text_config).with_limit(100).with_additional(
    ["distance", "id"]
).do()

result = response['data']['Get']['Event']
# print(json.dumps(response, indent=4))

In [None]:
# Fetch up to 10000 Events authored by one pubkey, including each object's
# stored vector, using a raw GraphQL query.
pubkey_value = "eab0e756d32b80bcd464f3d844b8040303075a13eabc3599a762c9ac7ab91f4f"

# The pubkey is spliced into the query text with %-formatting; this is only
# safe because `pubkey_value` is a constant defined right above.
query = """
{
  Get {
    Event(
      where: {
        path: ["pubkey"]
        operator: Equal
        valueString: "%s"
      }
      limit: 10000
    ) {
      event_id
      created_at
      pubkey
      kind
      content
      _additional {
        vector
      }
    }
  }
}
""" % pubkey_value

# `events` is the list of matching Event objects from the response.
result = client.query.raw(query)
events = result['data']['Get']['Event']

In [None]:
# Collapse the per-event vectors in `events` into one mean vector
# (element-wise average across events).
vectors = [ev['_additional']['vector'] for ev in events]
vectors_array = np.array(vectors)
mean_vector = vectors_array.mean(axis=0)

# Optional: tabulate the vectors, one row per event id.
# event_ids = [event['event_id'] for event in events]
# df_vec = pd.DataFrame(vectors, index=event_ids, columns=[f'feature_{i}' for i in range(len(vectors[0]))])


In [None]:
mean_vector[:5]

In [None]:
# Plain Get on the Event class.
# The original requested a "question" property, which the Event schema defined
# in this notebook does not declare, so the query was rejected. Request
# properties that exist instead.
response = (
    client.query
    .get("Event", ["event_id", "content"])
    .do()
)

In [None]:
def process_month(event_json):
    """Return the UTC calendar month ("YYYY-MM") of an event.

    Args:
        event_json: decoded event dict; `created_at` is read as a Unix
            timestamp in seconds.

    Returns:
        The month as a zero-padded "YYYY-MM" string.

    The timestamp is interpreted in UTC, matching the loading code elsewhere
    in this notebook, so the bucket does not depend on the machine's timezone.
    """
    created = datetime.fromtimestamp(event_json['created_at'], timezone.utc)
    return created.strftime("%Y-%m")

def process_date(event_json):
    """Return the UTC calendar date ("YYYY-MM-DD") of an event.

    Args:
        event_json: decoded event dict; `created_at` is read as a Unix
            timestamp in seconds.

    Returns:
        The date as an ISO "YYYY-MM-DD" string.

    ISO order is used instead of "MM-DD-YYYY" because the result is sorted as
    text before plotting: month-first strings place January of a later year
    before December of an earlier one. The timestamp is interpreted in UTC so
    the bucket does not depend on the machine's timezone.
    """
    created = datetime.fromtimestamp(event_json['created_at'], timezone.utc)
    return created.strftime("%Y-%m-%d")

def process_kind(event_json):
    """Return the event's `kind` value exactly as stored in the decoded dict."""
    kind = event_json['kind']
    return kind

def process_db(process_fn):
    """Apply `process_fn` to every event in the strfry LMDB and collect the results.

    Args:
        process_fn: callable taking one decoded event dict and returning a value.

    Returns:
        A list with one `process_fn` result per event whose payload was found,
        in id-index iteration order.

    The environment path comes from the STRFRY_DB_FOLDER environment variable.
    Entries whose payload is missing are printed and skipped. The environment
    is closed before returning, because this function is called several times
    and the same LMDB path must not be open twice in one process.
    """
    env = lmdb.open(path=os.getenv("STRFRY_DB_FOLDER"), max_dbs=10)
    try:
        payload_db = env.open_db(b"rasgueadb_defaultDb__EventPayload")
        id_db = env.open_db(b"rasgueadb_defaultDb__Event__id")
        output_list = []

        with env.begin(db=id_db) as txn:
            for key, value in txn.cursor(db=id_db):
                # The id-index value is the key of the payload record.
                pl = txn.get(value, db=payload_db)
                if pl is None:
                    print(key, value)
                    continue
                # The first payload byte is skipped before JSON decoding
                # (presumably a strfry encoding marker -- TODO confirm).
                event_json = json.loads(pl[1:].decode("utf-8"))
                output_list.append(process_fn(event_json))
        return output_list
    finally:
        env.close()

In [None]:
# Bar chart: number of events per month.
month_list = process_db(process_month)
event_counts = pd.Series(month_list).value_counts().sort_index()

fig = px.bar(
    event_counts,
    x=event_counts.index,
    y=event_counts.values,
    labels=dict(x='Month', y='Number of Events'),
)
fig.update_layout(
    title='Number of Events by Month',
    xaxis_title='Month',
    yaxis_title='Number of Events',
)
fig.show()

In [None]:
# Bar chart: number of events per date.
date_list = process_db(process_date)
event_counts = pd.Series(date_list).value_counts().sort_index()

fig = px.bar(
    event_counts,
    x=event_counts.index,
    y=event_counts.values,
    labels=dict(x='Date', y='Number of Events'),
)
fig.update_layout(
    title='Number of Events by Date',
    xaxis_title='Date',
    yaxis_title='Number of Events',
)
fig.show()

In [None]:
# Bar chart: number of events per kind. Kinds are converted to strings so the
# x axis is treated as categories.
kind_list = process_db(process_kind)
counts = pd.Series(kind_list).value_counts().sort_index()

fig = px.bar(
    counts,
    x=[str(kind) for kind in counts.index],
    y=counts.values,
    labels=dict(x='kind', y='Number of Events'),
)
fig.update_layout(
    title='Number of Events by kind',
    xaxis_title='kind',
    yaxis_title='Number of Events',
)
fig.show()
# 1, 30023


In [None]:
# Timing scaffold: put the work to be measured between the two timestamps.
start_time = datetime.now()
print(f"Start time: {start_time:%Y-%m-%d %H:%M:%S}")

end_time = datetime.now()
print(f"End time: {end_time:%Y-%m-%d %H:%M:%S}")

# Elapsed wall-clock time between the two timestamps.
duration = end_time - start_time
print(f"Duration: {duration}")

In [None]:
# Count how many events each pubkey has in the strfry LMDB.
pubkey_counts = {}

env = lmdb.open(path=os.getenv("STRFRY_DB_FOLDER"), max_dbs=10)
try:
    payload_db = env.open_db(b"rasgueadb_defaultDb__EventPayload")
    id_db = env.open_db(b"rasgueadb_defaultDb__Event__id")

    # One read transaction serves both databases; the payload lookup names its
    # database explicitly, as the other scan cells do.
    with env.begin(db=id_db) as txn:
        for key, value in txn.cursor(db=id_db):
            pl = txn.get(value, db=payload_db)
            if pl is None:
                raise Exception("db corrupt!?")
            event_json = json.loads(pl[1:].decode("utf-8"))  # event json
            pubkey = event_json['pubkey']
            pubkey_counts[pubkey] = pubkey_counts.get(pubkey, 0) + 1
finally:
    # Release the environment so other cells can open the same path again.
    env.close()

In [None]:
# Show the five pubkeys with the most events, highest count first.
_ranked = sorted(pubkey_counts.items(), key=lambda item: item[1], reverse=True)
top_pubkeys = [pk for pk, _count in _ranked[:5]]

for pubkey in top_pubkeys:
    print(f"PubKey: {pubkey}, Count: {pubkey_counts[pubkey]}")