In [13]:
import sys, os
from dotenv import load_dotenv
import numpy as np
from datetime import datetime, timezone, timedelta
import pandas as pd

import time
import json
from typing import List, Dict


# own functions
from mom import influxDB_utils as influx
from mom import Mandelbrot


from openai import OpenAI
from sentence_transformers import SentenceTransformer
#import faiss
import hnswlib

#news
from newsapi import NewsApiClient
import praw


In [11]:
# Load env from project root
load_dotenv()

# The following entries are expected in MOM_Crypto_Bot/.env
ASSET = os.getenv("ASSET")
CURRENCY = os.getenv("CURRENCY")

OPENAI_KEY = os.getenv("OPENAI_API_KEY")
# Chat-model settings; overridable from .env, with conservative defaults.
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
OPENAI_TEMPERATURE = float(os.getenv("OPENAI_TEMPERATURE", "0.0"))

NEWSAPI_KEY = os.getenv("NEWSAPI_KEY")
REDDIT_ID = os.getenv("REDDIT_CLIENT_ID")
REDDIT_SECRET = os.getenv("REDDIT_SECRET")

# Do not print or display the keys: cell output is saved with the notebook.

# Only the `OpenAI` class is imported in this notebook (there is no `openai`
# module name in scope), so credentials are bound to an explicit client object
# instead of assigning `openai.api_key`.
client = OpenAI(api_key=OPENAI_KEY)

[REDACTED: OpenAI API key removed from saved cell output - revoke and rotate this key]
[REDACTED: second API key removed from saved cell output - revoke and rotate this key]


In [14]:
# ---- embedding model + index (global) ----
# Name of the sentence-transformers checkpoint used for the local embeddings.
EMB_MODEL_NAME = "all-MiniLM-L6-v2"  # small & fast; change if you want larger
# Shared encoder instance; the functions below call `embed_model.encode(...)`.
# NOTE(review): presumably downloads the checkpoint on first use - confirm
# network access is acceptable wherever this notebook runs.
embed_model = SentenceTransformer(EMB_MODEL_NAME)
# Embedding width reported by the model; passed as `dim` to the hnswlib index.
DIM = embed_model.get_sentence_embedding_dimension()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
# hnswlib index (in-memory). Persist/restore if needed.
# Cosine-space index whose dimensionality matches the embedding model (DIM).
_index = hnswlib.Index(space="cosine", dim=DIM)
# Flag flipped by init_index() so the index is only initialised once.
_index_initialized = False
# Maps each integer label stored in `_index` to {"text": ..., "meta": ...}.
_articles_store = {}  # id -> metadata + text

def init_index(max_elements=20000):
    """Initialise the module-level hnswlib index exactly once.

    Later calls are no-ops, so `max_elements` only has an effect on the
    first call.

    Args:
        max_elements: capacity handed to `_index.init_index` on first use.
    """
    global _index_initialized
    if _index_initialized:
        # Already set up; leave the existing index untouched.
        return
    _index.init_index(max_elements=max_elements, ef_construction=200, M=16)
    _index.set_ef(50)
    _index_initialized = True

In [16]:
# ---- 1) Hurst helpers ----
def get_latest_hurst(asset: str, lookback_days=30):
    """Query InfluxDB for the latest Hurst value stored for `asset`.

    Args:
        asset: value matched against the `asset` tag of the "hurst" measurement.
        lookback_days: size of the query window, in days, counted back from now.

    Returns:
        The most recent value as a float, or None when the query yields no
        records or the stored value is missing / not numeric.
    """
    client = influx.get_client()
    # NOTE(review): `asset` is interpolated into the Flux query verbatim;
    # only pass trusted values.
    q = f'''
    from(bucket: "Hurst")
      |> range(start: -{lookback_days}d)
      |> filter(fn: (r) => r._measurement == "hurst" and r.asset == "{asset}")
      |> last()
    '''
    query_api = client.query_api()
    tables = query_api.query(org=influx.INFLUX_ORG, query=q)
    if not tables or not tables[0].records:
        return None
    rec = tables[0].records[0]
    # Only the record *value* is a candidate number. The previous fallback fed
    # `rec.get_field()` to float(), which raised instead of returning None.
    try:
        return float(rec.get_value())
    except (TypeError, ValueError):
        return None

def categorize_hurst(h: float):
    """Map a Hurst exponent to a coarse regime label.

    Args:
        h: Hurst exponent, or None / NaN when no value is available.

    Returns:
        "unknown" for None or NaN, "high_volatility" for h < 0.45,
        "random_walk" for 0.45 <= h < 0.55, otherwise "trending".
    """
    # NaN compares False against every threshold and would otherwise fall
    # through to "trending", so it is treated like a missing value.
    if h is None or np.isnan(h):
        return "unknown"
    if h < 0.45:
        return "high_volatility"
    if h < 0.55:
        return "random_walk"
    return "trending"


In [None]:
# Smoke test: fetch the latest stored Hurst value for the configured ASSET
# and show it next to its regime label.
H = get_latest_hurst(ASSET)
h = categorize_hurst(H)

print(H,h)

In [None]:
# ---- 2) News fetch + preprocessing ----
# Module-level NewsAPI client used by fetch_newsapi_articles(); NEWSAPI_KEY
# is read from the environment in the configuration cell.
newsapi = NewsApiClient(api_key=NEWSAPI_KEY)

def fetch_newsapi_articles(query: str, from_dt: datetime = None, page_size=50) -> List[Dict]:
    """Fetch English articles matching `query`, newest first.

    Args:
        query: search expression forwarded as `q`.
        from_dt: optional lower bound on the publication time. Aware
            datetimes are converted to UTC before formatting.
        page_size: maximum number of articles requested.

    Returns:
        List of dicts with keys title, description, content, url,
        publishedAt, source. Text fields are never None and `content`
        is capped at 4000 characters.
    """
    # get_everything() takes snake_case keyword arguments; the camelCase names
    # used by the REST API (pageSize / sortBy) are rejected with a TypeError.
    params = {"q": query, "page_size": page_size, "language": "en", "sort_by": "publishedAt"}
    if from_dt is not None:
        if from_dt.tzinfo is not None:
            from_dt = from_dt.astimezone(timezone.utc)
        # The client accepts YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS strings, so
        # isoformat() output with microseconds / UTC offset is not usable.
        params["from_param"] = from_dt.strftime("%Y-%m-%dT%H:%M:%S")
    res = newsapi.get_everything(**params)
    cleaned = []
    for a in res.get("articles") or []:
        source = a.get("source") or {}  # tolerate an explicit null source
        cleaned.append({
            "title": a.get("title") or "",
            "description": a.get("description") or "",
            "content": (a.get("content") or "")[:4000],  # cap
            "url": a.get("url"),
            "publishedAt": a.get("publishedAt"),
            "source": source.get("name"),
        })
    return cleaned

def chunk_text(txt: str, max_words=200):
    """Split `txt` on whitespace into consecutive pieces of at most `max_words` words.

    Returns a list of space-joined strings; an empty or whitespace-only
    input yields an empty list.
    """
    tokens = txt.split()
    return [
        " ".join(tokens[start:start + max_words])
        for start in range(0, len(tokens), max_words)
    ]

In [None]:
# ---- 3) Indexing ----
def add_articles_to_index(articles: List[Dict], prefix_id=0):
    """Chunk, embed and index articles; return the newly added entries.

    Args:
        articles: dicts with title / description / content / url / source /
            publishedAt keys. Missing or None text fields are skipped.
        prefix_id: currently unused; kept so existing callers keep working.

    Returns:
        Dict mapping each newly assigned integer id to its
        {"text": ..., "meta": ...} record. Chunks that are already indexed
        (same url and same text) are skipped, so the result may be empty.
    """
    init_index(max_elements=max(10000, len(articles)*4))
    next_id = max(_articles_store.keys()) + 1 if _articles_store else 0

    # Re-running the pipeline fetches largely the same articles; without this
    # check every run would add duplicate vectors that crowd the top-k results.
    already_indexed = {(rec["meta"].get("url"), rec["text"]) for rec in _articles_store.values()}

    texts = []
    metas = []
    for art in articles:
        parts = (art.get("title"), art.get("description"), art.get("content"))
        full_text = " ".join(p for p in parts if p)
        for chunk in chunk_text(full_text, max_words=200):
            key = (art.get("url"), chunk)
            if key in already_indexed:
                continue
            already_indexed.add(key)
            texts.append(chunk)
            metas.append({
                "title": art.get("title"),
                "url": art.get("url"),
                "source": art.get("source"),
                "publishedAt": art.get("publishedAt"),
            })
    if not texts:
        return {}

    embeddings = embed_model.encode(texts, convert_to_numpy=True, show_progress_bar=False)

    # init_index() only sizes the index on its first call, so grow it
    # explicitly before add_items() would overflow the fixed capacity.
    required = _index.get_current_count() + len(texts)
    capacity = _index.get_max_elements()
    if required > capacity:
        _index.resize_index(max(required, 2 * capacity))

    ids = list(range(next_id, next_id + len(texts)))
    _index.add_items(embeddings, ids)

    added = {}
    for new_id, text, meta in zip(ids, texts, metas):
        _articles_store[new_id] = {"text": text, "meta": meta}
        added[new_id] = _articles_store[new_id]
    return added

In [None]:
# ---- 4) Retrieval ----
def retrieve(query: str, k=5):
    """Return up to `k` stored chunks most similar to `query`.

    Args:
        query: free-text query, embedded with the shared `embed_model`.
        k: maximum number of neighbours to return.

    Returns:
        List of dicts holding the chunk's metadata plus "text" and "id"
        (a plain int). Empty when nothing has been indexed yet.
    """
    # Nothing can be searched before init_index() has run.
    if not _index_initialized:
        return []
    # hnswlib raises when asked for more neighbours than it holds.
    k = min(k, _index.get_current_count())
    if k <= 0:
        return []
    vec = embed_model.encode([query], convert_to_numpy=True)
    labels, _distances = _index.knn_query(vec, k=k)
    results = []
    for raw_id in labels[0]:
        # Cast the label to a builtin int so results stay JSON-serialisable.
        doc_id = int(raw_id)
        entry = _articles_store.get(doc_id)
        if entry is not None:
            results.append({**entry["meta"], "text": entry["text"], "id": doc_id})
    return results

# ---- 5) LLM prompt + call ----
def build_prompt(asset: str, hurst_value: float, hurst_cat: str, retrieved: List[Dict]):
    """Assemble the single-turn risk-evaluation prompt.

    The prompt has three parts: a header with the asset and its Hurst
    signal plus the task instructions, one block per retrieved snippet
    (source, date, title and the first 800 characters of text), and a
    closing instruction.
    """
    pieces = [
        f"Asset: {asset}\nHurst: {hurst_value}\nHurst_category: {hurst_cat}\n\n",
        "You are a financial risk analyst. Using the Hurst signal (above) and the news snippets below, produce a concise risk evaluation (1-3 short paragraphs) with:\n",
        "- a single-line risk level (LOW / MEDIUM / HIGH)\n- a short justification referencing the Hurst signal and at least two article snippets by source+date.\n- suggestions for monitoring (what to watch next).\n\n",
        "News snippets (most relevant first):\n",
    ]
    for item in retrieved:
        published = item.get("publishedAt")
        origin = item.get("source")
        headline = item.get("title", "")
        excerpt = item.get("text")[:800]
        pieces.append(f"\n[{origin} | {published}] {headline}\n{excerpt}\n---\n")
    pieces.append("\nNow give the risk evaluation. Be concise and include explicit references to the Hurst signal and news snippets.\n")
    return "".join(pieces)

def ask_llm(prompt: str, model: str = None, temperature=None, max_tokens=600):
    """Send a single-turn chat request and return the reply text.

    Args:
        prompt: user message content.
        model: chat model name; defaults to the OPENAI_MODEL environment
            variable, falling back to "gpt-4o-mini".
        temperature: sampling temperature; defaults to the OPENAI_TEMPERATURE
            environment variable, falling back to 0.0.
        max_tokens: upper bound on the length of the reply.

    Returns:
        The content of the first choice's message.
    """
    # Defaults are resolved at call time so the function can be defined
    # whether or not the config cell exposes OPENAI_MODEL / OPENAI_TEMPERATURE.
    if model is None:
        model = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
    if temperature is None:
        temperature = float(os.getenv("OPENAI_TEMPERATURE", "0.0"))

    messages = [{"role":"system", "content": "You are a professional quantitative risk analyst."},
                {"role":"user", "content": prompt}]
    # The notebook imports the client class (`from openai import OpenAI`);
    # the module-level `openai.ChatCompletion` interface is not available.
    llm = OpenAI(api_key=OPENAI_KEY)
    resp = llm.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return resp.choices[0].message.content

# ---- 6) Orchestration ----
def run_rag_risk(asset="BTC", lookback_news_days=7, top_k=5):
    """Run the full pipeline: Hurst signal -> news -> retrieval -> LLM verdict.

    Args:
        asset: used for the Hurst lookup, the news search and the retrieval query.
        lookback_news_days: how far back (in days, from now in UTC) to fetch news.
        top_k: number of snippets retrieved for the prompt.

    Returns:
        {"error": "no articles"} when the news search is empty; otherwise a
        dict with asset, hurst_value, hurst_cat, retrieved, llm_result and
        an ISO-8601 UTC timestamp.
    """
    # Market signal
    hurst_value = get_latest_hurst(asset)
    hurst_label = categorize_hurst(hurst_value)

    # News inside the look-back window
    window_start = datetime.now(timezone.utc) - timedelta(days=lookback_news_days)
    news_items = fetch_newsapi_articles(asset, from_dt=window_start, page_size=50)
    if not news_items:
        return {"error":"no articles"}

    # Index, then pull the most relevant chunks back out
    add_articles_to_index(news_items)
    snippets = retrieve(asset, k=top_k)

    # Ask the model for the evaluation
    answer = ask_llm(build_prompt(asset, hurst_value, hurst_label, snippets))

    return {
        "asset": asset,
        "hurst_value": hurst_value,
        "hurst_cat": hurst_label,
        "retrieved": snippets,
        "llm_result": answer,
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }

# example usage
if __name__ == "__main__":
    print("Running RAG evaluator for BTC...")
    result = run_rag_risk("BTC")
    # run_rag_risk() returns {"error": ...} when no articles were found;
    # indexing "llm_result" unconditionally would raise KeyError in that case.
    if "error" in result:
        print(f"RAG evaluation failed: {result['error']}")
    else:
        print(result["llm_result"])

In [5]:
# Embedding helper
def embed_texts(texts, model="text-embedding-3-small"):
    """Embed `texts` with the OpenAI embeddings endpoint.

    Args:
        texts: sequence of strings to embed.
        model: embedding model name.

    Returns:
        List of float32 numpy vectors, one per entry of the response data.
        An empty input returns [] without calling the API.
    """
    # Skip the request for an empty batch: there is nothing to embed.
    if len(texts) == 0:
        return []
    resp = client.embeddings.create(model=model, input=texts)
    return [np.array(e.embedding, dtype=np.float32) for e in resp.data]


In [6]:
# Scratch check of the current time. Called without a tz argument this gives
# a naive datetime (no tzinfo), unlike the `datetime.now(timezone.utc)` calls
# used elsewhere in this notebook.
datetime.now()

datetime.datetime(2025, 9, 16, 20, 57, 48, 509094)

In [12]:
# Get news from NewsAPI for the configured asset.
asset = ASSET

news = NewsApiClient(api_key=NEWSAPI_KEY)

# Rolling look-back window instead of a fixed calendar date, so the cell keeps
# working when it is re-run later. Change NEWS_LOOKBACK_DAYS for other intervals.
NEWS_LOOKBACK_DAYS = 14
NEWSAPI_DATETIME_FMT = "%Y-%m-%dT%H:%M:%S"
now_utc = datetime.now(timezone.utc)
from_date = (now_utc - timedelta(days=NEWS_LOOKBACK_DAYS)).strftime(NEWSAPI_DATETIME_FMT)

articles = news.get_everything(
    q=asset,
    from_param=from_date,
    to=now_utc.strftime(NEWSAPI_DATETIME_FMT),
    language="en",
    sort_by="relevancy",
    page_size=25
).get("articles", [])


# "title. description" strings; `or ""` guards against an explicit null title,
# which would otherwise raise TypeError on string concatenation.
articel_content = [(a.get("title") or "") + ". " + (a.get("description") or a.get("content") or "")
            for a in articles]

# Clean up: one record per article with all available text joined together.
cleaned = []
for a in articles:
    text = " ".join(filter(None, [a.get("title"), a.get("description"), a.get("content")]))
    cleaned.append({
            "asset": asset,
            "date": a.get("publishedAt"),
            "text": text,
            # tolerate an explicit null source
            "source": (a.get("source") or {}).get("name")
        })
    

In [9]:
# Display the "title. description" strings built in the NewsAPI cell.
# NOTE(review): `articel_content` is a misspelling of `article_content`; a
# rename has to be applied in every cell that uses the name.
articel_content

['$3.38 Billion in Bitcoin Options Expiry Raises Concerns of September Volatility. With $3.38 billion of Bitcoin BTC $110 973 24h volatility: 0.5% Market cap: $2.21 T Vol. 24h: $58.02 B options expiry on Sept. 5, BTC price is showing some...',
 'BlackRock Continues to Offload Millions in Bitcoin, Ethereum, But for How Long?. BlackRock is the world’s largest asset manager and the leading institution in Bitcoin BTC $114 077 24h volatility: 1.5% Market cap: $2.27 T Vol. 24h: $52.57 ...',
 "Businesses Buy 1,755 Bitcoin Daily, Adding $1.3 Trillion in 20 Months – BTC Above $125K Next?. If you click 'Accept all', we and our partners, including 237 who are part of the IAB Transparency &amp; Consent Framework, will also store and/or access information on a device (in other words, use … [+714 chars]",
 'How did Tether profit $13B in 2024 from USDT?. How Tether turned $113B in Treasuries into $13B profit - USDT is most profitable model in crypto history',
 'Michael Saylor Breaks Silence on Big S&

In [None]:

# Embedding helper
def embed_texts(texts, model="text-embedding-3-small"):
    """Embed `texts` with the OpenAI embeddings endpoint.

    Args:
        texts: sequence of strings to embed.
        model: embedding model name.

    Returns:
        List of float32 numpy vectors, one per entry of the response data.
        An empty input returns [] without calling the API.
    """
    # Skip the request for an empty batch: there is nothing to embed.
    if len(texts) == 0:
        return []
    resp = client.embeddings.create(model=model, input=texts)
    return [np.array(e.embedding, dtype=np.float32) for e in resp.data]

# Build hnswlib index
def build_hnsw(texts, dim=1536):
    """Embed `texts` and build an L2 hnswlib index over them.

    Args:
        texts: strings to embed and index; labels are their list positions.
        dim: fallback dimensionality, used only when `texts` is empty.
            Otherwise the width of the returned embeddings is used, so the
            index always matches the embedding model.

    Returns:
        (index, embeddings) where `embeddings` is the list from embed_texts().
    """
    embs = embed_texts(texts)
    if embs:
        # Trust the data rather than the hint: a mismatching `dim` would make
        # add_items() fail.
        dim = len(embs[0])
    idx = hnswlib.Index(space="l2", dim=dim)
    # Keep capacity >= 1 so an empty corpus still yields a usable index object.
    idx.init_index(max_elements=max(len(embs), 1), ef_construction=200, M=16)
    if embs:
        idx.add_items(np.vstack(embs), list(range(len(embs))))
    idx.set_ef(50)
    return idx, embs

# Query index
def query_index(idx, texts, query, k=3):
    """Return the (at most) `k` entries of `texts` closest to `query`.

    Args:
        idx: hnswlib index whose labels are positions in `texts`.
        texts: the strings that were indexed.
        query: free-text query, embedded with embed_texts().
        k: maximum number of neighbours.

    Returns:
        List of matching strings, nearest first; [] when `texts` is empty.
    """
    # hnswlib raises when asked for more neighbours than the index holds.
    k = min(k, len(texts))
    if k <= 0:
        return []
    q_emb = embed_texts([query])[0]
    labels, _ = idx.knn_query(q_emb, k=k)
    return [texts[int(i)] for i in labels[0]]

# Fetch functions
def fetch_news(keyword, days=7, limit=50):
    """Fetch recent English news for `keyword` as "title. body" strings.

    Args:
        keyword: search expression forwarded as `q`.
        days: size of the look-back window, ending now (UTC).
        limit: maximum number of articles requested.

    Returns:
        One string per article: the title, ". ", then the description or,
        failing that, the content. Missing parts become empty strings.
    """
    news = NewsApiClient(api_key=NEWSAPI_KEY)
    # Second-resolution timestamps without offset: isoformat() would append
    # microseconds, which is not one of the date formats the client accepts.
    fmt = "%Y-%m-%dT%H:%M:%S"
    now_utc = datetime.now(timezone.utc)
    from_date = (now_utc - timedelta(days=days)).strftime(fmt)
    articles = news.get_everything(q=keyword, from_param=from_date,
                                   to=now_utc.strftime(fmt),
                                   language='en', sort_by='relevancy',
                                   page_size=limit).get("articles") or []
    # `or ""` also covers an explicit null title, not just a missing key.
    return [(a.get("title") or "") + ". " + (a.get("description") or a.get("content") or "")
            for a in articles]

def fetch_reddit(keyword, subreddits=["CryptoCurrency"], limit=50):
    """Search each subreddit for `keyword` (newest first) and return post texts.

    Each entry is the post title, ". ", then the self-text (empty string
    when the post has none). Results keep the order of `subreddits`.
    """
    reddit_client = praw.Reddit(
        client_id=REDDIT_ID,
        client_secret=REDDIT_SECRET,
        user_agent="crypto-risk-app/0.1",
    )
    return [
        submission.title + ". " + (submission.selftext or "")
        for name in subreddits
        for submission in reddit_client.subreddit(name).search(keyword, sort="new", limit=limit)
    ]

# Risk evaluation
def evaluate_risk(asset, hurst_val, context_texts):
    """Ask the chat model for a JSON risk evaluation.

    Args:
        asset: asset symbol shown in the prompt.
        hurst_val: Hurst exponent, or None when unavailable (rendered as
            "unknown").
        context_texts: media snippets, one per line in the prompt.

    Returns:
        The raw message content returned by the model (requested in
        JSON-object response format).
    """
    # Joined outside the f-string: a backslash inside an f-string expression
    # is a SyntaxError on Python versions before 3.12.
    media_context = "\n".join(context_texts)
    # The ".3f" format spec raises on None, so format defensively.
    hurst_txt = "unknown" if hurst_val is None else f"{hurst_val:.3f}"
    prompt = f"""
Asset: {asset}
Hurst exponent: {hurst_txt}

Media context:
{media_context}

Give a JSON with keys:
 - asset
 - hurst_value
 - risk_level (low|medium|high)
 - explanation
    """
    resp = client.chat.completions.create(model="gpt-4o-mini",
                                          messages=[{"role":"user","content":prompt}],
                                          response_format={"type":"json_object"})
    return resp.choices[0].message.content

# Main pipeline
if __name__ == "__main__":
    # NOTE(review): the helpers called here use a module-level `client`;
    # make sure the cell that creates it has been run first.
    asset = "BTC"
    # Assume hurst_value computed elsewhere
    hurst_value = 0.57

    news_texts = fetch_news(asset, days=7)
    reddit_texts = fetch_reddit(asset, ["CryptoCurrency", "Bitcoin"], limit=30)
    all_texts = news_texts + reddit_texts

    if not all_texts:
        print(f"No media found for {asset}; skipping risk evaluation.")
    else:
        # Probe the embedding width with a real string: the embeddings
        # endpoint does not accept an empty string as input.
        probe_dim = len(embed_texts([f"{asset} risk"])[0])
        index, _ = build_hnsw(all_texts, dim=probe_dim)
        # Never ask for more neighbours than there are documents.
        context = query_index(index, all_texts, f"{asset} risk", k=min(5, len(all_texts)))

        risk_json = evaluate_risk(asset, hurst_value, context)
        print(risk_json)


In [None]:
# init LLM + embeddings
# No api_key is passed here.
# NOTE(review): presumably the SDK falls back to the OPENAI_API_KEY
# environment variable - confirm it is set before this cell runs.
client = OpenAI()
# Same checkpoint name as EMB_MODEL_NAME, loaded into a second
# SentenceTransformer instance alongside `embed_model`.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# get analysis from InfluxDB
def fetch_hurst(asset: str, interval: str, start: str = "2020-01-01") -> float:
    """Query InfluxDB and compute Hurst exponent for asset.

    Args:
        asset: forwarded to `query_dataframe` as `asset`.
        interval: forwarded to `query_dataframe` as `interval`.
        start: forwarded to `query_dataframe` as `start`.

    Returns:
        Whatever `h.fit(...)` returns for the non-null "return" column, or
        None when the queried frame is empty (despite the `float` annotation).
    """
    # NOTE(review): `query_dataframe` is neither defined nor imported in the
    # visible notebook - presumably it lives in `influx`; confirm.
    df = query_dataframe(asset=asset, interval=interval, start=start)
    if df.empty:
        return None
    # NOTE(review): `hurst` is likewise not in scope here - presumably it is
    # provided by the `Mandelbrot` module; confirm.
    h = hurst()
    # Missing returns are dropped before fitting; the meaning of `power=9`
    # is defined by the estimator, not visible here.
    return h.fit(df["return"].dropna().values, power=9)

In [None]:
def fetch_media(asset: str, days: int = 7):
    """Stub media source: returns three canned headlines mentioning `asset`.

    `days` is accepted for interface compatibility but not used yet.
    """
    # TODO: replace with real APIs (Reddit, NewsAPI, etc.)
    adoption = f"{asset} adoption rises after exchange listing"
    buzz = f"Reddit buzz about {asset} volatility and risks"
    regulation = f"Analysts discuss regulation impact on {asset}"
    return [adoption, buzz, regulation]

In [None]:
def add_to_vectorstore(docs):
    """Embed and store documents in FAISS index.

    Encodes `docs` with the module-level `embedder`, adds the vectors to the
    module-level `index` and appends the raw texts to the module-level
    `documents` list.
    """
    # NOTE(review): `documents` is never initialised in the visible notebook,
    # so `documents.extend` raises NameError unless it is created elsewhere.
    global documents
    embeddings = embedder.encode(docs, convert_to_numpy=True)
    # NOTE(review): `.add(...)` matches the FAISS API named in the docstring,
    # but the faiss import is commented out and the other cells build hnswlib
    # indexes, which are filled through `add_items(...)` - confirm the backend.
    index.add(embeddings)
    documents.extend(docs)

In [None]:
def build_index_hnsw(embeddings):
    """Build an L2 hnswlib index over a 2-D embedding matrix.

    Row i of `embeddings` is stored under label i. The index is sized to
    hold exactly the given rows and returned with ef set to 50.
    """
    n_items, n_dims = embeddings.shape[0], embeddings.shape[1]
    hnsw = hnswlib.Index(space='l2', dim=n_dims)
    hnsw.init_index(max_elements=n_items, ef_construction=200, M=16)
    hnsw.add_items(embeddings, ids=np.arange(n_items))
    hnsw.set_ef(50)
    return hnsw

def retrieve_hnsw(index, docs, query_emb, k=5):
    """Return the (at most) `k` documents nearest to `query_emb`.

    Args:
        index: hnswlib index whose labels are positions in `docs`.
        docs: the documents that were indexed.
        query_emb: query embedding(s); only the first row's neighbours are used.
        k: maximum number of neighbours.

    Returns:
        List of documents, nearest first; [] when `docs` is empty.
    """
    # hnswlib raises when asked for more neighbours than the index holds.
    k = min(k, len(docs))
    if k <= 0:
        return []
    labels, _distances = index.knn_query(query_emb, k=k)
    return [docs[int(i)] for i in labels[0]]

In [None]:
def risk_eval(asset: str, hurst_val: float, context_docs: list):
    """LLM combines hurst + context into risk evaluation.

    Args:
        asset: asset symbol shown in the prompt.
        hurst_val: Hurst exponent, or None when unavailable (rendered as
            "unknown").
        context_docs: media snippets, one per line in the prompt.

    Returns:
        The raw message content returned by the model (requested in
        JSON-object response format).
    """
    # fetch_hurst() can return None; the ".3f" format spec would raise on it.
    hurst_txt = "unknown" if hurst_val is None else f"{hurst_val:.3f}"
    prompt = f"""
    You are a financial risk analyst.
    Asset: {asset}
    Hurst exponent: {hurst_txt} ( >0.5 trending, <0.5 mean-reverting )

    Recent media context:
    {chr(10).join(context_docs)}

    Based on this, give a structured JSON risk evaluation with:
    - risk_level: (low, medium, high)
    - rationale: short text
    """
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        # The prompt asks for JSON; request JSON mode as evaluate_risk() does
        # so the reply is a parseable object rather than free text.
        response_format={"type": "json_object"},
    )
    return resp.choices[0].message.content


In [None]:
# Scratch retrieval run: embed the corpus, index it, then fetch the 5 nearest
# documents for one query.
# NOTE(review): `doc_texts` and `query` are not assigned anywhere in the
# visible notebook; this cell raises NameError unless they are defined first.
embeddings = embedder.encode(doc_texts, convert_to_numpy=True)
index = build_index_hnsw(embeddings)

# encode() gets a one-element list, so the query is embedded as a batch of one.
q_emb = embedder.encode([query], convert_to_numpy=True)
top_docs = retrieve_hnsw(index, doc_texts, q_emb, k=5)

In [None]:
# === pipeline run ===
if __name__ == "__main__":
    asset = "BTC"

    # 1. Market data
    hurst_val = fetch_hurst(asset, "Day")

    if hurst_val is None:
        # fetch_hurst() returns None when no data came back for the asset.
        print(f"No return data for {asset}; skipping risk evaluation.")
    else:
        # 2. Media
        docs = fetch_media(asset)

        # 3. RAG - uses the hnswlib helpers defined in this notebook; the
        # previously called `retrieve_context` does not exist here.
        doc_embeddings = embedder.encode(docs, convert_to_numpy=True)
        index = build_index_hnsw(doc_embeddings)
        query_emb = embedder.encode([f"{asset} risk factors"], convert_to_numpy=True)
        # Never ask for more neighbours than there are documents.
        context = retrieve_hnsw(index, docs, query_emb, k=min(3, len(docs)))

        # 4. LLM evaluation
        result = risk_eval(asset, hurst_val, context)

        print(result)
