In [15]:
import sys, os
from dotenv import load_dotenv
import numpy as np
from datetime import datetime, timezone, timedelta
import pandas as pd

# own functions
sys.path.append(os.path.abspath("crypto_risk_analytics/src/analysis"))
sys.path.append(os.path.abspath("crypto_risk_analytics/src/config"))
import influxDB_utils as influx
import Mandelbrot


from openai import OpenAI
from sentence_transformers import SentenceTransformer
#import faiss
import hnswlib

#news
from newsapi import NewsApiClient
import praw


In [4]:
# Load credentials from the local .env file into the process environment.
load_dotenv()
OPENAI_KEY = os.getenv("OPENAI_API_KEY")
NEWSAPI_KEY = os.getenv("NEWSAPI_KEY")
REDDIT_ID = os.getenv("REDDIT_CLIENT_ID")
REDDIT_SECRET = os.getenv("REDDIT_SECRET")

# Never echo secret values: cell outputs are saved with the notebook and end
# up wherever the notebook is shared. Report only whether each credential was
# found. The earlier version of this cell printed the raw keys, so those keys
# should be treated as compromised and rotated.
for _env_name, _env_value in (
    ("OPENAI_API_KEY", OPENAI_KEY),
    ("NEWSAPI_KEY", NEWSAPI_KEY),
    ("REDDIT_CLIENT_ID", REDDIT_ID),
    ("REDDIT_SECRET", REDDIT_SECRET),
):
    print(f"{_env_name}: {'set' if _env_value else 'MISSING'}")

#client = OpenAI(api_key=OPENAI_KEY)

[output redacted: this cell printed a live OpenAI API key - revoke and rotate it]
[output redacted: this cell printed a live NewsAPI key - revoke and rotate it]


In [None]:
# Embedding helper
def embed_texts(texts, model="text-embedding-3-small"):
    """Embed a batch of texts with the OpenAI embeddings endpoint.

    Args:
        texts: A sequence of strings (a single string is treated as a
            one-item batch).
        model: Name of the embedding model passed to the API.

    Returns:
        A list with one ``np.float32`` vector per entry of ``resp.data``.
        An empty batch returns ``[]`` without contacting the API.
    """
    # A bare string must stay one input, not be split into characters.
    batch = [texts] if isinstance(texts, str) else list(texts)
    if not batch:
        # Nothing to embed: skip the request instead of sending an empty batch.
        return []
    # NOTE: relies on the module-level `client`, which this notebook only
    # creates in a later cell - run that cell before calling this helper.
    resp = client.embeddings.create(model=model, input=batch)
    return [np.array(e.embedding, dtype=np.float32) for e in resp.data]


In [13]:
# Scratch check: display the current timestamp. Called without a tz argument,
# so the value is naive (no timezone attached).
datetime.now()

datetime.datetime(2025, 9, 7, 21, 47, 44, 446977)

In [30]:
#get news from newsapi
asset = "BTC"

news = NewsApiClient(api_key=NEWSAPI_KEY)
#print(news)

# Rolling look-back window instead of a fixed calendar date: a hard-coded
# date drifts further into the past every time the notebook is re-run.
lookback_days = 7
now_utc = datetime.now(timezone.utc)
from_date = (now_utc - timedelta(days=lookback_days)).strftime("%Y-%m-%d")

articles = news.get_everything(
    q=asset,
    from_param=from_date,
    to=now_utc.strftime("%Y-%m-%dT%H:%M:%S"),
    language="en",
    sort_by="relevancy",
    page_size=25
).get("articles", [])


# "title. body" strings. `or ""` (rather than a .get default) also covers
# keys that are present but null, which would otherwise break the "+".
articel_content = [(a.get("title") or "") + ". " + (a.get("description") or a.get("content") or "")
            for a in articles]

#clean up
cleaned = []
for a in articles:
    # Join whichever of the three text fields are present and non-empty.
    text = " ".join(filter(None, [a.get("title"), a.get("description"), a.get("content")]))
    cleaned.append({
            "asset": asset,
            "date": a.get("publishedAt"),
            "text": text,
            # `source` may be present but null; fall back to an empty dict.
            "source": (a.get("source") or {}).get("name")
        })
    

In [31]:
# Peek at the first cleaned record; an empty result set would otherwise
# raise IndexError here.
print(cleaned[0] if cleaned else "No articles returned for this query.")

{'asset': 'BTC', 'date': '2025-09-05T10:46:04Z', 'text': '$3.38 Billion in Bitcoin Options Expiry Raises Concerns of September Volatility With $3.38 billion of Bitcoin BTC $110 973 24h volatility: 0.5% Market cap: $2.21 T Vol. 24h: $58.02 B options expiry on Sept. 5, BTC price is showing some... With $3.38 billion of Bitcoin options expiry on Sept. 5, BTC price is showing some strength with 1.77% upside. Now it has increased to $112,500 levels. Analysts are expecting September to be a month … [+2102 chars]', 'source': 'Coinspeaker'}


In [None]:

# Embedding helper
# (redefines the helper of the same name from an earlier cell)
def embed_texts(texts, model="text-embedding-3-small"):
    """Embed a batch of texts with the OpenAI embeddings endpoint.

    Args:
        texts: A sequence of strings (a single string is treated as a
            one-item batch).
        model: Name of the embedding model passed to the API.

    Returns:
        A list with one ``np.float32`` vector per entry of ``resp.data``.
        An empty batch returns ``[]`` without contacting the API.
    """
    # A bare string must stay one input, not be split into characters.
    batch = [texts] if isinstance(texts, str) else list(texts)
    if not batch:
        # Nothing to embed: skip the request instead of sending an empty batch.
        return []
    # NOTE: relies on the module-level `client`, which this notebook only
    # creates in a later cell - run that cell before calling this helper.
    resp = client.embeddings.create(model=model, input=batch)
    return [np.array(e.embedding, dtype=np.float32) for e in resp.data]

# Build hnswlib index
def build_hnsw(texts, dim=1536):
    """Embed ``texts`` and store the vectors in an hnswlib L2 index.

    Args:
        texts: Non-empty sequence of strings; item ``i`` is stored under
            label ``i``.
        dim: Expected embedding dimensionality. Must equal the length of the
            vectors returned by ``embed_texts``.

    Returns:
        Tuple ``(idx, embs)``: the populated ``hnswlib.Index`` and the list of
        embedding vectors in input order.

    Raises:
        ValueError: If ``texts`` is empty, or if the embeddings do not have
            ``dim`` components.
    """
    if len(texts) == 0:
        # Fail early with a clear message instead of an opaque downstream error.
        raise ValueError("build_hnsw needs at least one text to index")

    embs = embed_texts(texts)
    actual_dim = len(embs[0])
    if actual_dim != dim:
        raise ValueError(
            f"embedding dimension is {actual_dim}, but the index was requested with dim={dim}"
        )

    idx = hnswlib.Index(space="l2", dim=dim)
    # The index is sized to hold exactly the vectors added below.
    idx.init_index(max_elements=len(embs), ef_construction=200, M=16)
    idx.add_items(np.stack(embs), np.arange(len(embs)))
    idx.set_ef(50)
    return idx, embs

# Query index
def query_index(idx, texts, query, k=3):
    """Return the texts whose embeddings are nearest to ``query``.

    Args:
        idx: Index exposing ``knn_query(vector, k=...)``, as built by
            ``build_hnsw``.
        texts: The texts that were indexed; labels returned by the index are
            used as positions into this sequence.
        query: Query string, embedded with ``embed_texts``.
        k: Maximum number of neighbours to return.

    Returns:
        Up to ``k`` entries of ``texts`` in the order the index returns them;
        ``[]`` when there is nothing to search.
    """
    # Never ask for more neighbours than there are texts: hnswlib errors out
    # when it cannot fill all k result slots.
    k = min(k, len(texts))
    if k <= 0:
        return []
    q_emb = embed_texts([query])[0]
    labels, _ = idx.knn_query(q_emb, k=k)
    # Labels come back as numpy integers; cast so any sequence type accepts them.
    return [texts[int(i)] for i in labels[0]]

# Fetch functions
def fetch_news(keyword, days=7, limit=50):
    """Fetch recent English-language articles for ``keyword`` from NewsAPI.

    Args:
        keyword: Search query passed as ``q``.
        days: Size of the look-back window, counted back from now (UTC).
        limit: Passed as ``page_size``.

    Returns:
        One ``"<title>. <description or content>"`` string per article.
    """
    news = NewsApiClient(api_key=NEWSAPI_KEY)
    # One timezone-aware "now" for both ends of the window (datetime.utcnow()
    # is deprecated as of Python 3.12). Second-resolution strings are used
    # because isoformat() appends microseconds, which does not match the
    # YYYY-MM-DDTHH:MM:SS form the other NewsAPI cell in this notebook sends.
    now_utc = datetime.now(timezone.utc)
    stamp_format = "%Y-%m-%dT%H:%M:%S"
    articles = news.get_everything(q=keyword,
                                   from_param=(now_utc - timedelta(days=days)).strftime(stamp_format),
                                   to=now_utc.strftime(stamp_format),
                                   language='en', sort_by='relevancy',
                                   page_size=limit).get("articles", [])
    # `or ""` also covers a title that is present but null, which a .get
    # default would not and which would break the string concatenation.
    return [(a.get("title") or "") + ". " + (a.get("description") or a.get("content") or "")
            for a in articles]

def fetch_reddit(keyword, subreddits=("CryptoCurrency",), limit=50):
    """Search subreddits for ``keyword`` and return the newest matching posts.

    Args:
        keyword: Search query.
        subreddits: Subreddit names to search; a single name may be given as
            a plain string. (Immutable default avoids a shared mutable list.)
        limit: Maximum number of posts requested per subreddit.

    Returns:
        One ``"<title>. <selftext>"`` string per post, subreddit by subreddit.
    """
    if isinstance(subreddits, str):
        # A bare string would otherwise be iterated character by character.
        subreddits = [subreddits]
    if not subreddits:
        # Nothing to search: no need to build a Reddit client at all.
        return []

    reddit = praw.Reddit(client_id=REDDIT_ID, client_secret=REDDIT_SECRET,
                         user_agent="crypto-risk-app/0.1")
    posts = []
    for sub in subreddits:
        for post in reddit.subreddit(sub).search(keyword, sort="new", limit=limit):
            # Link posts have no body text; fall back to an empty string.
            posts.append(post.title + ". " + (post.selftext or ""))
    return posts

# Risk evaluation
def evaluate_risk(asset, hurst_val, context_texts):
    """Ask the chat model for a JSON risk assessment of ``asset``.

    Args:
        asset: Asset symbol inserted into the prompt.
        hurst_val: Hurst exponent, formatted with three decimals; ``None`` is
            rendered as ``unavailable``.
        context_texts: Iterable of strings inserted one per line under
            "Media context".

    Returns:
        The content of the first choice of the chat completion. The request
        uses ``response_format={"type": "json_object"}``.
    """
    # Build the joined context outside the f-string: a backslash inside an
    # f-string replacement field is a SyntaxError before Python 3.12.
    media_context = "\n".join(context_texts)
    # A None value cannot be formatted with ":.3f".
    hurst_text = "unavailable" if hurst_val is None else f"{hurst_val:.3f}"
    prompt = f"""
Asset: {asset}
Hurst exponent: {hurst_text}

Media context:
{media_context}

Give a JSON with keys:
 - asset
 - hurst_value
 - risk_level (low|medium|high)
 - explanation
    """
    # NOTE: relies on the module-level `client`, which this notebook only
    # creates in a later cell.
    resp = client.chat.completions.create(model="gpt-4o-mini",
                                          messages=[{"role":"user","content":prompt}],
                                          response_format={"type":"json_object"})
    return resp.choices[0].message.content

# Main pipeline
# NOTE: needs the module-level `client`, which this notebook only creates in
# a later cell - run that cell first.
if __name__ == "__main__":
    asset = "BTC"
    # Assume hurst_value computed elsewhere
    hurst_value = 0.57

    news_texts = fetch_news(asset, days=7)
    reddit_texts = fetch_reddit(asset, ["CryptoCurrency", "Bitcoin"], limit=30)
    all_texts = news_texts + reddit_texts

    if not all_texts:
        # An index cannot be built from zero documents.
        print(f"No news or Reddit posts found for {asset}; skipping risk evaluation.")
    else:
        risk_query = f"{asset} risk"
        # Auto-detect the embedding dimension from a real, non-empty text.
        # The earlier probe embedded "" - presumably rejected by the
        # embeddings endpoint as invalid input; confirm against the API docs.
        embedding_dim = len(embed_texts([risk_query])[0])
        index, _ = build_hnsw(all_texts, dim=embedding_dim)
        # Never request more neighbours than there are indexed texts.
        context = query_index(index, all_texts, risk_query, k=min(5, len(all_texts)))

        risk_json = evaluate_risk(asset, hurst_value, context)
        print(risk_json)


In [None]:
# init LLM + embeddings
# NOTE(review): `client` is used by embed_texts / evaluate_risk and by the
# pipeline cell that appear EARLIER in the notebook, so a top-to-bottom run
# reaches those calls before this assignment - consider moving this cell up.
# OpenAI() is constructed without arguments; presumably it picks the key up
# from the environment populated by load_dotenv() - confirm.
client = OpenAI()
# Sentence-transformers model used by the hnswlib cells below to embed
# documents and queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# get analysis from InfluxDB
def fetch_hurst(asset: str, interval: str, start: str = "2020-01-01") -> "float | None":
    """Query market data for ``asset`` and fit a Hurst exponent on its returns.

    Args:
        asset: Asset identifier forwarded to ``query_dataframe``.
        interval: Interval forwarded to ``query_dataframe``.
        start: Start date forwarded to ``query_dataframe``.

    Returns:
        The result of ``h.fit(...)`` on the non-null values of the ``"return"``
        column, or ``None`` when the queried frame is empty.

    NOTE(review): ``query_dataframe`` and ``hurst`` are called unqualified,
    but the visible imports only bring in ``influxDB_utils as influx`` and
    ``Mandelbrot``. Presumably these should be ``influx.query_dataframe`` and
    ``Mandelbrot.hurst`` - confirm against those modules.
    """
    df = query_dataframe(asset=asset, interval=interval, start=start)
    # No rows for this asset/interval: signal "no estimate" to the caller.
    if df.empty:
        return None
    h = hurst()
    # Missing returns are dropped before fitting. The meaning of power=9 is
    # defined by hurst.fit - TODO confirm.
    return h.fit(df["return"].dropna().values, power=9)

In [None]:
def fetch_media(asset: str, days: int = 7):
    """Return canned placeholder headlines that mention ``asset``.

    Stand-in until real news/Reddit sources are wired in; ``days`` is accepted
    but not used yet.
    """
    # TODO: replace with real APIs (Reddit, NewsAPI, etc.)
    headline_templates = (
        "{} adoption rises after exchange listing",
        "Reddit buzz about {} volatility and risks",
        "Analysts discuss regulation impact on {}",
    )
    return [template.format(asset) for template in headline_templates]

In [None]:
def add_to_vectorstore(docs):
    """Embed ``docs`` and append them to the module-level vector store.

    Encodes ``docs`` with the module-level ``embedder``, passes the resulting
    array to ``index.add(...)`` and extends the module-level ``documents``
    list with ``docs``.

    NOTE(review): written against a FAISS-style index, but the ``faiss``
    import at the top of the notebook is commented out and the only ``index``
    created in the visible cells comes from ``build_index_hnsw`` (hnswlib),
    which is populated there via ``add_items`` - verify that ``index.add``
    exists on the object in use. ``documents`` is not initialised in any
    visible cell either.
    """
    global documents
    embeddings = embedder.encode(docs, convert_to_numpy=True)
    index.add(embeddings)
    documents.extend(docs)

In [None]:
def build_index_hnsw(embeddings):
    """Create an hnswlib L2 index holding every row of ``embeddings``.

    Row ``i`` of the 2-D array is stored under id ``i``. The index capacity
    is set to exactly the number of rows.
    """
    n_vectors = embeddings.shape[0]
    hnsw_index = hnswlib.Index(space='l2', dim=embeddings.shape[1])
    hnsw_index.init_index(max_elements=n_vectors, ef_construction=200, M=16)
    hnsw_index.add_items(embeddings, ids=np.arange(n_vectors))
    # Query-time ef, fixed at 50 for every index built here.
    hnsw_index.set_ef(50)
    return hnsw_index

def retrieve_hnsw(index, docs, query_emb, k=5):
    """Return the documents nearest to ``query_emb`` according to ``index``.

    Args:
        index: Object exposing ``knn_query(query_emb, k=...)``, as built by
            ``build_index_hnsw``.
        docs: The indexed documents; labels returned by the index are used as
            positions into this sequence.
        query_emb: Query embedding(s); only the first result row is used.
        k: Maximum number of neighbours to return.

    Returns:
        Up to ``k`` entries of ``docs`` in the order the index returns them;
        ``[]`` when there is nothing to search.
    """
    # Never ask for more neighbours than there are documents: hnswlib errors
    # out when it cannot fill all k result slots.
    k = min(k, len(docs))
    if k <= 0:
        return []
    labels, _distances = index.knn_query(query_emb, k=k)
    # Labels come back as numpy integers; cast so any sequence type accepts them.
    return [docs[int(i)] for i in labels[0]]

In [None]:
def risk_eval(asset: str, hurst_val: float, context_docs: list):
    """Ask the chat model to combine the Hurst exponent and media context.

    Args:
        asset: Asset symbol inserted into the prompt.
        hurst_val: Hurst exponent, formatted with three decimals; ``None``
            (which ``fetch_hurst`` returns for an empty query) is rendered as
            ``unavailable``.
        context_docs: Strings inserted one per line under
            "Recent media context".

    Returns:
        The content of the first choice of the chat completion. The request
        uses JSON mode, matching ``evaluate_risk`` elsewhere in this notebook.
    """
    # A None value cannot be formatted with ":.3f".
    hurst_text = "unavailable" if hurst_val is None else f"{hurst_val:.3f}"
    # Joined outside the f-string so no backslash is needed inside it.
    media_context = "\n".join(context_docs)
    prompt = f"""
    You are a financial risk analyst.
    Asset: {asset}
    Hurst exponent: {hurst_text} ( >0.5 trending, <0.5 mean-reverting )

    Recent media context:
    {media_context}

    Based on this, give a structured JSON risk evaluation with:
    - risk_level: (low, medium, high)
    - rationale: short text
    """
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        # The prompt asks for JSON; JSON mode makes the reply parseable.
        response_format={"type": "json_object"},
    )
    return resp.choices[0].message.content


In [None]:
# Ad-hoc retrieval demo: embed the documents, index them with hnswlib, then
# look up the 5 nearest documents for the query.
# NOTE(review): `doc_texts` and `query` are not assigned in any cell visible
# in this notebook - define them before running this cell.
embeddings = embedder.encode(doc_texts, convert_to_numpy=True)
index = build_index_hnsw(embeddings)

# The query is wrapped in a one-element list before encoding.
q_emb = embedder.encode([query], convert_to_numpy=True)
top_docs = retrieve_hnsw(index, doc_texts, q_emb, k=5)

In [None]:
# === pipeline run ===
if __name__ == "__main__":
    asset = "BTC"

    # 1. Market data
    hurst_val = fetch_hurst(asset, "Day")

    if hurst_val is None:
        # fetch_hurst returns None when the query yields no rows; risk_eval
        # formats the value with ":.3f", which cannot handle None.
        print(f"No market data available for {asset}; skipping risk evaluation.")
    else:
        # 2. Media
        docs = fetch_media(asset)

        # 3. RAG
        # Uses the hnswlib helpers defined in this notebook. The earlier
        # version called retrieve_context(), which no visible cell defines,
        # and add_to_vectorstore(), which depends on a `documents` list that
        # is never initialised.
        doc_embeddings = embedder.encode(docs, convert_to_numpy=True)
        media_index = build_index_hnsw(doc_embeddings)
        query_embedding = embedder.encode([f"{asset} risk factors"], convert_to_numpy=True)
        # Never request more neighbours than there are documents.
        context = retrieve_hnsw(media_index, docs, query_embedding, k=min(3, len(docs)))

        # 4. LLM evaluation
        result = risk_eval(asset, hurst_val, context)

        print(result)
