In [1]:
import pandas as pd

csv_path = "../WebScraping/Transactions.csv"

df = pd.read_csv(csv_path)
print(df.head())


            Txn Hash           Type   Block               From  \
0  0x7bf7d...e5f4b87  Coin_Transfer  485665  0x4E5FF...9CC3C31   
1  0x4774f...a4fe408  Coin_Transfer  476179  0x400D4...d27cE1C   
2  0xd6147...7613120  Coin_Transfer  469467  0x400D4...d27cE1C   
3  0x08830...cbffce7  Coin_Transfer  463260  0x400D4...d27cE1C   
4  0x75689...541cc33  Coin_Transfer  451243  0x400D4...d27cE1C   

                  To                     Timestamp   Txn Fee  Value(CINT)  \
0  0x1f434...88444e0  2d ago\nJul, 15, 2025, 03:03  0.000008          0.1   
1  0xA48E1...a0D9630  2d ago\nJul, 14, 2025, 11:11  0.000000         20.0   
2  0x67694...b304ccf  3d ago\nJul, 13, 2025, 23:57  0.000000         20.0   
3  0x6bf85...1Aec29a  3d ago\nJul, 13, 2025, 13:34  0.000000         20.0   
4  0x75619...079EAB9  4d ago\nJul, 12, 2025, 17:27  0.000000         20.0   

    Status  
0  Success  
1  Success  
2  Success  
3  Success  
4  Success  


In [2]:
print(df.columns.tolist())


['Txn Hash', 'Type', 'Block', 'From', 'To', 'Timestamp', 'Txn Fee', 'Value(CINT)', 'Status']


In [3]:
import pandas as pd
import numpy as np

# 1. Load CSV
csv_path = "../WebScraping/Transactions.csv"
df = pd.read_csv(csv_path)

# 2. Drop duplicate Txn Hashes
df = df.drop_duplicates(subset=["Txn Hash"])

# 3. Standardize & rename columns
df = df.rename(columns=lambda c: (
    c.strip()
     .lower()
     .replace(" ", "_")
     .replace("(", "")
     .replace(")", "")
     .replace("-", "_")
))

df = df.rename(columns={
    "valuecint": "value_cint",    # rename the raw
    "txn_fee":   "txn_fee_cint",
})
# now drop any leftover
if "valuecint" in df.columns:
    df = df.drop(columns=["valuecint"])
# e.g. "Txn Hash" → "txn_hash", "Txn Fee (CINT)" → "txn_fee_cint"

# 4. Parse and normalize timestamps
df["timestamp_clean"] = (
    df["timestamp"]
      .astype(str)
      .str.split("\n")
      .str[-1]                # take the 'Jul, 15, 2025, 03:03' portion
)

df["timestamp"] = pd.to_datetime(df["timestamp_clean"], format="%b, %d, %Y, %H:%M", errors="coerce")
df = df.drop(columns="timestamp_clean")

# 5. Normalize address fields
for col in ["from", "to", "contract_address"]:
    if col in df:
        df[col] = df[col].str.lower().str.strip().replace("", np.nan)

# 6. Convert all on-chain numeric columns to floats/ints
df["block"]        = df["block"].astype(int)
df["value_cint"]   = df["value_cint"].astype(float)
df["txn_fee_cint"] = df["txn_fee_cint"].astype(float)


# 7. Derive domain-specific fields

# 7b. Classify txn direction for a given “our” address set
#    (you can supply your own key-addresses list)
our_addrs = {"0xabc…", "0xdef…"}  
def direction(row):
    if row["from"] in our_addrs and row["to"] not in our_addrs:
        return "outgoing"
    elif row["to"] in our_addrs and row["from"] not in our_addrs:
        return "incoming"
    else:
        return "other"
df["direction"] = df.apply(direction, axis=1)

# 8. Drop or archive any truly irrelevant fields
#    e.g., if ‘status’ is always ‘Success’, drop it; if you’ll never need raw input_data, drop that.
if "status" in df and df["status"].nunique() == 1:
    df = df.drop(columns=["status"])
for col in ["input_data", "logs"] :
    if col in df:
        df = df.drop(columns=[col])

# 9. Sort & re-index
df = df.sort_values("timestamp").reset_index(drop=True)

# 10. Final integrity checks
assert df["txn_hash"].is_unique, "Txn Hash is not unique!"
assert df[["block", "timestamp"]].notnull().all().all(), "Missing block or timestamp!"

df.head(10)


Unnamed: 0,txn_hash,type,block,from,to,timestamp,txn_fee_cint,value_cint,status,direction
0,0x10927...b039478,Contract_Creation,100047,0xd08b8...17bd86c,0x5c2b1...8241879,2025-06-18 06:01:00,8e-06,0.0,Success,other
1,0x09cf8...b2bace6,Contract_Creation,100055,0xd08b8...17bd86c,0xda990...92b972f,2025-06-18 06:02:00,8e-06,0.0,Success,other
2,0xdae48...30a14d6,Contract_Creation,100055,0xd08b8...17bd86c,0xb81ea...61b7583,2025-06-18 06:02:00,8e-06,0.0,Success,other
3,0x3e78c...fbce879,Contract_Creation,100055,0xd08b8...17bd86c,0x1aa27...35302d1,2025-06-18 06:02:00,8e-06,0.0,Success,other
4,0xbcbdf...537d2e2,Contract_Creation,100055,0xd08b8...17bd86c,0x39351...87193a2,2025-06-18 06:02:00,8e-06,0.0,Success,other
5,0xe670c...c90f019,Contract_Creation,100055,0xd08b8...17bd86c,0x1a6e3...5dccdf1,2025-06-18 06:02:00,8e-06,0.0,Success,other
6,0x4caa0...01d215f,Contract_Creation,100054,0xd08b8...17bd86c,0xc352f...8e61994,2025-06-18 06:02:00,8e-06,0.0,Success,other
7,0xae111...046aef8,Contract_Creation,100054,0xd08b8...17bd86c,0x4c040...1c62d85,2025-06-18 06:02:00,8e-06,0.0,Success,other
8,0x933d8...3fe2293,Contract_Creation,100055,0xd08b8...17bd86c,0x7d769...12b7e26,2025-06-18 06:02:00,8e-06,0.0,Success,other
9,0x12fb5...2b558a7,Contract_Creation,100054,0xd08b8...17bd86c,0x46d47...83c90c6,2025-06-18 06:02:00,8e-06,0.0,Success,other


In [10]:
!pip install --upgrade huggingface-hub==0.15.1 numpy scikit-learn

Collecting huggingface-hub==0.15.1
  Downloading huggingface_hub-0.15.1-py3-none-any.whl.metadata (8.0 kB)
Collecting numpy
  Downloading numpy-2.3.1-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.9 kB ? eta -:--:--
     ------------------- ------------------ 30.7/60.9 kB 660.6 kB/s eta 0:00:01
     ------------------------------- ------ 51.2/60.9 kB 890.4 kB/s eta 0:00:01
     -------------------------------------- 60.9/60.9 kB 405.2 kB/s eta 0:00:00
Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp311-cp311-win_amd64.whl.metadata (14 kB)
Collecting numpy
  Downloading numpy-2.2.6-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ---------------------------------------- 60.8/60.8 kB 1.6 MB/s eta 0:00:00
Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
   ---------------------------------------- 0.0/236.8 kB ? eta -:--:--
   ----------------------

  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.0 requires FuzzyTM>=0.4.0, which is not installed.
accelerate 1.1.1 requires huggingface-hub>=0.21.0, but you have huggingface-hub 0.15.1 which is incompatible.
astropy 5.3.4 requires numpy<2,>=1.21, but you have numpy 2.2.6 which is incompatible.
contourpy 1.2.0 requires numpy<2.0,>=1.20, but you have numpy 2.2.6 which is incompatible.
datasets 3.1.0 requires huggingface-hub>=0.23.0, but you have huggingface-hub 0.15.1 which is incompatible.
matplotlib 3.8.0 requires numpy<2,>=1.21, but you have numpy 2.2.6 which is incompatible.
numba 0.59.0 requires numpy<1.27,>=1.22, but you have numpy 2.2.6 which is incompatible.
peft 0.13.2 requires huggingface-hub>=0.17.0, but you have huggingface-hub 0.

In [None]:
import os
import pandas as pd
import numpy as np
from huggingface_hub import InferenceClient
from sklearn.metrics.pairwise import cosine_similarity

# ─────────────────────────────────────────────────────────────────────────────
# 1) Set your HF token (you should already have this in your MCP-VSCode env):
#    export HUGGINGFACEHUB_API_TOKEN="hf_xxx…"
# ─────────────────────────────────────────────────────────────────────────────
HF_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
client   = InferenceClient(token=HF_TOKEN)

# ─────────────────────────────────────────────────────────────────────────────
# 2) Load your preprocessed DataFrame (must already have run your cleaning code):
# ─────────────────────────────────────────────────────────────────────────────
# e.g. if you saved it:
# df = pd.read_csv("../WebScraping/Transactions_cleaned.csv", parse_dates=["timestamp"])
# Here, assume df is already in memory:

# ─────────────────────────────────────────────────────────────────────────────
# 3) Turn each row into a human‐readable “doc” string
# ─────────────────────────────────────────────────────────────────────────────
def row_to_doc(r):
    return (
        f"Txn {r.txn_hash}: {r.type} of {r.value_cint} CINT on "
        f"{r.timestamp.strftime('%Y-%m-%d %H:%M')} "
        f"from {r.from_address} to {r.to_address}, fee={r.txn_fee_cint}"
    )

docs = df.apply(row_to_doc, axis=1).tolist()
ids  = df["txn_hash"].tolist()

# ─────────────────────────────────────────────────────────────────────────────
# 4) Embed all docs with HF’s embeddings endpoint
#    (this calls the all-MiniLM core model hosted via your MCP servers)
# ─────────────────────────────────────────────────────────────────────────────
# Note: this may take a minute for large dfs — it’s batching internally.
embeddings = client.feature_extraction(
    model="sentence-transformers/all-MiniLM-L6-v2",
    inputs=docs
)
# embeddings is a List[List[float]] of shape (len(docs), D)
emb = np.array(embeddings)

# ─────────────────────────────────────────────────────────────────────────────
# 5) At query time: embed the user’s question, retrieve top-k, then generate
# ─────────────────────────────────────────────────────────────────────────────
def answer_query(query: str, top_k: int = 5):
    # 5a) Embed the query
    q_emb = np.array(client.feature_extraction(
        model="sentence-transformers/all-MiniLM-L6-v2",
        inputs=[query]
    )[0]).reshape(1, -1)

    # 5b) Find top_k similar docs by cosine similarity
    sims      = cosine_similarity(q_emb, emb)[0]
    best_idx  = sims.argsort()[-top_k:][::-1]
    context   = "\n".join(docs[i] for i in best_idx)

    # 5c) Assemble the prompt
    prompt = (
        "You are a blockchain analytics assistant. "
        "Here are some transactions that may be relevant:\n"
        f"{context}\n\n"
        f"User question: {query}\n"
        "Answer concisely:"
    )

    # 5d) Call the HF text-generation endpoint
    output = client.text_generation(
        model="HuggingFaceH4/zephyr-7b-beta",
        inputs=prompt,
        max_new_tokens=128,
        do_sample=False
    )
    return output.generated_text

# ─────────────────────────────────────────────────────────────────────────────
# 6) Try it out
# ─────────────────────────────────────────────────────────────────────────────
print(answer_query("Show me the three largest outgoing transactions in the last week."))


Error importing huggingface_hub.inference._client: No module named 'huggingface_hub.inference'
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\datta\AppData\Roaming\Python\Python311\site-packages\IPython\core\interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\datta\AppData\Local\Temp\ipykernel_41452\3058143426.py", line 10, in <module>
    from huggingface_hub import InferenceClient
  File "c:\Users\datta\anaconda3\Lib\site-packages\huggingface_hub\__init__.py", line 998, in __getattr__
  File "c:\Users\datta\anaconda3\Lib\importlib\__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ModuleNotFoundError: No module named 'huggingface_hub.inference'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\datta\AppData\Roaming\Python\Python311\site-packages\IPython\core\interactiveshell.py", line 2120, in showtraceback
    stb = self.