In [7]:
import json
import pandas as pd

# Load your JSON file (assuming it's a list of transactions)
with open(r"C:\Users\yashk\Downloads\user-wallet-transactions.json","r") as f:
    raw_data = json.load(f)

print(f"Total transactions: {len(raw_data)}")


Total transactions: 100000


In [11]:
from pandas import json_normalize

# Flatten the list of dictionaries
df = json_normalize(raw_data)

# Preview the dataframe
print(df.head())
print(df.columns.tolist())

                                   userWallet  network protocol  \
0  0x00000000001accfa9cef68cf5371a23025b6d4b6  polygon  aave_v2   
1  0x000000000051d07a4fb3bd10121a343d85818da6  polygon  aave_v2   
2  0x000000000096026fb41fc39f9875d164bd82e2dc  polygon  aave_v2   
3  0x000000000096026fb41fc39f9875d164bd82e2dc  polygon  aave_v2   
4  0x0000000000e189dd664b9ab08a33c4839953852c  polygon  aave_v2   

                                              txHash  \
0  0x695c69acf608fbf5d38e48ca5535e118cc213a89e3d6...   
1  0xe6fc162c86b2928b0ba9b82bda672763665152b9de9d...   
2  0xe2d7eb815c89331a734ed6f204a06c385a1b39040baa...   
3  0x0d63a2eacd82b82f868db825ea7385e6bd8d046ee729...   
4  0x590eabb812c5006a6f4766f44e6e9d3ad0b5b563de69...   

                                               logId   timestamp  blockNumber  \
0  0x695c69acf608fbf5d38e48ca5535e118cc213a89e3d6...  1629178166   1629178166   
1  0xe6fc162c86b2928b0ba9b82bda672763665152b9de9d...  1621525013   1621525013   
2  0xe2d7eb815c89

In [13]:
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')

# Token decimals
decimals = {"USDC": 6, "WMATIC": 18}
df['amount_normalized'] = df.apply(lambda r: float(r['actionData.amount']) / (10 ** decimals.get(r['actionData.assetSymbol'], 18)), axis=1)
df['price_usd'] = df['actionData.assetPriceUSD'].astype(float)
df['amount_usd'] = df['amount_normalized'] * df['price_usd']
df['wallet'] = df['userWallet']
df['asset'] = df['actionData.assetSymbol']
df['date'] = df['timestamp'].dt.date

In [15]:
features = df.groupby('wallet').agg(
    total_tx_count=('wallet', 'count'),
    total_deposit_usd=('amount_usd', 'sum'),
    avg_deposit_usd=('amount_usd', 'mean'),
    asset_diversity=('asset', pd.Series.nunique),
    activity_days=('date', pd.Series.nunique),
    first_tx=('timestamp', 'min'),
    last_tx=('timestamp', 'max')
).reset_index()

In [17]:
features['account_age_days'] = (features['last_tx'] - features['first_tx']).dt.days + 1
features['tx_frequency'] = features['total_tx_count'] / features['account_age_days']
features.fillna(0, inplace=True)


In [21]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Select only numeric features
X = features[[
    'total_tx_count', 'total_deposit_usd', 'avg_deposit_usd',
    'asset_diversity', 'activity_days', 'account_age_days', 'tx_frequency'
]]

# Normalize for clustering
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [23]:
# Apply KMeans to simulate credit score labels
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Simulate credit scores (higher cluster → higher score)
features['simulated_score'] = (pd.Series(clusters).rank(method="dense") / 5.0 * 1000).astype(int)


In [25]:
# Train ML model
model = RandomForestRegressor(random_state=42)
model.fit(X, features['simulated_score'])

In [33]:
features['credit_score'] = model.predict(X).clip(0, 1000).astype(int)

print(features[['wallet', 'credit_score']].head(10))

                                       wallet  credit_score
0  0x00000000001accfa9cef68cf5371a23025b6d4b6           800
1  0x000000000051d07a4fb3bd10121a343d85818da6           800
2  0x000000000096026fb41fc39f9875d164bd82e2dc           800
3  0x0000000000e189dd664b9ab08a33c4839953852c          1000
4  0x0000000002032370b971dabd36d72f3e5a7bf1ee           400
5  0x000000000a38444e0a6e37d3b630d7e855a7cb13           596
6  0x000000003853fcedcd0355fec98ca3192833f00b           584
7  0x000000003ce0cf2c037493b1dc087204bd7f713e           400
8  0x000000007858e6f2668e1e06111cfa24403a5466           800
9  0x00000001a0f57e850c9db68b4a9bc34677437c5c           800


In [31]:
# Save to CSV
features[['wallet', 'credit_score']].to_csv("ml_wallet_credit_scores.csv", index=False)

print("✅ ML-based credit scores saved to 'ml_wallet_credit_scores.csv'")

✅ ML-based credit scores saved to 'ml_wallet_credit_scores.csv'
