# Redis Click Dataset → HistGradientBoosting Pipeline
End-to-end workflow: generate synthetic click data (via a FastAPI endpoint), populate Redis, load the rows into a DataFrame, train a **HistGradientBoostingRegressor** to predict `serp_clicks`, evaluate it, and inspect feature importances.

## 1. Configuration
Adjust the parameters below. Set `GENERATE = True` to have the FastAPI service create the data and (optionally) insert it into Redis.
If you have already generated data, set it to `False` and simply reload the existing rows from Redis.

In [None]:
# --- Notebook parameters ---------------------------------------------------

# Synthetic data generation (sent to the generator endpoint)
ROWS = 5000      # how many synthetic rows to request
DAYS = 30        # history window
SEED = 1337      # RNG seed, for reproducible output
LEGACY = 0       # schema flag: 0 = verbose schema, 1 = legacy short schema
TO_REDIS = 1     # 1 = also push rows into Redis hashes (click:*)
GENERATE = True  # set to False to skip regeneration

# Service endpoints
FASTAPI_BASE = 'http://localhost:8000'  # FastAPI service URL
REDIS_HOST = 'redis'                    # service hostname inside the docker network
REDIS_PORT = 6379

# Loading / modelling
TARGET = 'serp_clicks'  # prediction target (for the verbose schema)
KEY_PREFIX = 'click:'   # Redis key prefix used by the generator endpoint
MAX_KEYS = None         # None = load everything; an int caps the rows for faster experiments

## 2. Install/Import Dependencies
Install lightweight dependencies that might be missing (`redis`, `requests`, `joblib`). The SciPy notebook image already has scikit-learn & pandas.

In [None]:
%pip -q install redis requests joblib > /dev/null
import requests, json, math, time, os
import redis as redis_lib
import numpy as np, pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
print('Imports ready.')

## 3. Trigger Dataset Generation via FastAPI (Optional)
Calls the `/generate_clicks` endpoint that reuses the shared generator logic and (optionally) inserts into Redis.

In [None]:
# Optionally ask the FastAPI service to (re)generate the synthetic dataset.
if not GENERATE:
    print('Skipping generation (GENERATE=False).')
else:
    url = f'{FASTAPI_BASE}/generate_clicks'
    # Generator settings travel as query-string parameters.
    params = dict(rows=ROWS, days=DAYS, seed=SEED, legacy=LEGACY, to_redis=TO_REDIS)
    print('POST', url, params)
    # Generous timeout: the request is given up to 300 seconds to complete.
    resp = requests.post(url, params=params, timeout=300)
    if resp.status_code != 200:
        raise RuntimeError(f'Generation failed: {resp.status_code} {resp.text}')
    gen_info = resp.json()
    print(
        'Generated rows:', gen_info.get('rows'),
        'CSV:', gen_info.get('path'),
        'Inserted:', gen_info.get('redis_inserted'),
    )

## 4. Load Rows from Redis
Scans Redis for keys with the chosen prefix and builds a DataFrame. For very large sets you can cap with `MAX_KEYS`.

In [None]:
# Load every hash whose key matches KEY_PREFIX* into a DataFrame.
#
# - SCAN is allowed to return the same key more than once during a full
#   iteration, so keys are de-duplicated to avoid duplicated training rows.
# - The HGETALL calls for each SCAN page are sent through one non-transactional
#   pipeline, i.e. one network round trip per page instead of one per key.
# - MAX_KEYS (None/0 = no cap) limits the number of non-empty hashes kept.
r = redis_lib.Redis(host=REDIS_HOST, port=REDIS_PORT, decode_responses=True)
pattern = f'{KEY_PREFIX}*'
rows = []
seen_keys = set()
cursor = 0
while True:
    cursor, keys = r.scan(cursor=cursor, match=pattern, count=1000)
    # dict.fromkeys drops duplicates inside the page while keeping order.
    fresh = [k for k in dict.fromkeys(keys) if k not in seen_keys]
    seen_keys.update(fresh)
    if fresh:  # a page may legitimately be empty while the cursor is non-zero
        pipe = r.pipeline(transaction=False)
        for k in fresh:
            pipe.hgetall(k)
        # Skip empty results (e.g. a key deleted between SCAN and HGETALL).
        rows.extend(h for h in pipe.execute() if h)
    if MAX_KEYS and len(rows) >= MAX_KEYS:
        rows = rows[:MAX_KEYS]
        break
    if cursor == 0:  # a returned cursor of 0 marks the end of the iteration
        break
print(f'Fetched {len(rows)} hash rows from Redis.')
if not rows:
    raise ValueError('No click data found in Redis. Ensure generation ran and TO_REDIS=1.')
df = pd.DataFrame(rows)
print('Columns:', df.columns.tolist()[:12], '... total', len(df.columns))
df.head()

## 5. Prepare Features / Target
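We coerce numeric-looking columns, drop obvious high-cardinality identifiers, and select a target.
If the legacy schema was used, adjust `TARGET` accordingly (e.g. `clicks`); the cell below switches to `clicks` automatically when `LEGACY = 1` and `TARGET` is still `serp_clicks`.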

In [None]:
# Build the feature matrix X and target vector y from the raw Redis rows.

# Convert numeric candidates. Columns that cannot be parsed as numbers are left
# untouched. `errors='ignore'` is deprecated in pandas, so the failure is
# caught explicitly instead (same resulting behaviour).
for c in df.columns:
    try:
        df[c] = pd.to_numeric(df[c])
    except (ValueError, TypeError):
        pass  # non-numeric column: keep the original values
# Determine target name for legacy schema if needed
if LEGACY == 1 and TARGET == 'serp_clicks':
    TARGET = 'clicks'
if TARGET not in df.columns:
    raise KeyError(f'Target {TARGET} not in DataFrame columns')
# Identifier / free-text columns that should never be used as features.
id_like = {'user_id','session_id','landing_page_url','page_title','search_query','query','page'}
exclude = id_like.union({TARGET})
num_df = df.drop(columns=[c for c in df.columns if c in exclude], errors='ignore')
# Keep only numeric columns; missing values are replaced with 0.
X = num_df.select_dtypes(include=['number']).fillna(0)
y = df[TARGET].astype(float)
print('Feature cols:', len(X.columns))
X.head()

## 6. Train / Evaluate HistGradientBoostingRegressor

In [None]:
# Hold out 20% of the rows for evaluation; fixed seeds keep runs reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = HistGradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated and later removed from scikit-learn,
# while this form works on every version.
rmse = float(np.sqrt(mean_squared_error(y_test, pred)))
r2 = r2_score(y_test, pred)
print(f'RMSE: {rmse:.4f}  R2: {r2:.4f}')
# Persist the model together with the feature order and target name it was
# trained with.
model_path = '/shared/models/hgbr_model.joblib'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump({'model': model, 'features': list(X.columns), 'target': TARGET}, model_path)
print('Saved model ->', model_path)

## 7. Feature Importance
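`HistGradientBoostingRegressor` does not provide the impurity-based `feature_importances_` attribute found on other tree ensembles, so its importances have to be estimated in a model-agnostic way (for example with `sklearn.inspection.permutation_importance` on the held-out split).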

In [None]:
# Plot the 20 most important features.
#
# HistGradientBoostingRegressor has no `feature_importances_` attribute, so
# relying on it alone would always fail for this model. If the attribute is
# missing, fall back to permutation importance measured on the held-out split
# (mean drop in the model's score when a single feature column is shuffled).
from sklearn.inspection import permutation_importance

importances = getattr(model, 'feature_importances_', None)
if importances is None:
    perm = permutation_importance(
        model, X_test, y_test, n_repeats=5, random_state=42
    )
    importances = perm.importances_mean
fi = (pd.Series(importances, index=X.columns)
        .sort_values(ascending=False)
        .head(20))
plt.figure(figsize=(8,6))
sns.barplot(x=fi.values, y=fi.index, orient='h')
plt.title('Top 20 Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()
fi.to_frame('importance').head()

## 8. Quick Prediction Demo
Take a small sample of test rows and compare predicted vs actual values.

In [None]:
# Side-by-side look at the first five held-out rows: true target vs. prediction.
sample = X_test.iloc[:5].copy()
pred_sample = model.predict(sample)
actual_sample = y_test.iloc[:5].to_numpy()
compare = pd.DataFrame(dict(actual=actual_sample, predicted=pred_sample))
compare

---
### Finished
You can adjust parameters and rerun cells 3 onward to iterate quickly.