# Redis Click Stream → XGBoost Demo

Steps:
1. Connect to local Redis (service name `redis`).
2. Generate synthetic click records (logic adapted from `fastapi/generator.py`).
3. Store each record in Redis as a JSON document (`click:<index>`).
4. Read records back from Redis and build a DataFrame.
5. Train an XGBoost classifier to predict `y_fraud`.
6. Display evaluation metrics and feature importances.

**Note:** The original `generator.py` isn't mounted inside the Jupyter container (not in docker-compose volumes). We inline a trimmed, parameterized version of its logic.

In [18]:
# Install needed packages (idempotent).
import sys, subprocess, importlib
def ensure(pkg):
    try: importlib.import_module(pkg)
    except ImportError: subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg])
for p in ['redis', 'xgboost', 'pandas', 'numpy', 'scikit-learn']:
    ensure(p)
print('Packages ready.')

Packages ready.


In [19]:
# Fix missing matplotlib
%pip install matplotlib

# Imports & configuration
import os, math, random, string, ipaddress, uuid, numpy as np, pandas as pd, time, json
from datetime import datetime, timedelta, timezone
from redis import Redis
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb
import matplotlib.pyplot as plt
random.seed(7); np.random.seed(7)
REDIS_HOST = os.getenv('REDIS_HOST','redis')
REDIS_PORT = int(os.getenv('REDIS_PORT','6379'))

Note: you may need to restart the kernel to use updated packages.


In [20]:
import os, socket
from redis import Redis

CANDIDATE_HOSTS = [
    os.getenv("REDIS_HOST", "redis"),
    "localhost",
    "127.0.0.1"
]

redis_host = None
for h in CANDIDATE_HOSTS:
    try:
        socket.gethostbyname(h)
        r = Redis(host=h, port=6379, decode_responses=True, socket_connect_timeout=2)
        if r.ping():
            redis_host = h
            print(f"Connected to Redis via host '{h}'")
            break
    except Exception:
        continue

if not redis_host:
    raise SystemExit("Could not connect to Redis using any host in: " + ", ".join(CANDIDATE_HOSTS))

Connected to Redis via host 'localhost'


## Data Generation
Trimmed synthetic clickstream similar to the original script. Parameters below let you reduce size for quick iteration.

In [None]:
# Parameter knobs (high-volume defaults to reach ~100k rows). Adjust as needed.
DAYS = 20                 # base days to simulate first (can be extended automatically)
HUMANS_PER_DAY = 2000     # expected mean humans per day
BOTS_PER_DAY = 3000       # expected mean bots per day (total across all farms)
N_FARMS = 3
VALUE_PER_CONV = 300.0
TARGET_ROWS = 100_000     # If not None, keep simulating extra days until >= this many rows
MAX_EXTRA_DAYS = 30       # Safety cap to avoid runaway generation

# Helper utils adapted from generator.py
def wchoice(pairs):
    items, weights = zip(*pairs)
    return random.choices(items, weights=weights, k=1)[0]

QUERY_CATEGORIES = [("navigational", 0.28), ("informational", 0.44), ("transactional", 0.23), ("local", 0.05)]
DEVICES = [("desktop", 0.46), ("mobile", 0.47), ("tablet", 0.07)]
COUNTRIES = [("US",0.46),("GB",0.10),("CA",0.09),("AU",0.06),("IN",0.12),("DE",0.06),("FR",0.05),("BR",0.04),("JP",0.02)]
LANGS = [("en",0.55),("en-US",0.20),("en-GB",0.10),("de",0.04),("fr",0.04),("pt",0.03),("hi",0.02),("ja",0.01),("es",0.01)]
BROWSERS = [("Chrome",0.62),("Safari",0.18),("Edge",0.10),("Firefox",0.08),("Other",0.02)]
OSES = [("Windows",0.35),("Android",0.32),("iOS",0.20),("macOS",0.10),("Linux",0.03)]
SERP_FEATURES = ["none","sitelinks","faq","video","image","news","localpack","shopping"]
CAMPAIGNS = ["Brand","Generic","Competitor","Retargeting","DisplayProspecting"]
ADGROUPS = {
    "Brand":["Brand Core","Brand Location"],
    "Generic":["Apartments City","Amenities","Near Me"],
    "Competitor":["Comp A","Comp B"],
    "Retargeting":["Visitors 7d","Abandoned Tours"],
    "DisplayProspecting":["In-Market","Affinity Home"]
}

def random_query(category):
    topics = {
        "navigational": ["facebook login","youtube","gmail","amazon","wikipedia","bank","official site"],
        "informational": ["how to","what is","best way to","benefits of","guide to","tutorial"],
        "transactional": ["buy","price","discount","near me","coupon","best"],
        "local": ["near me","closest","open now","hours","directions","map"]
    }
    nouns = ["apartments","redis","python","laptop","headphones","router","ssd","credit card","insurance","pizza","coffee","gym"]
    t = random.choice(topics[category]); n = random.choice(nouns)
    if category in ("navigational","local"): return f"{n} {t}"
    return f"{t} {n}"

def positional_ctr(position):
    base = 0.38 * (1 / math.log(position + 1.8))
    noise = random.uniform(-0.03, 0.03)
    return max(0.0, min(0.8, base + noise))

def build_queries(n=600):  # more candidate queries for bigger dataset
    cats = [wchoice(QUERY_CATEGORIES) for _ in range(n)]
    q = [random_query(c) for c in cats]
    q = list(dict.fromkeys(q))
    idx = np.arange(1, len(q)+1)
    zipf_w = (1 / (idx ** 1.1)); zipf_w /= zipf_w.sum()
    return q, zipf_w

def pick_campaign():
    c = random.choice(CAMPAIGNS)
    g = random.choice(ADGROUPS[c])
    ad_id = random.randint(100000, 999999)
    creative = random.choice(["RSA","Text","Display","Video"])
    return c, g, ad_id, creative

def sample_bot_farms(n=2):
    farms = []
    for i in range(n):
        farms.append({
            'name': f'farm{i+1}',
            'blocks': [f'52.{23+i}.{x}.0/24' for x in (10,20,30)],
            'active_hours': sorted(random.sample(range(24), k=10)),
            'ctr_multiplier': 0.6,
            'cvr_multiplier': 0.05,
            'bounce_bias': 0.4,
            'headless_rate': 0.5
        })
    return farms

def ipv4_from_block(block_cidr):
    net = ipaddress.IPv4Network(block_cidr)
    host = int(net.network_address) + random.randint(1, net.num_addresses-2)
    return str(ipaddress.IPv4Address(host))

def sample_bot_ip(farm):
    block = random.choice(farm['blocks'])
    ip = ipv4_from_block(block)
    return ip, block

def _simulate_one_day(day, queries, zipf_w, farms, BRAND_TERMS, ranks, rank_w, rows, humans_per_day, bots_per_day):
    # Humans
    n_h = np.random.poisson(humans_per_day)
    for _ in range(n_h):
        hour = random.randint(0,23)
        dev = wchoice(DEVICES); country = wchoice(COUNTRIES); lang = wchoice(LANGS)
        browser = wchoice(BROWSERS); os_name = wchoice(OSES)
        q = random.choices(queries, weights=zipf_w, k=1)[0]
        cat = ('transactional' if any(x in q for x in ['buy','price','discount','coupon']) else
               'local' if any(x in q for x in ['near me','open now','hours','directions']) else
               'navigational' if any(x in q for x in ['login','official','site']) else 'informational')
        pos = int(np.random.choice(ranks, p=rank_w))
        impr = max(1, int(np.random.lognormal(mean=1.3, sigma=0.5)))
        base_ctr = positional_ctr(pos)
        if dev=='mobile': base_ctr *= 0.95
        is_brand = (q in BRAND_TERMS) or ('official' in q) or ('login' in q)
        if is_brand: base_ctr = min(0.85, base_ctr*1.3)
        clicks = np.random.binomial(impr, max(0, min(0.95, base_ctr)))
        base_cvr = {'navigational':0.11,'informational':0.035,'transactional':0.08,'local':0.05}[cat]
        conv = int(np.random.binomial(clicks, base_cvr)) if clicks>0 else 0
        campaign, adgroup, ad_id, creative = pick_campaign()
        cpc = max(0.1, np.random.normal(0.9, 0.3))
        cost = round(cpc*clicks,2); revenue = round(conv*VALUE_PER_CONV*np.random.uniform(0.9,1.1),2)
        dwell = 0 if clicks==0 else int(max(3, np.random.normal(60,20)))
        bounce = (dwell < 30 and random.random()<0.4)
        ts = datetime(day.year, day.month, day.day, hour, random.randint(0,59), random.randint(0,59), tzinfo=timezone.utc)
        rows.append({
            'event_ts': int(ts.timestamp()*1000),
            'event_date': str(day),
            'hour_of_day': hour,
            'source_type': 'human',
            'farm': None,
            'search_query': q,
            'query_intent_category': cat,
            'is_brand_query': bool(is_brand),
            'rank_position': pos,
            'serp_impressions': impr,
            'serp_clicks': clicks,
            'click_through_rate': round(clicks/impr,4),
            'device_type': dev,
            'os': os_name,
            'browser': browser,
            'user_country': country,
            'user_language': lang,
            'campaign': campaign,
            'ad_group': adgroup,
            'ad_id': ad_id,
            'creative_type': creative,
            'dwell_time_seconds': dwell,
            'bounced_session': bool(bounce),
            'conversions': conv,
            'cpc': float(round(cpc,2)),
            'cost': float(cost),
            'revenue': float(revenue),
            'ip': None,
            'ip_subnet24': None,
            'isp_class': 'residential',
            'headless': False,
            'y_fraud': 0,
            'y_conv': 1 if conv>0 else 0
        })
    # Bots
    for farm in farms:
        n_b = np.random.poisson(bots_per_day / max(1,len(farms)))
        for _ in range(n_b):
            hour = random.choice(farm['active_hours'])
            dev = wchoice(DEVICES); country = wchoice(COUNTRIES); lang = wchoice(LANGS)
            q = random.choices(queries, weights=zipf_w, k=1)[0]
            cat = ('transactional' if any(x in q for x in ['buy','price','discount','coupon']) else
                   'local' if any(x in q for x in ['near me','open now','hours','directions']) else
                   'navigational' if any(x in q for x in ['login','official','site']) else 'informational')
            pos = int(np.random.choice(ranks, p=rank_w))
            impr = max(1, int(np.random.lognormal(mean=1.5, sigma=0.6)))
            base_ctr = positional_ctr(pos) * 0.5
            clicks = np.random.binomial(impr, max(0,min(0.9, base_ctr)))
            conv = 0  # negligible
            campaign, adgroup, ad_id, creative = pick_campaign()
            cpc = max(0.05, np.random.normal(0.4,0.15))
            cost = round(cpc*clicks,2); revenue = 0.0
            ip, subnet = sample_bot_ip(farm)
            dwell = 0 if clicks==0 else int(max(1, np.random.normal(8,4)))
            bounce = True if dwell < 5 else False
            ts = datetime(day.year, day.month, day.day, hour, random.randint(0,59), random.randint(0,59), tzinfo=timezone.utc)
            rows.append({
                'event_ts': int(ts.timestamp()*1000),
                'event_date': str(day),
                'hour_of_day': hour,
                'source_type': 'bot',
                'farm': farm['name'],
                'search_query': q,
                'query_intent_category': cat,
                'is_brand_query': ('official' in q) or ('login' in q),
                'rank_position': pos,
                'serp_impressions': impr,
                'serp_clicks': clicks,
                'click_through_rate': round(clicks/impr,4),
                'device_type': dev,
                'os': wchoice(OSES)[0] if isinstance(wchoice(OSES), tuple) else dev,
                'browser': 'Chrome',
                'user_country': country,
                'user_language': lang,
                'campaign': campaign,
                'ad_group': adgroup,
                'ad_id': ad_id,
                'creative_type': creative,
                'dwell_time_seconds': dwell,
                'bounced_session': bool(bounce),
                'conversions': conv,
                'cpc': float(round(cpc,2)),
                'cost': float(cost),
                'revenue': float(revenue),
                'ip': ip,
                'ip_subnet24': subnet,
                'isp_class': 'datacenter',
                'headless': True,
                'y_fraud': 1,
                'y_conv': 0
            })

def generate_dataset(days=DAYS, humans_per_day=HUMANS_PER_DAY, bots_per_day=BOTS_PER_DAY, n_farms=N_FARMS, target_rows: int | None = TARGET_ROWS):
    queries, zipf_w = build_queries()
    BRAND_TERMS = set(q for q in queries if 'official' in q or 'login' in q or 'gmail' in q)
    ranks = np.arange(1, 31)
    rank_w = np.linspace(0.3,0.7,len(ranks))[::-1]; rank_w /= rank_w.sum()
    farms = sample_bot_farms(n_farms)
    rows = []
    start_day = (datetime.utcnow() - timedelta(days=days-1)).date()
    # Initial fixed window
    for d_off in range(days):
        day = start_day + timedelta(days=d_off)
        _simulate_one_day(day, queries, zipf_w, farms, BRAND_TERMS, ranks, rank_w, rows, humans_per_day, bots_per_day)
    extra_days = 0
    while target_rows and len(rows) < target_rows and extra_days < MAX_EXTRA_DAYS:
        day = start_day + timedelta(days=days + extra_days)
        _simulate_one_day(day, queries, zipf_w, farms, BRAND_TERMS, ranks, rank_w, rows, humans_per_day, bots_per_day)
        extra_days += 1
        if extra_days % 2 == 0:
            print(f"Extended generation: {extra_days} extra days, rows so far {len(rows):,}")
    if target_rows and len(rows) < target_rows:
        print(f"WARNING: Reached MAX_EXTRA_DAYS ({MAX_EXTRA_DAYS}) with only {len(rows):,} rows < target {target_rows:,}")
    df = pd.DataFrame(rows).sort_values('event_ts').reset_index(drop=True)
    df['ctr'] = df['click_through_rate']
    df['cvr'] = (df['conversions'] / df['serp_clicks'].replace(0, np.nan)).fillna(0.0)
    df['roas'] = (df['revenue'] / df['cost'].replace(0, np.nan)).replace([np.inf,-np.inf],0).fillna(0.0)
    print(f"Generated rows: {len(df):,} across {df['event_date'].nunique()} days (base {days} + extra {extra_days})")
    return df

df = generate_dataset()
print('Final dataset size:', len(df))
df.head()

  start_day = (datetime.utcnow() - timedelta(days=days-1)).date()


Generated rows: 5013


Unnamed: 0,event_ts,event_date,hour_of_day,source_type,farm,search_query,query_intent_category,is_brand_query,rank_position,serp_impressions,...,revenue,ip,ip_subnet24,isp_class,headless,y_fraud,y_conv,ctr,cvr,roas
0,1755475267000,2025-08-18,0,human,,discount gym,transactional,False,22,4,...,0.0,,,residential,False,0,0,0.25,0.0,0.0
1,1755475440000,2025-08-18,0,bot,farm1,insurance amazon,informational,False,18,2,...,0.0,52.23.10.61,52.23.10.0/24,datacenter,True,1,0,0.0,0.0,0.0
2,1755475476000,2025-08-18,0,bot,farm1,gym wikipedia,informational,False,2,5,...,0.0,52.23.30.96,52.23.30.0/24,datacenter,True,1,0,0.4,0.0,0.0
3,1755475543000,2025-08-18,0,bot,farm1,tutorial coffee,informational,False,13,2,...,0.0,52.23.30.80,52.23.30.0/24,datacenter,True,1,0,0.0,0.0,0.0
4,1755475550000,2025-08-18,0,human,,credit card amazon,informational,False,26,9,...,0.0,,,residential,False,0,0,0.1111,0.0,0.0


In [24]:
# DEBUG: Diagnose unexpectedly low row count
print('--- Generation Debug ---')
print('Configured DAYS:', DAYS, 'HUMANS_PER_DAY:', HUMANS_PER_DAY, 'BOTS_PER_DAY:', BOTS_PER_DAY, 'N_FARMS:', N_FARMS)
print('Actual df rows:', len(df))
if len(df) < 50:
    # Show per-day counts
    if 'event_date' in df.columns:
        print('Per-day row counts:')
        print(df.groupby('event_date').size())
    # Show first few unique queries and source types
    print('Source type counts:')
    print(df['source_type'].value_counts())
    # Inspect sample of raw rows
    print('Sample rows dicts:')
    for i, rec in df.head(5).iterrows():
        print(rec.to_dict())
    # Re-run a manual Poisson draw to confirm RNG
    import numpy as _np
    print('Sample Poisson humans draw (lambda=HUMANS_PER_DAY):', _np.random.poisson(HUMANS_PER_DAY))
else:
    print('Row count looks reasonable; no anomaly detected.')
print('------------------------')

--- Generation Debug ---
Configured DAYS: 5 HUMANS_PER_DAY: 400 BOTS_PER_DAY: 600 N_FARMS: 2
Actual df rows: 5013
Row count looks reasonable; no anomaly detected.
------------------------


## Write Records to Redis as JSON
Each row → key `click:<row_index>` (JSON).

In [None]:
# Always clear previous click:* keys to start fresh (fast SCAN + UNLINK)
import time as _t
prefix = 'click:'
start_clear = _t.time()
pipe_del = r.pipeline(transaction=False)
count_del = 0
for k in r.scan_iter(f"{prefix}*", count=1000):
    pipe_del.unlink(k)  # non-blocking delete
    count_del += 1
    if count_del % 10000 == 0:
        pipe_del.execute()
        print(f"Queued deletions: {count_del:,}...")
# flush remaining deletions
pipe_del.execute()
print(f"Cleared {count_del:,} existing keys in {(_t.time()-start_clear):.2f}s")

In [None]:
# Verify DB is empty for click:* before insertion
remaining = sum(1 for _ in r.scan_iter('click:*'))
print('Remaining click:* keys after clear:', remaining)
assert remaining == 0, 'Expected zero click:* keys before insertion.'

In [None]:
import json, time
BATCH_SIZE = 5000  # larger batch for speed
prefix = "click:"

start = time.time()
pipe = r.pipeline(transaction=False)
queued = 0
inserted = 0
TOTAL = len(df)
log_every = 20000
for i, row in df.iterrows():
    pipe.set(f"{prefix}{i}", json.dumps(row.to_dict(), separators=(',',':')))
    queued += 1
    if queued % BATCH_SIZE == 0:
        res = pipe.execute(); inserted += len(res)
        # Lightweight progress
        if inserted % log_every == 0 or inserted == TOTAL:
            pct = (inserted / TOTAL)*100
            print(f"Inserted {inserted:,}/{TOTAL:,} ({pct:5.1f}%)")
# flush remainder
if queued % BATCH_SIZE:
    res = pipe.execute(); inserted += len(res)

elapsed = time.time() - start
print(f"Inserted {inserted:,} keys in {elapsed:.2f}s ({inserted/elapsed:.1f} keys/sec)")

# Safe example fetch
example = r.get(f"{prefix}0")
if example:
    print("Example key value snippet:", example[:140], "...")
else:
    print("Key click:0 not found; first available key:",
          next(r.scan_iter(f"{prefix}*"), "NONE"))

Flushed 1,000 keys...
Flushed 2,000 keys...
Flushed 3,000 keys...
Flushed 4,000 keys...
Inserted 4,965 keys in 0.43s (11541.9 keys/sec)
Key click:0 not found; first available key: click:4573
Flushed 4,000 keys...
Inserted 4,965 keys in 0.43s (11541.9 keys/sec)
Key click:0 not found; first available key: click:4573


In [None]:
# Diagnostics: reconcile DataFrame rows vs Redis keys
import socket
expected = len(df)
print('Expected rows (len(df)):', expected)
print('Inserted counter variable:', inserted)

# Count keys via pattern scan (efficient incremental)
click_key_count = 0
for _ in r.scan_iter(f"{prefix}*", count=1000):
    click_key_count += 1
print('Discovered click:* keys via scan_iter:', click_key_count)

# Total keys in DB
print('DBSIZE (all keys):', r.dbsize())

# Sample a few missing indices if discrepancy
if click_key_count < expected:
    missing = []
    # check first 50 indices only for speed
    for i in range(min(50, expected)):
        if not r.exists(f"{prefix}{i}"):
            missing.append(i)
    print('Example missing early indices (first 50 checked):', missing[:10])

# Show server identity to ensure we connected where we inserted
info = r.info()
print('Connected server:', info.get('redis_version'), 'mode', info.get('redis_mode'), 'port', info.get('tcp_port'))
print('Server role:', info.get('role'), 'process id:', info.get('process_id'))
print('Instance run id:', info.get('run_id'))
print('Current client name (if any):', r.client_info().get('name'))

# Heuristic: warn if inserted << expected
if inserted and expected and inserted < expected:
    print('WARNING: pipeline reported fewer replies than DataFrame rows. Some commands may not have executed or were overwritten.')

# Simple integrity check on a random sample of keys
import random, json as _json
if click_key_count:
    sample_indices = random.sample(range(min(expected, click_key_count)), k=min(3, click_key_count))
    for idx in sample_indices:
        raw = r.get(f"{prefix}{idx}")
        print(f'Sample key click:{idx} present:', bool(raw), 'size:', len(raw) if raw else 0)
        if raw:
            try:
                rec = _json.loads(raw)
                print('  Fields:', list(rec.keys())[:8], '...')
            except Exception as e:
                print('  JSON decode error:', e)
else:
    print('No click:* keys found in Redis.')

## Read Back from Redis
Scan keys with prefix `click:` and rebuild the DataFrame.

In [17]:
def load_clicks(prefix='click:'):
    cursor = 0; rows=[]
    while True:
        cursor, keys = r.scan(cursor=cursor, match=f'{prefix}*', count=500)
        for k in keys:
            raw = r.get(k)
            if raw:
                try: rows.append(json.loads(raw))
                except json.JSONDecodeError: pass
        if cursor == 0: break
    return pd.DataFrame(rows)

df_loaded = load_clicks()
print('Loaded rows:', len(df_loaded))
df_loaded.head()

Loaded rows: 0


## Feature Engineering & Train/Test Split
We'll predict `y_fraud` using a subset of features and simple one-hot encoding.

In [None]:
target = 'y_fraud'
# Derive label if missing (e.g. if not persisted / older write missing column)
if target not in df_loaded.columns:
    print(f"Target column '{target}' missing; deriving from source_type=='bot'.")
    if 'source_type' not in df_loaded.columns:
        raise ValueError("Cannot derive y_fraud because 'source_type' column is absent and original target missing.")
    df_loaded[target] = (df_loaded['source_type'] == 'bot').astype(int)

if df_loaded.empty:
    raise ValueError("Loaded DataFrame is empty. Ensure Redis keys were inserted and load_clicks() worked.")

# Clean / reset index to avoid surprises
use_df = df_loaded.reset_index(drop=True).copy()

# Drop high-cardinality or unused columns (ignore if absent)
drop_cols = ['search_query','event_date','ip','ip_subnet24']
use_df = use_df.drop(columns=[c for c in drop_cols if c in use_df.columns])

# Ensure target exists now
if target not in use_df.columns:
    raise KeyError(f"Target column '{target}' still not present after preparation.")

categoricals = ['source_type','query_intent_category','device_type','user_country','campaign']
for c in categoricals:
    if c in use_df.columns:
        use_df[c] = use_df[c].astype('category')

# Separate X / y
y = use_df[target].astype(int)
X = use_df.drop(columns=[target])

# One-hot encode categoricals (only those present)
X = pd.get_dummies(X, drop_first=True)

# Basic sanity checks
print('Feature matrix shape before split:', X.shape)
print('Class balance:', y.value_counts(normalize=True).to_dict())

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y if y.nunique()>1 else None, random_state=42
)
X.shape, X_train.shape

KeyError: "['y_fraud'] not found in axis"

## Train XGBoost Classifier

In [None]:
clf = xgb.XGBClassifier(
    n_estimators=160,
    learning_rate=0.12,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    random_state=42
)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print(classification_report(y_test, pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, pred))

## Feature Importance

In [None]:
importances = clf.feature_importances_
imp = pd.DataFrame({'feature': X.columns, 'importance': importances}).sort_values('importance', ascending=False).head(25)
plt.figure(figsize=(8,6))
plt.barh(imp['feature'][::-1], imp['importance'][::-1])
plt.title('Top 25 Feature Importances')
plt.tight_layout()
plt.show()
imp.head()

## Summary
- Generated synthetic human + bot clickstream data.
- Stored each record in Redis as JSON (string values).
- Reloaded records and trained an XGBoost fraud classifier.
- Displayed metrics and feature importances.

You can tweak DAYS / HUMANS_PER_DAY / BOTS_PER_DAY and re-run.