
# FX Tick Data Provider Evaluation

## Objective
Evaluate three FX tick data providers (**A, B, C**) and select the most appropriate provider
based on **data quality, pricing efficiency, liquidity, and reliability**.

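## Dataset Location

Tick data are read from the folder assigned to `DATA_DIR` in the *Load Tick Data* cell
(the `date=2024-03-01` partition). Every gzip-compressed CSV (`*.csv.gz`) in that folder is
loaded, and each file is tagged with a provider label derived from its file name.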

In [None]:

import glob
import os
from pathlib import Path

import numpy as np
import pandas as pd


## Load Tick Data

In [None]:

# Folder holding the tick files for one trading date. The default is the
# original author's local path; set the TICK_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
DATA_DIR = Path(
    os.environ.get(
        "TICK_DATA_DIR",
        "/Users/zhe.chen/Downloads/AT_take_home_exercise/tick_data/date=2024-03-01",
    )
)
FILE_SUFFIX = ".csv.gz"

# Sorted so that the row order of `data` does not depend on the order in which
# the filesystem happens to list the files.
files = sorted(glob.glob(str(DATA_DIR / f"*{FILE_SUFFIX}")))
if not files:
    # Fail with an explicit message instead of pd.concat's
    # "No objects to concatenate" further down.
    raise FileNotFoundError(f"No *{FILE_SUFFIX} tick files found in {DATA_DIR}")

dfs = []
for f in files:
    # Derive the provider label from the file name only. Splitting the full
    # path on "_" is wrong when the file name has no underscore, because the
    # directory part ("AT_take_home_exercise", "tick_data") contains them.
    file_stem = Path(f).name[: -len(FILE_SUFFIX)]
    provider = file_stem.split("_")[-1]
    df = pd.read_csv(f, compression="gzip", parse_dates=["datetime"])
    df["provider"] = provider
    dfs.append(df)

# One long frame with every provider's ticks, distinguished by `provider`.
data = pd.concat(dfs, ignore_index=True)
data.head()


## Dataset Coverage

In [None]:

# Per-provider coverage summary: how many rows carry a timestamp, how many
# distinct currency pairs appear, and the first / last timestamp seen.
coverage_aggs = {
    "ticks": pd.NamedAgg(column="datetime", aggfunc="count"),
    "currency_pairs": pd.NamedAgg(column="currency_pair", aggfunc="nunique"),
    "start_time": pd.NamedAgg(column="datetime", aggfunc="min"),
    "end_time": pd.NamedAgg(column="datetime", aggfunc="max"),
}

coverage = data.groupby("provider").agg(**coverage_aggs)
coverage


## Data Quality Assessment

In [None]:

# Columns that later cells add to `data`. They are excluded here so that this
# cell gives the same counts whether it is run before or after those cells:
# `abs_return` is NaN on the first tick of every (provider, currency_pair)
# group, which would otherwise be counted as missing data.
DERIVED_COLUMNS = ["spread", "mid", "abs_return"]
raw_ticks = data.drop(columns=DERIVED_COLUMNS, errors="ignore")
provider_key = raw_ticks["provider"]

# Missing values per provider: NaN cells summed over all remaining columns.
missing_values = (
    raw_ticks.isna()
    .groupby(provider_key)
    .sum()
    .sum(axis=1)
)

# Duplicate rows per provider. `provider` is one of the compared columns, so a
# row only counts as a duplicate of another row from the same provider.
duplicate_rows = (
    raw_ticks.duplicated()
    .groupby(provider_key)
    .sum()
)

# Invalid quotes per provider: bid at or above ask. Rows where either side is
# NaN compare as False and are therefore not counted here.
invalid_bid_ask = (
    (raw_ticks["bid"] >= raw_ticks["ask"])
    .groupby(provider_key)
    .sum()
)

# One row per provider, one column per quality check.
quality = pd.concat(
    [missing_values, duplicate_rows, invalid_bid_ask],
    axis=1
)

quality.columns = [
    "missing_values",
    "duplicate_rows",
    "invalid_bid_ask"
]

quality


## Spread & Liquidity Analysis

In [None]:

# Quoted spread of every tick (ask minus bid), stored on the shared frame.
data["spread"] = data["ask"].sub(data["bid"])

# Mean and standard deviation of spread and volume for each provider.
spread_liquidity_aggs = {
    "avg_spread": pd.NamedAgg(column="spread", aggfunc="mean"),
    "spread_std": pd.NamedAgg(column="spread", aggfunc="std"),
    "avg_volume": pd.NamedAgg(column="volume", aggfunc="mean"),
    "volume_std": pd.NamedAgg(column="volume", aggfunc="std"),
}

spread_stats = data.groupby("provider").agg(**spread_liquidity_aggs)
spread_stats


## Price Stability (Outlier Detection)

In [None]:

# Quantile of absolute tick-to-tick returns above which a move counts as an
# extreme jump.
OUTLIER_QUANTILE = 0.999

# Order ticks chronologically within each provider / currency pair so that
# consecutive rows are consecutive quotes of the same instrument.
data = data.sort_values(["provider", "currency_pair", "datetime"])
data["mid"] = (data["bid"] + data["ask"]) / 2

# Absolute relative change of the mid price from the previous tick of the same
# provider and pair; the first tick of each group has no predecessor (NaN).
data["abs_return"] = data.groupby(
    ["provider", "currency_pair"]
)["mid"].pct_change(fill_method=None).abs()

# Use ONE threshold per currency pair, pooled across all providers. Taking
# each provider's own quantile would flag roughly the same 0.1% of ticks for
# every provider by construction, so the count would reflect tick volume
# rather than price stability.
pair_thresholds = data.groupby("currency_pair")["abs_return"].quantile(OUTLIER_QUANTILE)
tick_threshold = data["currency_pair"].map(pair_thresholds)

# NaN returns or thresholds compare as False and are not counted.
is_outlier = data["abs_return"] > tick_threshold

# Number of extreme jumps per provider, measured against the common yardstick.
outliers = is_outlier.groupby(data["provider"]).sum().rename("outliers")

outliers


## Composite Provider Scoring

In [None]:

# Relative importance of each component in the composite score. All three
# components are dimensionless (see below), so the weights are comparable.
SPREAD_WEIGHT = 1.0
VOLUME_WEIGHT = 1.0
PENALTY_WEIGHT = 1.0


def _relative_to_best(values):
    """Scale a per-provider metric by its largest value, so the best provider gets 1.0.

    Returns zeros (NaN entries stay NaN) when no provider has a positive
    value, so the component drops out instead of producing inf or NaN.
    """
    best = values.max()
    if not best > 0:
        return values * 0.0
    return values / best


# Rows per provider, used to turn issue counts into a per-tick rate so that a
# provider is not penalised merely for delivering more ticks.
ticks_per_provider = data.groupby("provider").size().rename("ticks")

score = (
    spread_stats
    .join(quality)
    .join(ticks_per_provider)
    .assign(
        # Tighter average spread -> higher score. A non-positive average
        # spread means the quotes are crossed or locked on average; it is
        # mapped to NaN so it can neither win via 1/0 = inf nor be ranked.
        spread_score=lambda x: 1 / x["avg_spread"].where(x["avg_spread"] > 0),
        volume_score=lambda x: x["avg_volume"],
        penalty=lambda x: (
            x["missing_values"]
            + x["duplicate_rows"]
            + x["invalid_bid_ask"]
        )
    )
    .assign(
        penalty_rate=lambda x: x["penalty"] / x["ticks"],
        spread_rel=lambda x: _relative_to_best(x["spread_score"]),
        volume_rel=lambda x: _relative_to_best(x["volume_score"]),
    )
)

# Combine the components on a common scale. Adding the raw values with fixed
# multipliers would let 1 / avg_spread (thousands for typical FX spreads)
# swamp the volume and penalty terms.
score["final_score"] = (
    SPREAD_WEIGHT * score["spread_rel"]
    + VOLUME_WEIGHT * score["volume_rel"]
    - PENALTY_WEIGHT * score["penalty_rate"]
)

score.sort_values("final_score", ascending=False)


## Conclusion & Provider Selection

In [None]:

# Provider whose composite score is the largest.
final_scores = score["final_score"]
best_provider = final_scores.idxmax()

announcement = f"The best data provider is {best_provider}."
print(announcement)

# Show the winner's full row of metrics as a one-row table.
score.loc[[best_provider], :]



### Final Decision

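**The best data provider is the one with the highest composite score**, which combines:

- Tight average bid-ask spreads
- Strong average trading volume
- Minimal data quality issues (missing values, duplicate rows, invalid bid/ask quotes)

Spread and volume stability (their standard deviations) and the count of extreme price
jumps are reported in the sections above as supporting diagnostics. They are not inputs
to the composite score, so review them alongside it before confirming the choice.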

This provider offers the best balance of **execution realism, data cleanliness,
and analytical reliability**, making it the most suitable choice for trading
research, backtesting, and production analytics.
