# New Outcome Variables — Colab Pipeline

Constructs markup decomposition, OP covariance, dispersion, and concentration
measures from cleaned Orbis parquets on Google Drive.

**Workflow:** Mount Drive → Clone (or update) repo → Install dependencies → Smoke test on one parquet → Full pipeline run → CSVs saved to Drive

In [None]:
# Mount Google Drive at /content/drive so the rest of the notebook can reach
# files stored there. Only works inside a Colab runtime (google.colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Clone repo (first time) or pull updates
import os
REPO_DIR = '/content/pipeline'
if os.path.exists(REPO_DIR):
    !cd {REPO_DIR} && git pull
else:
    !git clone https://github.com/zhixiwei/a1-colab-pipeline.git {REPO_DIR}

In [None]:
# Install dependencies: polars and pyarrow (-q keeps pip's output quiet).
!pip install -q polars pyarrow

In [None]:
# --- SMOKE TEST: run on one parquet file first ---
# Put the cloned repo at the front of the import path so `pipeline` resolves
# to it. REPO_DIR is set in the clone cell, which must have been run first.
import sys
sys.path.insert(0, REPO_DIR)

from pipeline import run_all
# NOTE(review): test=True / save=False presumably limit the run to a single
# parquet and skip writing outputs — confirm against pipeline.run_all.
# `results` is reused by every inspection cell below.
results = run_all(test=True, save=False)

In [None]:
# Quick summary of each outcome
# `results` is iterated as a name -> table mapping. For each table this prints
# a separator rule, its dimensions, the first five rows, and describe() output.
# The .height/.width/.head/.describe calls match the polars DataFrame API
# (presumably what run_all returns — verify against the pipeline).
for name, df in results.items():
    print(f"\n{'='*60}")
    print(f"{name}: {df.height} rows × {df.width} cols")
    print(df.head(5))
    print(df.describe())

In [None]:
# Sanity checks
import polars as pl


def _fmt_stat(value):
    """Format a summary statistic to 4 decimals, or 'n/a' when it is undefined.

    polars returns None from Series.mean()/Series.std() when a column is empty
    or all-null (and from std() when fewer than two values are available).
    Applying a ``:.4f`` format spec to None raises TypeError, which would
    abort the whole cell on a sparse smoke-test sample.
    """
    return "n/a" if value is None else f"{value:.4f}"


# 1. OP covariance should be positive on average
#    Prints mean and std of every column whose name starts with 'LD_'; the
#    sign has to be checked by eye, nothing here asserts it.
opcov = results['op_covariance']
for col in [c for c in opcov.columns if c.startswith('LD_')]:
    print(f"{col}: mean = {_fmt_stat(opcov[col].mean())}, std = {_fmt_stat(opcov[col].std())}")

# 2. Markup decomposition (from firm panel)
#    Prints the mean of every column whose name contains 'Within'.
markup = results['markup_decomp']
for col in [c for c in markup.columns if 'Within' in c]:
    print(f"{col}: mean = {_fmt_stat(markup[col].mean())}")

In [None]:
# Country coverage check: for every outcome table, list the distinct values
# of its 'fic_code' column in sorted order, together with how many there are.
for name, df in results.items():
    distinct_codes = df['fic_code'].unique()
    countries = distinct_codes.sort().to_list()
    print(f"{name}: {len(countries)} countries — {countries}")

In [None]:
# --- FULL RUN: uncomment when smoke test passes ---
# reload() re-executes the already-imported `pipeline` module, so edits pulled
# into the repo after the smoke-test import take effect without restarting
# the runtime. NOTE(review): reload does not re-import submodules — confirm
# that is sufficient for this package layout.
# from importlib import reload
# import pipeline; reload(pipeline)
# results = pipeline.run_all(test=False, save=True)