In [142]:
import pandas as pd
import numpy as np
import re
import os
from fredapi import Fred

# Configuration FRED (Federal Reserve Economic Data)
FRED_API_KEY = os.getenv("FRED_API_KEY", "cb55babd6bab68262e9308fe1bf27976")
fred = Fred(api_key=FRED_API_KEY)

# Paramètres globaux
DEBUT = "1999-01-31"
FIN = "2025-01-31"
PAYS_CODES = ["US", "JP", "CH", "AU", "CN", "CA"]

# Chargement données FX
df_fx = pd.read_csv("data/devise.csv.gz")
df_fx["date"] = pd.to_datetime(df_fx["date"])
df_fx = df_fx.sort_values("date")

print(df_fx.head())


        date    exalus  excaus  exchus    exjpus  exszus
0 1999-01-31  1.582278  1.5194  8.2789  113.2900  1.3856
1 1999-02-28  1.562744  1.4977  8.2781  116.6684  1.4272
2 1999-03-31  1.585289  1.5176  8.2792  119.4730  1.4660
3 1999-04-30  1.557632  1.4881  8.2792  119.7723  1.4971
4 1999-05-31  1.508751  1.4611  8.2785  121.9995  1.5078


In [143]:
#  Téléchargement et préparation taux BIS (CORRIGÉ)
url_cbpol = "https://data.bis.org/static/bulk/WS_CBPOL_csv_col.zip"
cb = pd.read_csv(url_cbpol, compression="zip", low_memory=False)

# Nettoyage colonnes
cb.columns = cb.columns.str.strip()
date_cols = [c for c in cb.columns if re.fullmatch(r"\d{4}-\d{2}-\d{2}", c)]

if not date_cols:
    raise ValueError("Aucune colonne de type YYYY-MM-DD trouvée dans BIS")

# Extraction code pays
cb["ref_code"] = cb["REF_AREA"].astype(str).str.split(":").str[0].str.strip()

# Filtrage pays et fréquence quotidienne
cb_filt = cb[(cb["FREQ"] == "D") & (cb["ref_code"].isin(PAYS_CODES))].copy()

# Transformation wide -> long
cb_long = cb_filt.melt(
    id_vars=["ref_code"],
    value_vars=date_cols,
    var_name="date",
    value_name="rate"
)

cb_long["date"] = pd.to_datetime(cb_long["date"], errors="coerce")
cb_long = cb_long.dropna(subset=["date"])
cb_long = cb_long[cb_long["date"].between(DEBUT, FIN)]

# Pivot quotidien puis agrégation mensuelle
df_taux_daily = cb_long.pivot(index="date", columns="ref_code", values="rate").sort_index()
df_taux_m = df_taux_daily.resample("M").last()

# CORRECTION: Forward-fill pour combler les NaN (taux inchangés)
df_taux_m = df_taux_m.fillna(method='ffill')

# Supprimer les premières lignes si encore des NaN (début de série)
df_taux_m = df_taux_m.dropna()

# Renommage pour clarté
df_taux_m = df_taux_m.rename(columns={
    "US": "USA",
    "JP": "Japan",
    "CH": "Switzerland",
    "AU": "Australia",
    "CN": "China",
    "CA": "Canada"
})


print(f"\n  Aperçu:")
print(df_taux_m.head())
print(df_taux_m.tail())


  Aperçu:
ref_code    Australia  Canada  Switzerland  China  Japan   USA
date                                                          
1999-02-28       4.75    5.00          1.0   6.39   0.25  4.75
1999-03-31       4.75    4.75          1.0   6.39   0.25  4.75
1999-04-30       4.75    4.75          0.5   6.39   0.25  4.75
1999-05-31       4.75    4.50          0.5   6.39   0.25  4.75
1999-06-30       4.75    4.50          0.5   5.85   0.25  5.00
ref_code    Australia  Canada  Switzerland  China  Japan    USA
date                                                           
2024-09-30       4.35    4.25          1.0   3.35   0.25  4.875
2024-10-31       4.35    3.75          1.0   3.10   0.25  4.875
2024-11-30       4.35    3.75          1.0   3.10   0.25  4.625
2024-12-31       4.35    3.25          0.5   3.10   0.25  4.375
2025-01-31       4.35    3.00          0.5   3.10   0.50  4.375


  df_taux_m = df_taux_daily.resample("M").last()
  df_taux_m = df_taux_m.fillna(method='ffill')


In [144]:

#  Téléchargement inflation FRED 
fred_codes = {
    "USA": "CCRETT01USM661N",
    "Switzerland": "CCRETT01CHM661N",
    "Japan": "CCRETT01JPM661N",
    "Canada": "CCRETT01CAM661N",
    "Australia": "CCRETT01AUM661N",
    "China": "CCRETT01CNM661N"
}

df_infl = pd.DataFrame()
ok_codes = []
bad_codes = []

for country, code in fred_codes.items():
    try:
        series = fred.get_series(code)
        series.name = country
        df_infl[country] = series
        ok_codes.append((country, code))
    except Exception as e:
        print(f"⚠ ÉCHEC pour {country} avec code {code}: {e}")
        bad_codes.append((country, code))

print(f"✓ Inflation téléchargée: {len(ok_codes)} séries")

# Nettoyage index et tri
df_infl.index = pd.to_datetime(df_infl.index)
df_infl = df_infl.sort_index()

# Filtrer la période AVANT le calcul YoY
df_infl = df_infl[df_infl.index >= "1998-01-31"]  # 12 mois avant DEBUT pour le YoY

# Calcul inflation YoY (%) - sans warning
df_infl_yoy = df_infl.pct_change(12, fill_method=None) * 100

# Mensuel fin de mois 
df_infl_m = df_infl_yoy.resample("ME").last()

# Ne garder que la période qui nous intéresse
df_infl_m = df_infl_m[df_infl_m.index >= DEBUT]

# Supprimer les NaN (les 12 premiers mois n'ont pas de YoY)
df_infl_m = df_infl_m.dropna()

# Renommer colonnes
df_infl_m = df_infl_m.add_suffix("_infl")

print(df_infl_m.head())
print(df_infl_m.tail())

✓ Inflation téléchargée: 6 séries
            USA_infl  Switzerland_infl  Japan_infl  Canada_infl  \
1999-02-28 -0.314216          0.893717    3.650634    -5.537803   
1999-03-31  1.433010          2.394380    4.972353    -7.321855   
1999-04-30  1.223965          2.810031    9.079594    -4.198071   
1999-05-31  0.205402          1.974614    7.455268    -1.522554   
1999-06-30 -1.579202          1.721053   10.819319    -0.695316   

            Australia_infl  China_infl  
1999-02-28       -8.201451   -4.786128  
1999-03-31       -7.248901   -3.284290  
1999-04-30       -2.798971   -3.105701  
1999-05-31        2.899141   -3.830743  
1999-06-30        4.321533   -5.756429  
            USA_infl  Switzerland_infl  Japan_infl  Canada_infl  \
2025-06-30 -1.647874          1.715577    7.310798    -1.535228   
2025-07-31 -1.702050          3.627858    5.268938    -1.835025   
2025-08-31 -0.505615          0.031497   -1.142370    -2.362394   
2025-09-30 -0.454767          0.289893   -2.77302

In [145]:
#Téléchargement VIX
vix = fred.get_series("VIXCLS")
vix.index = pd.to_datetime(vix.index)
vix.name = "VIX"

# Mensuel fin de mois
vix_m = vix.resample("ME").last()

In [146]:
# Fusion données brutes (FX + Taux + Inflation + VIX)
df_full = (df_fx
    .merge(df_taux_m, left_on="date", right_index=True, how="inner")
    .merge(df_infl_m, left_on="date", right_index=True, how="inner")
    .merge(vix_m, left_on="date", right_index=True, how="left"))

# Tri par date
df_full = df_full.sort_values("date").reset_index(drop=True)


In [147]:

# Calcul log-returns
fx_cols = ["exalus", "excaus", "exchus", "exjpus", "exszus"]

for col in fx_cols:
    df_full[f"logret_{col}"] = np.log(df_full[col]) - np.log(df_full[col].shift(1))



In [148]:
# Calcul carry (différentiel de taux)
rate_cols = ["Japan", "Switzerland", "Australia", "China", "Canada"]

for country in rate_cols:
    df_full[f"carry_{country}_USD"] = df_full[country] - df_full["USA"]



In [149]:
# Calcul momentum 3M et 12M
for col in fx_cols:
    # Momentum 3 mois: log(P_t) - log(P_{t-3})
    df_full[f"mom3_{col}"] = np.log(df_full[col]) - np.log(df_full[col].shift(3))
    
    # Momentum 12 mois: log(P_t) - log(P_{t-12})
    df_full[f"mom12_{col}"] = np.log(df_full[col]) - np.log(df_full[col].shift(12))



In [150]:
# Conversion complète en rendements/différences

# 1. FX : déjà fait avec logret_

# 2. Taux d'intérêt : différence première (changement de taux)
rate_cols = ["USA", "Japan", "Switzerland", "Australia", "China", "Canada"]
for col in rate_cols:
    df_full[f"delta_{col}"] = df_full[col] - df_full[col].shift(1)

# 3. Inflation : déjà en variation YoY, mais on peut prendre la différence
infl_cols = [c for c in df_full.columns if c.endswith("_infl")]
for col in infl_cols:
    df_full[f"delta_{col}"] = df_full[col] - df_full[col].shift(1)

# 4. VIX : rendement ou différence
df_full["logret_VIX"] = np.log(df_full["VIX"]) - np.log(df_full["VIX"].shift(1))
# OU différence première si tu préfères :
# df_full["delta_VIX"] = df_full["VIX"] - df_full["VIX"].shift(1)

# 5. Carry : déjà une différence (taux étranger - taux US) ✓
# Mais on peut prendre la variation du carry :
carry_cols = [c for c in df_full.columns if c.startswith("carry_")]
for col in carry_cols:
    df_full[f"delta_{col}"] = df_full[col] - df_full[col].shift(1)

# 6. Momentum : déjà des rendements cumulés 


In [160]:
# %% BLOC 10: Standardisation complète

# Supprimer les colonnes _std si elles existent déjà
cols_to_clean = [c for c in df_full.columns if c.endswith('_std')]
if cols_to_clean:
    df_full = df_full.drop(columns=cols_to_clean)
   
# Colonnes à standardiser (UNIQUEMENT rendements/différences)
cols_to_std = (
    [c for c in df_full.columns if c.startswith("logret_")] +  # Log-returns FX
    [c for c in df_full.columns if c.startswith("delta_")] +   # Différences (taux, inflation, VIX, carry)
    [c for c in df_full.columns if c.startswith("mom")] +      # Momentum (déjà des rendements)
    [c for c in df_full.columns if c.startswith("carry_")]     # Carry (différentiel)
)


for col in cols_to_std:
    if col in df_full.columns:
        mean = df_full[col].mean()
        std = df_full[col].std()
        if std > 0:
            df_full[f"{col}_std"] = (df_full[col] - mean) / std

std_cols = [c for c in df_full.columns if c.endswith('_std')]
print(f"✓ Standardisation appliquée à {len(std_cols)} variables en rendement")

# Vérification : afficher seulement les variables standardisées
print("\nVariables standardisées (toutes en rendement/différence) :")
for col in std_cols:
    print(f"  - {col}")

✓ Standardisation appliquée à 38 variables en rendement

Variables standardisées (toutes en rendement/différence) :
  - logret_exalus_std
  - logret_excaus_std
  - logret_exchus_std
  - logret_exjpus_std
  - logret_exszus_std
  - logret_VIX_std
  - delta_USA_std
  - delta_Japan_std
  - delta_Switzerland_std
  - delta_Australia_std
  - delta_China_std
  - delta_Canada_std
  - delta_USA_infl_std
  - delta_Switzerland_infl_std
  - delta_Japan_infl_std
  - delta_Canada_infl_std
  - delta_Australia_infl_std
  - delta_China_infl_std
  - delta_carry_Japan_USD_std
  - delta_carry_Switzerland_USD_std
  - delta_carry_Australia_USD_std
  - delta_carry_China_USD_std
  - delta_carry_Canada_USD_std
  - mom3_exalus_std
  - mom12_exalus_std
  - mom3_excaus_std
  - mom12_excaus_std
  - mom3_exchus_std
  - mom12_exchus_std
  - mom3_exjpus_std
  - mom12_exjpus_std
  - mom3_exszus_std
  - mom12_exszus_std
  - carry_Japan_USD_std
  - carry_Switzerland_USD_std
  - carry_Australia_USD_std
  - carry_China_USD

In [152]:
# Nettoyage final et validation
# Supprimer les lignes avec NaN (dues aux shifts pour momentum et log-returns)
df_full = df_full.dropna()

print(f"\n{'='*60}")
print(f"DATASET FINAL")
print(f"{'='*60}")
print(f"Observations: {len(df_full)}")
print(f"Période: {df_full['date'].min().date()} → {df_full['date'].max().date()}")
print(f"Variables totales: {len(df_full.columns)}")
print(f"Variables standardisées: {len(std_cols)}")

print(f"\nColonnes par catégorie:")
print(f"  FX brutes: {fx_cols}")
print(f"  Taux: {['USA', 'Japan', 'Switzerland', 'Australia', 'China', 'Canada']}")
print(f"  Inflation: {[c for c in df_full.columns if c.endswith('_infl')]}")
print(f"  Log-returns: {[c for c in df_full.columns if c.startswith('logret_')]}")
print(f"  Carry: {[c for c in df_full.columns if c.startswith('carry_')]}")
print(f"  Momentum: {[c for c in df_full.columns if c.startswith('mom')]}")
print(f"  VIX: ['VIX']")

print(f"\nPremières lignes (variables standardisées):")
display_cols = ['date'] + std_cols[:8]
print(df_full[display_cols].head())

print(f"\nDernières lignes (variables standardisées):")
print(df_full[display_cols].tail())

print(f"\nInfo dataset:")
print(df_full.info())




DATASET FINAL
Observations: 300
Période: 2000-02-29 → 2025-01-31
Variables totales: 95
Variables standardisées: 38

Colonnes par catégorie:
  FX brutes: ['exalus', 'excaus', 'exchus', 'exjpus', 'exszus']
  Taux: ['USA', 'Japan', 'Switzerland', 'Australia', 'China', 'Canada']
  Inflation: ['USA_infl', 'Switzerland_infl', 'Japan_infl', 'Canada_infl', 'Australia_infl', 'China_infl', 'delta_USA_infl', 'delta_Switzerland_infl', 'delta_Japan_infl', 'delta_Canada_infl', 'delta_Australia_infl', 'delta_China_infl']
  Log-returns: ['logret_exalus', 'logret_excaus', 'logret_exchus', 'logret_exjpus', 'logret_exszus', 'logret_VIX', 'logret_exalus_std', 'logret_excaus_std', 'logret_exchus_std', 'logret_exjpus_std', 'logret_exszus_std', 'logret_VIX_std']
  Carry: ['carry_Japan_USD', 'carry_Switzerland_USD', 'carry_Australia_USD', 'carry_China_USD', 'carry_Canada_USD', 'carry_Japan_USD_std', 'carry_Switzerland_USD_std', 'carry_Australia_USD_std', 'carry_China_USD_std', 'carry_Canada_USD_std']
  Momen