In [2]:
import yfinance as yf
import pandas as pd
import time

# Define the categories
indicators = ['ICLN', 'PBW', 'TAN', 'XLE', 'VDE', 'SPY', 'QQQ', 'VIX', 'EEM', 'LIT', 'URA', 'HYG', 'EURUSD=X', 'GBPUSD=X']
commodities = ['CL=F', 'NG=F', 'DXY', 'GC=F', 'SI=F']  # Crude oil, natural gas, USD index, gold, silver
macro_indicators = ['^IRX', '^TNX', '^TYX', 'MSCI', '^FTSE', '^N225', 'BTC-USD', 'ETH-USD']  # Treasury yields, MSCI, global indices, crypto

# Combine all symbols
all_assets = indicators + commodities + macro_indicators

# Function to fetch data with retries
def fetch_data(assets, start, end, retries=3):
    valid_data = []
    invalid_assets = {}

    for asset in assets:
        for attempt in range(retries):
            try:
                print(f"Fetching data for {asset} (Attempt {attempt + 1})...")
                data = yf.download(asset, start=start, end=end, progress=False)
                if not data.empty:
                    print(f"Data for {asset} fetched successfully.")
                    data['Asset'] = asset
                    valid_data.append(data)
                    break
                else:
                    print(f"Data for {asset} is empty. Retrying...")
            except Exception as e:
                print(f"Error fetching data for {asset}: {e}")
                if attempt == retries - 1:
                    invalid_assets[asset] = str(e)
                time.sleep(2)  # Wait before retrying

    return valid_data, invalid_assets

# Fetch all data
print("Fetching data for indicators, commodities, and macro indicators...")
all_data, invalid_assets = fetch_data(all_assets, start="2015-01-01", end="2025-01-01")

# Save valid data
if all_data:
    combined_data = pd.concat(all_data).reset_index()
    combined_data.to_csv("indicators_commodities_macro_data.csv", index=False)
    print("Data saved as 'indicators_commodities_macro_data.csv'.")
else:
    print("No valid data fetched.")

# Log invalid fetches
if invalid_assets:
    with open("invalid_indicators_log.txt", "w") as log_file:
        log_file.write("Invalid Indicators, Commodities, and Macro Indicators:\n")
        for asset, reason in invalid_assets.items():
            log_file.write(f"{asset}: {reason}\n")
    print("Invalid data logged.")




Fetching data for indicators, commodities, and macro indicators...
Fetching data for ICLN (Attempt 1)...
Data for ICLN fetched successfully.
Fetching data for PBW (Attempt 1)...
Data for PBW fetched successfully.
Fetching data for TAN (Attempt 1)...
Data for TAN fetched successfully.
Fetching data for XLE (Attempt 1)...
Data for XLE fetched successfully.
Fetching data for VDE (Attempt 1)...
Data for VDE fetched successfully.
Fetching data for SPY (Attempt 1)...
Data for SPY fetched successfully.
Fetching data for QQQ (Attempt 1)...
Data for QQQ fetched successfully.
Fetching data for VIX (Attempt 1)...
Data for VIX fetched successfully.
Fetching data for EEM (Attempt 1)...
Data for EEM fetched successfully.
Fetching data for LIT (Attempt 1)...
Data for LIT fetched successfully.
Fetching data for URA (Attempt 1)...
Data for URA fetched successfully.
Fetching data for HYG (Attempt 1)...
Data for HYG fetched successfully.
Fetching data for EURUSD=X (Attempt 1)...
Data for EURUSD=X fetched

In [3]:
# Cleaning fetched data
print("Cleaning fetched data...")
file_path = "indicators_commodities_macro_data.csv"
data = pd.read_csv(file_path)

# Step 1: Drop metadata row
data = data.iloc[1:].reset_index(drop=True)

# Step 2: Clean column names
data.columns = [col.replace('.', '_') for col in data.columns]

# Step 3: Restructure dataset
assets = indicators + commodities + macro_indicators
long_data = pd.DataFrame()

# Process the first asset (base columns)
base_columns = ['Close', 'High', 'Low', 'Open', 'Volume']
temp = data[['Date'] + base_columns].copy()
temp.columns = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']
temp['Asset'] = assets[0]  # Assuming the first asset
temp['Type'] = 'Indicator'
long_data = pd.concat([long_data, temp], ignore_index=True)

# Process other assets
for i, asset in enumerate(assets[1:]):
    asset_columns = [f'{col}_{i+1}' for col in base_columns if f'{col}_{i+1}' in data.columns]
    if asset_columns:
        temp = data[['Date'] + asset_columns].copy()
        temp.columns = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']
        temp['Asset'] = asset
        temp['Type'] = 'Indicator'
        long_data = pd.concat([long_data, temp], ignore_index=True)

# Step 4: Convert data types
long_data['Date'] = pd.to_datetime(long_data['Date'], errors='coerce')
numeric_cols = ['Close', 'High', 'Low', 'Open', 'Volume']
for col in numeric_cols:
    long_data[col] = pd.to_numeric(long_data[col], errors='coerce')

# Step 5: Filter rows with missing numeric data
long_data = long_data.dropna(subset=numeric_cols, how='all')

# Step 6: Save cleaned dataset
output_path = "final_indicators_commodities_macro_data.csv"
long_data.to_csv(output_path, index=False)
print(f"Final cleaned data saved to {output_path}")

# Display dataset summary
print(long_data.info())
print(long_data.head())

Cleaning fetched data...


  data = pd.read_csv(file_path)


Final cleaned data saved to final_indicators_commodities_macro_data.csv
<class 'pandas.core.frame.DataFrame'>
Index: 66119 entries, 0 to 1785212
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    66119 non-null  datetime64[ns]
 1   Close   66119 non-null  float64       
 2   High    66119 non-null  float64       
 3   Low     66119 non-null  float64       
 4   Open    66119 non-null  float64       
 5   Volume  66119 non-null  float64       
 6   Asset   66119 non-null  object        
 7   Type    66119 non-null  object        
dtypes: datetime64[ns](1), float64(5), object(2)
memory usage: 4.5+ MB
None
        Date     Close      High       Low      Open    Volume Asset  \
0 2015-01-02  8.124761  8.199759  8.033097  8.199759   52200.0  ICLN   
1 2015-01-05  7.949766  8.166426  7.891434  8.166426   34500.0  ICLN   
2 2015-01-06  7.933097  8.033094  7.874765  7.999761   18100.0  ICLN   
3 2015-01-07  8.008