In [6]:
import yfinance as yf
import pandas as pd
import time

# Define stock companies, oil companies, wind energy companies, macro indicators, and separate commodities
solar_companies = ['FSLR', 'ENPH', 'SEDG', 'CSIQ', 'RUN', 'JKS', 'NEE', 'TPIC', 'ORA', 'MAXN']
oil_companies = ['XOM', 'CVX', 'BP', 'SHEL', 'TTE']  # ExxonMobil, Chevron, BP, Shell (SHEL), TotalEnergies (TTE)
wind_companies = ['VWS.CO', 'NEO.TO', 'TPIC', 'ORK.OL']  # Vestas, Neoen, TPI Composites, Orkla
indicators = ['ICLN', 'PBW', 'TAN', 'XLE', 'VDE', 'SPY', 'QQQ', 'VIX', 'EEM', 'LIT', 'URA', 'HYG', 'EURUSD=X', 'GBPUSD=X']
commodities = ['CL=F', 'NG=F', 'DXY', 'GC=F', 'SI=F']  # Crude oil, natural gas, USD index, gold, silver
macro_indicators = ['^IRX', '^TNX', '^TYX', 'MSCI', '^FTSE', '^N225', 'BTC-USD', 'ETH-USD']  # Treasury yields, MSCI, global indices, crypto

# Function to fetch data with retries
def fetch_data(assets, start, end, retries=3):
    valid_data = []
    invalid_assets = {}

    for asset in assets:
        for attempt in range(retries):
            try:
                print(f"Fetching data for {asset} (Attempt {attempt + 1})...")
                data = yf.download(asset, start=start, end=end, progress=False)
                if not data.empty:
                    print(f"Data for {asset} fetched successfully.")
                    data['Asset'] = asset
                    valid_data.append(data)
                    break
                else:
                    print(f"Data for {asset} is empty. Retrying...")
            except Exception as e:
                print(f"Error fetching data for {asset}: {e}")
                if attempt == retries - 1:
                    invalid_assets[asset] = str(e)
                time.sleep(2)  # Wait before retrying

    return valid_data, invalid_assets

# Fetch solar company data
print("Fetching solar company data...")
solar_data, invalid_solar = fetch_data(solar_companies, start="2015-01-01", end="2025-01-01")
if solar_data:
    solar_df = pd.concat(solar_data).reset_index()
    solar_df.to_csv("solar_company_data.csv", index=False)
    print("Solar company data saved as 'solar_company_data.csv'.")
else:
    print("No valid solar company data fetched.")

# Fetch oil company data
print("Fetching oil company data...")
oil_data, invalid_oil = fetch_data(oil_companies, start="2015-01-01", end="2025-01-01")
if oil_data:
    oil_df = pd.concat(oil_data).reset_index()
    oil_df.to_csv("oil_company_data.csv", index=False)
    print("Oil company data saved as 'oil_company_data.csv'.")
else:
    print("No valid oil company data fetched.")

# Fetch wind company data
print("Fetching wind energy company data...")
wind_data, invalid_wind = fetch_data(wind_companies, start="2015-01-01", end="2025-01-01")
if wind_data:
    wind_df = pd.concat(wind_data).reset_index()
    wind_df.to_csv("wind_company_data.csv", index=False)
    print("Wind energy company data saved as 'wind_company_data.csv'.")
else:
    print("No valid wind energy company data fetched.")

# Fetch indicator data
print("Fetching indicator data...")
indicator_data, invalid_indicators = fetch_data(indicators, start="2015-01-01", end="2025-01-01")
if indicator_data:
    indicator_df = pd.concat(indicator_data).reset_index()
    indicator_df.to_csv("indicator_data.csv", index=False)
    print("Indicator data saved as 'indicator_data.csv'.")
else:
    print("No valid indicator data fetched.")

# Fetch commodities data
print("Fetching commodities data...")
commodities_data, invalid_commodities = fetch_data(commodities, start="2015-01-01", end="2025-01-01")
if commodities_data:
    commodities_df = pd.concat(commodities_data).reset_index()
    commodities_df.to_csv("commodities_data.csv", index=False)
    print("Commodities data saved as 'commodities_data.csv'.")
else:
    print("No valid commodities data fetched.")

# Fetch macro indicator data
print("Fetching macro indicators data...")
macro_data, invalid_macro = fetch_data(macro_indicators, start="2015-01-01", end="2025-01-01")
if macro_data:
    macro_df = pd.concat(macro_data).reset_index()
    macro_df.to_csv("macro_indicators_data.csv", index=False)
    print("Macro indicators data saved as 'macro_indicators_data.csv'.")
else:
    print("No valid macro indicators data fetched.")

# Save invalid fetches for review
with open("invalid_assets_log.txt", "w") as log_file:
    log_file.write("Invalid Solar Companies:\n")
    for stock, reason in invalid_solar.items():
        log_file.write(f"{stock}: {reason}\n")

    log_file.write("\nInvalid Oil Companies:\n")
    for stock, reason in invalid_oil.items():
        log_file.write(f"{stock}: {reason}\n")

    log_file.write("\nInvalid Wind Companies:\n")
    for stock, reason in invalid_wind.items():
        log_file.write(f"{stock}: {reason}\n")

    log_file.write("\nInvalid Indicators:\n")
    for indicator, reason in invalid_indicators.items():
        log_file.write(f"{indicator}: {reason}\n")

    log_file.write("\nInvalid Commodities:\n")
    for commodity, reason in invalid_commodities.items():
        log_file.write(f"{commodity}: {reason}\n")

    log_file.write("\nInvalid Macro Indicators:\n")
    for macro, reason in invalid_macro.items():
        log_file.write(f"{macro}: {reason}\n")

print("Fetching process completed. Check 'solar_company_data.csv', 'oil_company_data.csv', 'wind_company_data.csv', 'indicator_data.csv', 'commodities_data.csv', 'macro_indicators_data.csv', and 'invalid_assets_log.txt' for results.")


Fetching solar company data...
Fetching data for FSLR (Attempt 1)...
Data for FSLR fetched successfully.
Fetching data for ENPH (Attempt 1)...
Data for ENPH fetched successfully.
Fetching data for SEDG (Attempt 1)...
Data for SEDG fetched successfully.
Fetching data for CSIQ (Attempt 1)...
Data for CSIQ fetched successfully.
Fetching data for RUN (Attempt 1)...
Data for RUN fetched successfully.
Fetching data for JKS (Attempt 1)...
Data for JKS fetched successfully.
Fetching data for NEE (Attempt 1)...
Data for NEE fetched successfully.
Fetching data for TPIC (Attempt 1)...
Data for TPIC fetched successfully.
Fetching data for ORA (Attempt 1)...
Data for ORA fetched successfully.
Fetching data for MAXN (Attempt 1)...
Data for MAXN fetched successfully.
Solar company data saved as 'solar_company_data.csv'.
Fetching oil company data...
Fetching data for XOM (Attempt 1)...
Data for XOM fetched successfully.
Fetching data for CVX (Attempt 1)...
Data for CVX fetched successfully.
Fetching d

In [11]:
solar_stock = pd.read_csv('solar_company_data.csv')
print(solar_stock.info())
print(solar_stock.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23143 entries, 0 to 23142
Data columns (total 52 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      23142 non-null  object
 1   Close     2517 non-null   object
 2   High      2517 non-null   object
 3   Low       2517 non-null   object
 4   Open      2517 non-null   object
 5   Volume    2517 non-null   object
 6   Asset     23142 non-null  object
 7   Close.1   2517 non-null   object
 8   High.1    2517 non-null   object
 9   Low.1     2517 non-null   object
 10  Open.1    2517 non-null   object
 11  Volume.1  2517 non-null   object
 12  Close.2   2460 non-null   object
 13  High.2    2460 non-null   object
 14  Low.2     2460 non-null   object
 15  Open.2    2460 non-null   object
 16  Volume.2  2460 non-null   object
 17  Close.3   2517 non-null   object
 18  High.3    2517 non-null   object
 19  Low.3     2517 non-null   object
 20  Open.3    2517 non-null   object
 21  Volume.3  25

  solar_stock = pd.read_csv('solar_company_data.csv')


In [14]:
import pandas as pd

# Load the dataset
file_path = "solar_company_data.csv"
data = pd.read_csv(file_path)

# Step 1: Drop metadata row
data = data.iloc[1:].reset_index(drop=True)

# Step 2: Clean column names
data.columns = [col.replace('.', '_') for col in data.columns]

# Step 3: Debug - Check the first few rows and columns
print("Columns:", data.columns)
print(data.head())

# Step 4: Restructure the dataset
assets = ['FSLR', 'ENPH', 'SEDG', 'CSIQ', 'RUN', 'JKS', 'NEE', 'TPIC', 'ORA', 'MAXN']
long_data = pd.DataFrame()

for i, asset in enumerate(assets):
    # Identify columns for this asset
    asset_columns = [f'{col}_{i}' for col in ['Close', 'High', 'Low', 'Open', 'Volume'] if f'{col}_{i}' in data.columns]
    print(f"Processing {asset}: {asset_columns}")  # Debugging: Check the columns being processed
    if asset_columns:
        temp = data[['Date'] + asset_columns].copy()
        temp.columns = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']
        temp['Asset'] = asset
        temp['Type'] = 'Stock'
        long_data = pd.concat([long_data, temp], ignore_index=True)

# Step 5: Convert data types
long_data['Date'] = pd.to_datetime(long_data['Date'], errors='coerce')
numeric_cols = ['Close', 'High', 'Low', 'Open', 'Volume']
for col in numeric_cols:
    long_data[col] = pd.to_numeric(long_data[col], errors='coerce')

# Step 6: Save cleaned dataset
output_path = "debugged_solar_company_data.csv"
long_data.to_csv(output_path, index=False)
print(f"Debugged data saved to {output_path}")

# Display dataset summary
print(long_data.info())
print(long_data.head())




  data = pd.read_csv(file_path)


Columns: Index(['Date', 'Close', 'High', 'Low', 'Open', 'Volume', 'Asset', 'Close_1',
       'High_1', 'Low_1', 'Open_1', 'Volume_1', 'Close_2', 'High_2', 'Low_2',
       'Open_2', 'Volume_2', 'Close_3', 'High_3', 'Low_3', 'Open_3',
       'Volume_3', 'Close_4', 'High_4', 'Low_4', 'Open_4', 'Volume_4',
       'Close_5', 'High_5', 'Low_5', 'Open_5', 'Volume_5', 'Close_6', 'High_6',
       'Low_6', 'Open_6', 'Volume_6', 'Close_7', 'High_7', 'Low_7', 'Open_7',
       'Volume_7', 'Close_8', 'High_8', 'Low_8', 'Open_8', 'Volume_8',
       'Close_9', 'High_9', 'Low_9', 'Open_9', 'Volume_9'],
      dtype='object')
         Date               Close                High                 Low  \
0  2015-01-02   44.54999923706055   45.34000015258789  43.720001220703125   
1  2015-01-05   41.83000183105469   43.97999954223633    41.5099983215332   
2  2015-01-06   40.86000061035156    42.0099983215332  39.880001068115234   
3  2015-01-07               41.75  42.470001220703125  41.400001525878906   


In [15]:
import pandas as pd

# Load the dataset
file_path = "solar_company_data.csv"
data = pd.read_csv(file_path)

# Step 1: Drop metadata row
data = data.iloc[1:].reset_index(drop=True)

# Step 2: Clean column names
data.columns = [col.replace('.', '_') for col in data.columns]

# Step 3: Restructure dataset
assets = ['FSLR', 'ENPH', 'SEDG', 'CSIQ', 'RUN', 'JKS', 'NEE', 'TPIC', 'ORA', 'MAXN']
long_data = pd.DataFrame()

# Process FSLR (base columns)
base_columns = ['Close', 'High', 'Low', 'Open', 'Volume']
temp = data[['Date'] + base_columns].copy()
temp.columns = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']
temp['Asset'] = 'FSLR'
temp['Type'] = 'Stock'
long_data = pd.concat([long_data, temp], ignore_index=True)

# Process other assets
for i, asset in enumerate(assets[1:]):  # Skip FSLR as it was already processed
    asset_columns = [f'{col}_{i+1}' for col in base_columns if f'{col}_{i+1}' in data.columns]
    if asset_columns:
        temp = data[['Date'] + asset_columns].copy()
        temp.columns = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']
        temp['Asset'] = asset
        temp['Type'] = 'Stock'
        long_data = pd.concat([long_data, temp], ignore_index=True)

# Step 4: Convert data types
long_data['Date'] = pd.to_datetime(long_data['Date'], errors='coerce')
numeric_cols = ['Close', 'High', 'Low', 'Open', 'Volume']
for col in numeric_cols:
    long_data[col] = pd.to_numeric(long_data[col], errors='coerce')

# Step 5: Filter rows with missing numeric data
long_data = long_data.dropna(subset=numeric_cols, how='all')

# Step 6: Save cleaned dataset
output_path = "final_solar_company_data.csv"
long_data.to_csv(output_path, index=False)
print(f"Final cleaned data saved to {output_path}")

# Display dataset summary
print(long_data.info())
print(long_data.head())




  data = pd.read_csv(file_path)


Final cleaned data saved to final_solar_company_data.csv
<class 'pandas.core.frame.DataFrame'>
Index: 23142 entries, 0 to 231419
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    23142 non-null  datetime64[ns]
 1   Close   23142 non-null  float64       
 2   High    23142 non-null  float64       
 3   Low     23142 non-null  float64       
 4   Open    23142 non-null  float64       
 5   Volume  23142 non-null  float64       
 6   Asset   23142 non-null  object        
 7   Type    23142 non-null  object        
dtypes: datetime64[ns](1), float64(5), object(2)
memory usage: 1.6+ MB
None
        Date      Close       High        Low       Open     Volume Asset  \
0 2015-01-02  44.549999  45.340000  43.720001  44.720001  1873800.0  FSLR   
1 2015-01-05  41.830002  43.980000  41.509998  43.880001  3668000.0  FSLR   
2 2015-01-06  40.860001  42.009998  39.880001  42.009998  3712200.0  FSLR   
3 2015-01-07  4

In [16]:
# Display unique values in the Asset column
unique_assets = long_data['Asset'].unique()
print("Unique Assets:", unique_assets)


Unique Assets: ['FSLR' 'ENPH' 'SEDG' 'CSIQ' 'RUN' 'JKS' 'NEE' 'TPIC' 'ORA' 'MAXN']


In [17]:
import pandas as pd

# Load the dataset
file_path = "final_solar_company_data.csv"
df = pd.read_csv(file_path)

# Ensure Date is in datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Feature Engineering

# 1. Daily Range
df['Daily_Range'] = df['High'] - df['Low']
df['Normalized_Range'] = df['Daily_Range'] / df['Close']

# 2. Daily Return
df['Daily_Return'] = (df['Close'] - df['Open']) / df['Open']

# 3. Moving Averages
df['MA_7'] = df.groupby('Asset')['Close'].transform(lambda x: x.rolling(window=7).mean())
df['MA_30'] = df.groupby('Asset')['Close'].transform(lambda x: x.rolling(window=30).mean())

# 4. Momentum
df['Momentum'] = df.groupby('Asset')['Close'].transform(lambda x: x.diff())

# 5. Volume Metrics
df['Volume_Change'] = df.groupby('Asset')['Volume'].transform(lambda x: x.pct_change())
df['Volume_to_Price'] = df['Volume'] / df['Close']

# 6. Lag Features
df['Lag_Close_1'] = df.groupby('Asset')['Close'].shift(1)
df['Lag_Close_7'] = df.groupby('Asset')['Close'].shift(7)

# 7. Weighted Average Price (WAP) for the market
df['Weighted_Avg_Price'] = df['Close'] * df['Volume']

# 8. Calculate Overall Market WAP
market_wap = df.groupby('Date').apply(lambda x: x['Weighted_Avg_Price'].sum() / x['Volume'].sum())
market_wap = market_wap.reset_index(name='Market_WAP')

# Merge WAP back into the dataset
df = df.merge(market_wap, on='Date', how='left')

# 9. Moving Average of Market WAP
df['Market_WAP_MA_30'] = df['Market_WAP'].rolling(window=30).mean()

# 10. Determine Bull/Bear Market
df['Market_Trend'] = df.apply(
    lambda row: 'Bull' if row['Market_WAP'] > row['Market_WAP_MA_30'] else 'Bear',
    axis=1
)

# Drop rows with NaN values caused by rolling windows
df = df.dropna()

# Save the enhanced dataset
output_path = "enhanced_solar_company_data.csv"
df.to_csv(output_path, index=False)
print(f"Enhanced dataset with all features saved to '{output_path}'.")

# Display the first few rows of the enhanced dataset
print(df.head())


  market_wap = df.groupby('Date').apply(lambda x: x['Weighted_Avg_Price'].sum() / x['Volume'].sum())


Enhanced dataset with all features saved to 'enhanced_solar_company_data.csv'.
         Date      Close       High        Low       Open     Volume Asset  \
29 2015-02-13  48.840000  49.990002  48.619999  49.639999  2695900.0  FSLR   
30 2015-02-17  49.619999  49.849998  48.330002  49.160000  2467800.0  FSLR   
31 2015-02-18  48.840000  50.790001  48.590000  49.599998  3780900.0  FSLR   
32 2015-02-19  48.410000  48.980000  47.349998  48.070000  1971700.0  FSLR   
33 2015-02-20  49.020000  49.619999  48.200001  48.660000  2091000.0  FSLR   

     Type  Daily_Range  Normalized_Range  ...      MA_30  Momentum  \
29  Stock     1.370003          0.028051  ...  44.092667 -0.579998   
30  Stock     1.519997          0.030633  ...  44.261667  0.779999   
31  Stock     2.200001          0.045045  ...  44.495334 -0.779999   
32  Stock     1.630001          0.033671  ...  44.747000 -0.430000   
33  Stock     1.419998          0.028968  ...  44.989334  0.610001   

    Volume_Change  Volume_to_Pr

In [18]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Load the enhanced dataset
file_path = "enhanced_solar_company_data.csv"
df = pd.read_csv(file_path)

# Ensure Date is in datetime format
df['Date'] = pd.to_datetime(df['Date'])

# 1. Exponential Moving Averages (EMA)
df['EMA_7'] = df.groupby('Asset')['Close'].transform(lambda x: x.ewm(span=7, adjust=False).mean())
df['EMA_30'] = df.groupby('Asset')['Close'].transform(lambda x: x.ewm(span=30, adjust=False).mean())

# 2. Bollinger Bands
df['Bollinger_Upper'] = df['MA_30'] + 2 * df.groupby('Asset')['Close'].transform(lambda x: x.rolling(window=30).std())
df['Bollinger_Lower'] = df['MA_30'] - 2 * df.groupby('Asset')['Close'].transform(lambda x: x.rolling(window=30).std())

# 3. Seasonal Features
df['Day_of_Week'] = df['Date'].dt.dayofweek  # 0 = Monday, 6 = Sunday
df['Month'] = df['Date'].dt.month  # 1 = January, 12 = December
df['Quarter'] = df['Date'].dt.quarter  # 1, 2, 3, or 4

# 4. Cumulative Returns
df['Cumulative_Return'] = df.groupby('Asset')['Close'].transform(lambda x: (x / x.iloc[0]) - 1)

# 5. Trend Strength (Slope of Linear Regression over 30-day rolling window)
def calculate_slope(series):
    if len(series) < 30:  # Ignore if less than 30 points
        return np.nan
    x = np.arange(len(series)).reshape(-1, 1)
    y = series.values.reshape(-1, 1)
    model = LinearRegression().fit(x, y)
    return model.coef_[0][0]

df['Trend_Strength'] = df.groupby('Asset')['Close'].transform(
    lambda x: x.rolling(window=30).apply(calculate_slope, raw=False)
)

# Drop rows with NaN values caused by rolling windows
df = df.dropna()

# Save the updated dataset with advanced features
output_path = "advanced_enhanced_solar_company_data.csv"
df.to_csv(output_path, index=False)
print(f"Dataset with advanced features saved to '{output_path}'.")

# Display the first few rows of the updated dataset
df.head()


Dataset with advanced features saved to 'advanced_enhanced_solar_company_data.csv'.


Unnamed: 0,Date,Close,High,Low,Open,Volume,Asset,Type,Daily_Range,Normalized_Range,...,Market_Trend,EMA_7,EMA_30,Bollinger_Upper,Bollinger_Lower,Day_of_Week,Month,Quarter,Cumulative_Return,Trend_Strength
29,2015-03-27,59.610001,60.09,58.790001,59.400002,1321400.0,FSLR,Stock,1.299999,0.021808,...,Bear,60.257839,58.021932,67.327261,48.529406,4,3,1,0.220516,0.396051
30,2015-03-30,60.689999,61.049999,59.720001,59.720001,1535400.0,FSLR,Stock,1.329998,0.021915,...,Bear,60.365879,58.194065,67.118411,49.528256,0,3,1,0.242629,0.35119
31,2015-03-31,59.790001,60.689999,59.419998,60.029999,1797300.0,FSLR,Stock,1.27,0.021241,...,Bear,60.221909,58.297029,66.830966,50.493701,1,3,1,0.224201,0.30063
32,2015-04-01,60.830002,61.310001,59.009998,59.84,1759000.0,FSLR,Stock,2.300003,0.03781,...,Bear,60.373932,58.460446,66.369966,51.754035,2,4,2,0.245496,0.246874
33,2015-04-02,61.189999,61.950001,60.779999,61.0,1228100.0,FSLR,Stock,1.170002,0.019121,...,Bear,60.577949,58.636546,65.622303,53.353698,3,4,2,0.252866,0.187141
