In [1]:
import yfinance as yf
import pandas as pd
import time

# Define wind companies
wind_companies = ['VWS.CO', 'NEO.TO', 'TPIC', 'ORK.OL']  # Vestas, Neoen, TPI Composites, Orkla

# Function to fetch data with retries
def fetch_data(assets, start, end, retries=3):
    valid_data = []
    invalid_assets = {}

    for asset in assets:
        for attempt in range(retries):
            try:
                print(f"Fetching data for {asset} (Attempt {attempt + 1})...")
                data = yf.download(asset, start=start, end=end, progress=False)
                if not data.empty:
                    print(f"Data for {asset} fetched successfully.")
                    data['Asset'] = asset
                    valid_data.append(data)
                    break
                else:
                    print(f"Data for {asset} is empty. Retrying...")
            except Exception as e:
                print(f"Error fetching data for {asset}: {e}")
                if attempt == retries - 1:
                    invalid_assets[asset] = str(e)
                time.sleep(2)  # Wait before retrying

    return valid_data, invalid_assets

# Fetch wind company data
print("Fetching wind company data...")
wind_data, invalid_wind = fetch_data(wind_companies, start="2015-01-01", end="2025-01-01")
if wind_data:
    wind_df = pd.concat(wind_data).reset_index()
    wind_df.to_csv("wind_company_data.csv", index=False)
    print("Wind company data saved as 'wind_company_data.csv'.")
else:
    print("No valid wind company data fetched.")

# Log invalid fetches
if invalid_wind:
    with open("invalid_wind_assets_log.txt", "w") as log_file:
        log_file.write("Invalid Wind Companies:\n")
        for stock, reason in invalid_wind.items():
            log_file.write(f"{stock}: {reason}\n")
    print("Invalid wind company data logged.")


Fetching wind company data...
Fetching data for VWS.CO (Attempt 1)...
Data for VWS.CO fetched successfully.
Fetching data for NEO.TO (Attempt 1)...
Data for NEO.TO fetched successfully.
Fetching data for TPIC (Attempt 1)...
Data for TPIC fetched successfully.
Fetching data for ORK.OL (Attempt 1)...
Data for ORK.OL fetched successfully.
Wind company data saved as 'wind_company_data.csv'.


In [2]:
import pandas as pd

# Load the dataset
file_path = "wind_company_data.csv"
data = pd.read_csv(file_path)

# Step 1: Drop metadata row
data = data.iloc[1:].reset_index(drop=True)

# Step 2: Clean column names
data.columns = [col.replace('.', '_') for col in data.columns]

# Step 3: Restructure dataset
wind_companies = ['VWS.CO', 'NEO.TO', 'TPIC', 'ORK.OL']
long_data = pd.DataFrame()

# Process the first company (base columns)
base_columns = ['Close', 'High', 'Low', 'Open', 'Volume']
temp = data[['Date'] + base_columns].copy()
temp.columns = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']
temp['Asset'] = 'VWS.CO'  # Assuming VWS.CO as the first company
temp['Type'] = 'Wind'
long_data = pd.concat([long_data, temp], ignore_index=True)

# Process other companies
for i, company in enumerate(wind_companies[1:]):  # Skip VWS.CO as it was already processed
    company_columns = [f'{col}_{i+1}' for col in base_columns if f'{col}_{i+1}' in data.columns]
    if company_columns:
        temp = data[['Date'] + company_columns].copy()
        temp.columns = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']
        temp['Asset'] = company
        temp['Type'] = 'Wind'
        long_data = pd.concat([long_data, temp], ignore_index=True)

# Step 4: Convert data types
long_data['Date'] = pd.to_datetime(long_data['Date'], errors='coerce')
numeric_cols = ['Close', 'High', 'Low', 'Open', 'Volume']
for col in numeric_cols:
    long_data[col] = pd.to_numeric(long_data[col], errors='coerce')

# Step 5: Filter rows with missing numeric data
long_data = long_data.dropna(subset=numeric_cols, how='all')

# Step 6: Save cleaned dataset
output_path = "final_wind_company_data.csv"
long_data.to_csv(output_path, index=False)
print(f"Final cleaned data saved to {output_path}")

# Display dataset summary
print(long_data.info())
print(long_data.head())


Final cleaned data saved to final_wind_company_data.csv
<class 'pandas.core.frame.DataFrame'>
Index: 8909 entries, 0 to 35635
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    8909 non-null   datetime64[ns]
 1   Close   8909 non-null   float64       
 2   High    8909 non-null   float64       
 3   Low     8909 non-null   float64       
 4   Open    8909 non-null   float64       
 5   Volume  8909 non-null   float64       
 6   Asset   8909 non-null   object        
 7   Type    8909 non-null   object        
dtypes: datetime64[ns](1), float64(5), object(2)
memory usage: 626.4+ KB
None
        Date      Close       High        Low       Open      Volume   Asset  \
0 2015-01-02  43.511150  43.764335  41.775046  42.136733  14235455.0  VWS.CO   
1 2015-01-05  43.022873  44.288781  42.842028  43.493065  13117040.0  VWS.CO   
2 2015-01-06  45.012157  45.193002  42.028226  42.661182  17033090.0  VWS.CO   
3 20