In [4]:
import yfinance as yf
import pandas as pd
import time

# Ticker symbols of the oil companies whose price history is downloaded below.
oil_companies = ['XOM', 'CVX', 'BP', 'SHEL', 'TTE']  # ExxonMobil, Chevron, BP, Shell (SHEL), TotalEnergies (TTE)

# Function to fetch data with retries
def fetch_data(assets, start, end, retries=3):
    """Download price history for each asset, retrying failed attempts.

    Parameters
    ----------
    assets : iterable of str
        Ticker symbols passed one at a time to ``yf.download``.
    start, end : str
        Date bounds forwarded unchanged to ``yf.download``.
    retries : int, default 3
        Maximum number of download attempts per asset.

    Returns
    -------
    valid_data : list
        One non-empty frame per successfully fetched asset, each with an
        added ``'Asset'`` column holding the ticker.
    invalid_assets : dict
        Maps every asset that could not be fetched to the reason of its last
        failed attempt: the exception text, or ``"No data returned"`` when the
        download came back empty.
    """
    valid_data = []
    invalid_assets = {}

    for asset in assets:
        last_error = "No data returned"
        for attempt in range(retries):
            try:
                print(f"Fetching data for {asset} (Attempt {attempt + 1})...")
                data = yf.download(asset, start=start, end=end, progress=False)
                if not data.empty:
                    print(f"Data for {asset} fetched successfully.")
                    data['Asset'] = asset
                    valid_data.append(data)
                    break
                print(f"Data for {asset} is empty. Retrying...")
                last_error = "No data returned"
            except Exception as e:
                print(f"Error fetching data for {asset}: {e}")
                last_error = str(e)

            # Pause only when another attempt will follow; sleeping after the
            # final attempt just delays the next asset.
            if attempt < retries - 1:
                time.sleep(2)
        else:
            # Reached only when no attempt hit ``break``: record the failure,
            # including the "always empty" case that used to vanish silently.
            invalid_assets[asset] = last_error

    return valid_data, invalid_assets

# Fetch oil company data
print("Fetching oil company data...")
oil_data, invalid_oil = fetch_data(oil_companies, start="2015-01-01", end="2025-01-01")

# Surface tickers that could not be downloaded instead of silently dropping them.
for failed_asset, reason in invalid_oil.items():
    print(f"Could not fetch {failed_asset}: {reason}")

if oil_data:
    # Stack the per-ticker frames and turn the index into a regular column
    # before writing the CSV that the cleaning cell reads back.
    oil_df = pd.concat(oil_data).reset_index()
    oil_df.to_csv("oil_company_data.csv", index=False)
    print("Oil company data saved as 'oil_company_data.csv'.")
    print("Fetching process completed. Check 'oil_company_data.csv' for results.")
else:
    # Nothing was written, so do not point the reader at a file.
    print("No valid oil company data fetched.")
    print("Fetching process completed. No file was written.")


Fetching oil company data...
Fetching data for XOM (Attempt 1)...
Data for XOM fetched successfully.
Fetching data for CVX (Attempt 1)...
Data for CVX fetched successfully.
Fetching data for BP (Attempt 1)...
Data for BP fetched successfully.
Fetching data for SHEL (Attempt 1)...
Data for SHEL fetched successfully.
Fetching data for TTE (Attempt 1)...
Data for TTE fetched successfully.
Oil company data saved as 'oil_company_data.csv'.
Fetching process completed. Check 'oil_company_data.csv'for results.


In [8]:
import pandas as pd

# Load the dataset
file_path = "oil_company_data.csv"
data = pd.read_csv(file_path)

# Step 1: Clean column names
# The restructuring below looks columns up as 'Close', 'Close_1', 'Close_2', ...,
# so any '.' in a header (e.g. 'Close.1') is normalised to '_'.
data.columns = [col.replace('.', '_') for col in data.columns]

# Step 2: Split off the metadata row
# The first row is not price data. Keep a copy before dropping it: it
# presumably holds the ticker symbol for each price column (TODO confirm
# against the CSV). It is only trusted below when it names a known ticker.
metadata_row = data.iloc[0] if len(data) else pd.Series(dtype=object)
data = data.iloc[1:].reset_index(drop=True)

# Step 3: Restructure dataset (one block of price columns per company -> long)
oil_companies = ['XOM', 'CVX', 'BP', 'SHEL', 'TTE']
base_columns = ['Close', 'High', 'Low', 'Open', 'Volume']

frames = []
for position, fallback_asset in enumerate(oil_companies):
    # The first block uses the bare names; later blocks carry a numeric suffix.
    suffix = '' if position == 0 else f'_{position}'
    rename_map = {
        f'{col}{suffix}': col
        for col in base_columns
        if f'{col}{suffix}' in data.columns
    }
    if not rename_map:
        continue

    # Prefer the ticker recorded in the metadata row; fall back to the
    # positional guess when that row does not name exactly one known ticker.
    recorded = {str(metadata_row.get(src)).strip() for src in rename_map}
    known = recorded & set(oil_companies)
    asset = known.pop() if len(known) == 1 else fallback_asset

    temp = (
        data[['Date'] + list(rename_map)]
        .rename(columns=rename_map)
        # Guarantee every base column exists (NaN-filled if absent) so a
        # partially present block cannot break the steps below.
        .reindex(columns=['Date'] + base_columns)
        .assign(Asset=asset, Type='Oil')
    )
    frames.append(temp)

if not frames:
    raise ValueError(f"No price columns found in {file_path}")

# Concatenate once rather than growing the frame inside the loop.
long_data = pd.concat(frames, ignore_index=True)

# Step 4: Convert data types
long_data['Date'] = pd.to_datetime(long_data['Date'], errors='coerce')
numeric_cols = ['Close', 'High', 'Low', 'Open', 'Volume']
for col in numeric_cols:
    long_data[col] = pd.to_numeric(long_data[col], errors='coerce')

# Step 5: Filter rows with missing numeric data
# Only rows where every numeric column is missing are removed; the index is
# intentionally left as-is (not reset) after the filter.
long_data = long_data.dropna(subset=numeric_cols, how='all')

# Step 6: Save cleaned dataset
output_path = "final_oil_company_data.csv"
long_data.to_csv(output_path, index=False)
print(f"Final cleaned data saved to {output_path}")

# Display dataset summary
# info() prints its own report and returns None, so it is not wrapped in print().
long_data.info()
print(long_data.head())
print(long_data.tail())


Final cleaned data saved to final_oil_company_data.csv
<class 'pandas.core.frame.DataFrame'>
Index: 12580 entries, 0 to 62899
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    12580 non-null  datetime64[ns]
 1   Close   12580 non-null  float64       
 2   High    12580 non-null  float64       
 3   Low     12580 non-null  float64       
 4   Open    12580 non-null  float64       
 5   Volume  12580 non-null  float64       
 6   Asset   12580 non-null  object        
 7   Type    12580 non-null  object        
dtypes: datetime64[ns](1), float64(5), object(2)
memory usage: 884.5+ KB
None
        Date      Close       High        Low       Open      Volume Asset  \
0 2015-01-02  60.042545  60.184842  59.382805  59.667400  10220400.0   XOM   
1 2015-01-05  58.399666  59.764416  57.888692  59.570374  18502400.0   XOM   
2 2015-01-06  58.089180  59.124065  57.578207  58.367305  16670700.0   XOM   
3 2015-01-07