## 4-2 & 5-2 Research Hypothesis 2

In [1]:
# Import the necessary libraries
import os
from itertools import combinations

import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
from timezonefinder import TimezoneFinder

### Load the datasets

In [2]:
# Locations whose CSV exports are read from disk
state_list = ['Texas', 'Iowa', 'Nevada', 'Seattle']

# Directory that holds the per-location CSV files
base_folder_path = './Data/'

# Read '<state>_0_18.csv' for every entry in state_list and keep the
# resulting DataFrame under the key 'df_<state>'
dfs = {
    f'df_{state}': pd.read_csv(os.path.join(base_folder_path, f'{state}_0_18.csv'))
    for state in state_list
}

# Show which DataFrames ended up in 'dfs'
loaded_datasets = list(dfs.keys())
print(loaded_datasets)

['df_Texas', 'df_Iowa', 'df_Nevada', 'df_Seattle']


### Convert the datetime from UTC to LST (Local Standard Time)

In [3]:
# (latitude, longitude) pair used to look up each location's timezone
locations = {
    'Texas': (30.6095651015, -96.3402969602),
    'Nevada': (36.089758655, -115.1833049303),
    'Iowa': (42.0135891695, -93.6356966723),
    'Seattle': (48.6108182606, -123.1479221551)
}

# TimezoneFinder resolves a coordinate pair to a timezone name
tf = TimezoneFinder()

# NOTE(review): tz_convert to a named zone yields local wall-clock time, which
# includes daylight saving time wherever the zone observes it. If strict Local
# Standard Time is intended, a fixed-offset conversion is needed -- confirm.
for df_key, frame in dfs.items():
    # Keys have the form 'df_<state>'; the second '_'-separated token is the state
    state = df_key.split('_')[1]
    latitude, longitude = locations[state]
    local_timezone = tf.timezone_at(lat=latitude, lng=longitude)

    # Parse the timestamps and label them as UTC
    utc_times = pd.to_datetime(frame['datetime']).dt.tz_localize('UTC')

    # Shift to the location's timezone, then drop the tz info so the column
    # holds naive local timestamps
    local_times = utc_times.dt.tz_convert(local_timezone)
    frame['datetime'] = local_times.dt.tz_localize(None)

### Keep only the 12:00 and 15:00 rows with forecast values 0 and 3

In [4]:
# Times of day (exact matches, not a range) and forecast values to keep
desired_times = ['12:00:00', '15:00:00']
desired_forecasts = [0, 3]

for df_key, frame in dfs.items():
    # Make sure 'datetime' is a pandas datetime column before using .dt
    frame['datetime'] = pd.to_datetime(frame['datetime'])

    # True where the time of day is exactly 12:00:00 or 15:00:00
    at_desired_time = frame['datetime'].dt.strftime('%H:%M:%S').isin(desired_times)

    # True where 'forecast' is 0 or 3
    has_desired_forecast = frame['forecast'].isin(desired_forecasts)

    # Keep rows satisfying both conditions and renumber the index from 0
    dfs[df_key] = frame[at_desired_time & has_desired_forecast].reset_index(drop=True)

### Calculate differences from the observed values (rows where forecast == 0)

In [5]:
# The 12:00/15:00 and forecast 0/3 filtering is already applied by the
# preceding cell, so it is not repeated here.

# Variables for which a '<name>_error' column is computed
error_variables = ['ta', 'GHI', 'rh', 'va', 'cloud', 'WBGT']

for df_key, frame in dfs.items():
    # Make sure 'datetime' is a pandas datetime column before merging on it
    frame['datetime'] = pd.to_datetime(frame['datetime'])

    # Reference values: the rows with forecast == 0, restricted to the merge
    # key and the variables of interest
    reference_df = frame.loc[frame['forecast'] == 0, ['datetime'] + error_variables]

    # Attach the reference values (suffixed '_ref') to every row that shares
    # the same datetime. Rows with forecast == 0 are matched with themselves.
    # NOTE(review): assumes at most one forecast == 0 row per datetime;
    # duplicates would multiply rows in the merge -- confirm.
    merged_df = frame.merge(reference_df, on='datetime', suffixes=('', '_ref'))

    # Error = value in the row minus the reference value at the same datetime
    for variable in error_variables:
        merged_df[f'{variable}_error'] = merged_df[variable] - merged_df[f'{variable}_ref']

    # Store the result back under the same key
    dfs[df_key] = merged_df

### Combine into one DataFrame

In [11]:
# Error columns carried into the combined table, and the keys used to align rows
error_columns = ['GHI_error', 'cloud_error', 'ta_error']
merge_keys = ['datetime', 'forecast']


def _state_errors(state):
    """Return the error columns of dfs['df_<state>'], each suffixed with '_<state>'.

    The 'datetime' and 'forecast' columns are kept under their original names
    so they can serve as merge keys.
    """
    subset = dfs[f'df_{state}'][['datetime'] + error_columns + ['forecast']]
    return subset.rename(columns={column: f'{column}_{state}' for column in error_columns})


df_T = _state_errors('Texas')
df_I = _state_errors('Iowa')
df_N = _state_errors('Nevada')
df_S = _state_errors('Seattle')

# Inner-join the four tables on (datetime, forecast). After the renaming above
# the only shared column names are the merge keys, so no suffixes are needed.
combined_df = df_T
for other_df in (df_I, df_N, df_S):
    combined_df = pd.merge(combined_df, other_df, on=merge_keys, how='inner')

combined_df.head()

Unnamed: 0,datetime,GHI_error_Texas,cloud_error_Texas,ta_error_Texas,forecast,GHI_error_Iowa,cloud_error_Iowa,ta_error_Iowa,GHI_error_Nevada,cloud_error_Nevada,ta_error_Nevada,GHI_error_Seattle,cloud_error_Seattle,ta_error_Seattle
0,2023-06-01 12:00:00,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2023-06-01 12:00:00,10.0,-19.0,0.10763,3,78.0,4.0,0.35763,13.0,0.0,0.93951,1.0,-18.0,-1.87299
2,2023-06-01 15:00:00,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2023-06-01 15:00:00,13.0,-12.0,-0.06998,3,-472.0,21.0,-0.19498,604.0,14.0,0.58713,4.0,0.0,-1.16287
4,2023-06-02 12:00:00,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Keep only the rows whose 'forecast' value is 3 (the 3-hour forecast)
combined_df = combined_df.loc[combined_df['forecast'].eq(3)]

In [13]:
# Preview the first five rows of the filtered table
combined_df.head(5)

Unnamed: 0,datetime,GHI_error_Texas,cloud_error_Texas,ta_error_Texas,forecast,GHI_error_Iowa,cloud_error_Iowa,ta_error_Iowa,GHI_error_Nevada,cloud_error_Nevada,ta_error_Nevada,GHI_error_Seattle,cloud_error_Seattle,ta_error_Seattle
1,2023-06-01 12:00:00,10.0,-19.0,0.10763,3,78.0,4.0,0.35763,13.0,0.0,0.93951,1.0,-18.0,-1.87299
3,2023-06-01 15:00:00,13.0,-12.0,-0.06998,3,-472.0,21.0,-0.19498,604.0,14.0,0.58713,4.0,0.0,-1.16287
5,2023-06-02 12:00:00,3.0,-21.0,0.10623,3,-2.0,-6.0,-1.51877,-3.0,0.0,1.03556,-1.0,0.0,-1.15194
7,2023-06-02 15:00:00,-101.0,3.0,0.07323,3,514.0,-1.0,-0.05177,8.0,0.0,-0.1604,-2.0,0.0,-0.5979
9,2023-06-03 12:00:00,-1.0,0.0,0.82907,3,-1.0,0.0,0.14157,0.0,0.0,1.38844,-2.0,0.0,-1.67406


### One-way ANOVA Test

In [9]:
import pandas as pd
from scipy import stats

# GHI error column for each location
ghi_columns = ['GHI_error_Texas', 'GHI_error_Iowa', 'GHI_error_Nevada', 'GHI_error_Seattle']

# One result row per unordered pair of columns.
# NOTE(review): six pairwise tests are run and the p-values are reported
# exactly as f_oneway returns them, with no multiple-comparison adjustment.
results_list = []
for first, second in combinations(ghi_columns, 2):
    # One-way ANOVA between the two columns
    f_value, p_value = stats.f_oneway(combined_df[first], combined_df[second])

    results_list.append({
        'Comparison': f'{first} vs {second}',
        'F-Value': f_value,
        'P-Value': p_value
    })

# Tabulate the results
results_df = pd.DataFrame(results_list)

# Display the table
results_df


Unnamed: 0,Comparison,F-Value,P-Value
0,GHI_error_Texas vs GHI_error_Iowa,1.027748,0.311365
1,GHI_error_Texas vs GHI_error_Nevada,2.818132,0.094068
2,GHI_error_Texas vs GHI_error_Seattle,0.066559,0.79656
3,GHI_error_Iowa vs GHI_error_Nevada,0.299006,0.584843
4,GHI_error_Iowa vs GHI_error_Seattle,0.70084,0.403054
5,GHI_error_Nevada vs GHI_error_Seattle,2.316556,0.128876


In [14]:
# ta error column for each location
ta_columns = ['ta_error_Texas', 'ta_error_Iowa', 'ta_error_Nevada', 'ta_error_Seattle']

# One result row per unordered pair of ta_error columns.
# NOTE(review): six pairwise tests are run and the p-values are reported
# exactly as f_oneway returns them, with no multiple-comparison adjustment.
results_list = []
for first, second in combinations(ta_columns, 2):
    # One-way ANOVA between the two columns
    f_value, p_value = stats.f_oneway(combined_df[first], combined_df[second])

    results_list.append({
        'Comparison': f'{first} vs {second}',
        'F-Value': f_value,
        'P-Value': p_value
    })

# Tabulate the results
results_df = pd.DataFrame(results_list)

# Display the table
results_df

Unnamed: 0,Comparison,F-Value,P-Value
0,ta_error_Texas vs ta_error_Iowa,0.733015,0.3924739
1,ta_error_Texas vs ta_error_Nevada,26.268759,4.84492e-07
2,ta_error_Texas vs ta_error_Seattle,195.770188,7.333366999999999e-36
3,ta_error_Iowa vs ta_error_Nevada,11.563405,0.0007476492
4,ta_error_Iowa vs ta_error_Seattle,141.810266,8.13754e-28
5,ta_error_Nevada vs ta_error_Seattle,297.73059,4.116341e-49


In [10]:
# cloud error column for each location
cloud_columns = ['cloud_error_Texas', 'cloud_error_Iowa', 'cloud_error_Nevada', 'cloud_error_Seattle']

# Results keyed by '<column A> vs <column B>', one entry per unordered pair.
# NOTE(review): six pairwise tests are run and the p-values are reported
# exactly as f_oneway returns them, with no multiple-comparison adjustment.
results = {}
for first, second in combinations(cloud_columns, 2):
    # One-way ANOVA between the two columns
    f_value, p_value = stats.f_oneway(combined_df[first], combined_df[second])
    results[f'{first} vs {second}'] = {'F-Value': f_value, 'P-Value': p_value}

# Print each comparison followed by a blank line
for comparison, result in results.items():
    print(f'Comparison: {comparison}')
    print('F-Value:', result['F-Value'])
    print('P-Value:', result['P-Value'])
    print()

Comparison: cloud_error_Texas vs cloud_error_Iowa
F-Value: 3.9841053639639057
P-Value: 0.04668024027938454

Comparison: cloud_error_Texas vs cloud_error_Nevada
F-Value: 5.15586699624259
P-Value: 0.023754926898199337

Comparison: cloud_error_Texas vs cloud_error_Seattle
F-Value: 0.11671535684947279
P-Value: 0.7328228504184215

Comparison: cloud_error_Iowa vs cloud_error_Nevada
F-Value: 16.02949500891713
P-Value: 7.568374224078618e-05

Comparison: cloud_error_Iowa vs cloud_error_Seattle
F-Value: 2.680958782255702
P-Value: 0.1024235138858061

Comparison: cloud_error_Nevada vs cloud_error_Seattle
F-Value: 6.364844442087399
P-Value: 0.012068071315575864

