## 4-3 & 5-3 Research Hypothesis 3

In [1]:
# Import the necessary libraries
import os
import pandas as pd
import numpy as np
from timezonefinder import TimezoneFinder
import matplotlib.pyplot as plt

import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

### Load the datasets

In [2]:
# Define a list of states for which datasets will be loaded
state_list = ['Texas', 'Iowa', 'Nevada', 'Seattle']

# Define the base folder path where the datasets are located
base_folder_path = './Data/'

# Create an empty dictionary to store DataFrames
dfs = {}

# Loop through each state and load the corresponding dataset
for state in state_list:
    # Construct the file path for the dataset of the current state
    file_path = os.path.join(base_folder_path, f'{state}_0_18.csv')
    
    # Define a key for the DataFrame in the dictionary
    df_key = f'df_{state}'
    
    # Load the dataset into a Pandas DataFrame and store it in the dictionary
    dfs[df_key] = pd.read_csv(file_path)

# List the keys (DataFrame names) stored in the 'dfs' dictionary
loaded_datasets = list(dfs)
print(loaded_datasets)

['df_Texas', 'df_Iowa', 'df_Nevada', 'df_Seattle']


### Convert the datetime from UTC to LST (Local Standard Time)

In [3]:
# Define a dictionary that maps state names to their corresponding latitude and longitude coordinates
locations = {
    'Texas': (30.6095651015, -96.3402969602),
    'Nevada': (36.089758655, -115.1833049303),
    'Iowa': (42.0135891695, -93.6356966723),
    'Seattle': (48.6108182606, -123.1479221551)
}

# Initialize a TimezoneFinder instance to determine timezones based on coordinates
tf = TimezoneFinder()

# Assuming 'dfs' is your dictionary of DataFrames
for df_key in dfs:
    # Extract the state name from the DataFrame key
    state = df_key.split('_')[1]

    # Retrieve the latitude and longitude for the state from the 'locations' dictionary
    latitude, longitude = locations[state]

    # Find the timezone for the given latitude and longitude using the TimezoneFinder
    local_timezone = tf.timezone_at(lat=latitude, lng=longitude)

    # Convert the 'datetime' column in the DataFrame to datetime objects
    dfs[df_key]['datetime'] = pd.to_datetime(dfs[df_key]['datetime'])

    # Set the timezone of the 'datetime' column to UTC
    dfs[df_key]['datetime'] = dfs[df_key]['datetime'].dt.tz_localize('UTC')

    # Convert the 'datetime' column to the local timezone
    dfs[df_key]['datetime'] = dfs[df_key]['datetime'].dt.tz_convert(local_timezone)
    dfs[df_key]['datetime'] = dfs[df_key]['datetime'].dt.tz_localize(None)

### Extract only hour of 12 and 15 and forecast of 0 and 3

In [4]:
# Iterate through each DataFrame in the dictionary
for df_key in dfs:
    # Convert the 'datetime' column to pandas datetime format
    dfs[df_key]['datetime'] = pd.to_datetime(dfs[df_key]['datetime'])

    # Define the desired times for filtering
    desired_times = ['12:00:00', '15:00:00']

    # Filter the DataFrame for hours between 12:00:00 and 15:00:00
    dfs[df_key] = dfs[df_key][dfs[df_key]['datetime'].dt.strftime('%H:%M:%S').isin(desired_times)]

    # Define the desired forecast values for filtering
    desired_forecasts = [0, 3]

    # Extract rows where forecast is either 0 or 3
    dfs[df_key] = dfs[df_key][dfs[df_key]['forecast'].isin(desired_forecasts)]

    # Reset the index of the filtered DataFrame
    dfs[df_key] = dfs[df_key].reset_index(drop=True)

### Calculates differences based on the observed values (when forecast == 0)

In [5]:
# Iterate through each DataFrame in the dictionary
for df_key in dfs:
    # Convert the 'datetime' column to pandas datetime format
    dfs[df_key]['datetime'] = pd.to_datetime(dfs[df_key]['datetime'])

    # Define the desired times for filtering
    desired_times = ['12:00:00', '15:00:00']

    # Filter the DataFrame for hours between 12:00:00 and 15:00:00
    dfs[df_key] = dfs[df_key][dfs[df_key]['datetime'].dt.strftime('%H:%M:%S').isin(desired_times)]

    # Define the desired forecast values for filtering
    desired_forecasts = [0, 3]

    # Extract rows where forecast is either 0 or 3
    dfs[df_key] = dfs[df_key][dfs[df_key]['forecast'].isin(desired_forecasts)]

    # Reset the index of the filtered DataFrame
    dfs[df_key] = dfs[df_key].reset_index(drop=True)


# Iterate through each DataFrame in the dictionary
for df_key in dfs:
    # Convert the 'datetime' column to pandas datetime format
    dfs[df_key]['datetime'] = pd.to_datetime(dfs[df_key]['datetime'])

    # Filter to get the rows where forecast == 0
    reference_df = dfs[df_key][dfs[df_key]['forecast'] == 0]

    # Merge the original DataFrame with the reference DataFrame
    merged_df = dfs[df_key].merge(reference_df[['datetime', 'ta', 'GHI', 'rh', 'va', 'cloud', 'WBGT']], on='datetime', suffixes=('', '_ref'))

    # Calculate the forecast error for each column
    merged_df['ta_error'] = merged_df['ta'] - merged_df['ta_ref']
    merged_df['GHI_error'] = merged_df['GHI'] - merged_df['GHI_ref']
    merged_df['rh_error'] = merged_df['rh'] - merged_df['rh_ref']
    merged_df['va_error'] = merged_df['va'] - merged_df['va_ref']
    merged_df['cloud_error'] = merged_df['cloud'] - merged_df['cloud_ref']
    merged_df['WBGT_error'] = merged_df['WBGT'] - merged_df['WBGT_ref']

    # Save the merged DataFrame back into the dictionary
    dfs[df_key] = merged_df

In [6]:
dfs['df_Texas'].head()

Unnamed: 0,datetime,ta,GHI,rh,va,cloud,forecast,WBGT,ta_ref,GHI_ref,rh_ref,va_ref,cloud_ref,WBGT_ref,ta_error,GHI_error,rh_error,va_error,cloud_error,WBGT_error
0,2023-06-01 12:00:00,29.74075,814.0,56.1,1.814841,37.0,0,27.84409,29.74075,814.0,56.1,1.814841,37.0,27.84409,0.0,0.0,0.0,0.0,0.0,0.0
1,2023-06-01 12:00:00,29.84838,824.0,51.1,3.324882,18.0,3,27.233215,29.74075,814.0,56.1,1.814841,37.0,27.84409,0.10763,10.0,-5.0,1.510041,-19.0,-0.610875
2,2023-06-01 15:00:00,32.01068,926.0,45.7,2.814674,36.0,0,28.431339,32.01068,926.0,45.7,2.814674,36.0,28.431339,0.0,0.0,0.0,0.0,0.0,0.0
3,2023-06-01 15:00:00,31.9407,939.0,44.1,3.461604,24.0,3,28.113057,32.01068,926.0,45.7,2.814674,36.0,28.431339,-0.06998,13.0,-1.6,0.64693,-12.0,-0.318283
4,2023-06-02 12:00:00,30.5541,952.0,54.7,2.253125,21.0,0,28.313648,30.5541,952.0,54.7,2.253125,21.0,28.313648,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
dfs['df_Nevada'].head()

Unnamed: 0,datetime,ta,GHI,rh,va,cloud,forecast,WBGT,ta_ref,GHI_ref,rh_ref,va_ref,cloud_ref,WBGT_ref,ta_error,GHI_error,rh_error,va_error,cloud_error,WBGT_error
0,2023-06-01 12:00:00,29.94076,1024.0,19.0,3.12824,0.0,0,23.158732,29.94076,1024.0,19.0,3.12824,0.0,23.158732,0.0,0.0,0.0,0.0,0.0,0.0
1,2023-06-01 12:00:00,30.88027,1037.0,17.0,3.730766,0.0,3,23.588742,29.94076,1024.0,19.0,3.12824,0.0,23.158732,0.93951,13.0,-2.0,0.602526,0.0,0.43001
2,2023-06-01 15:00:00,31.30654,242.0,17.8,5.066889,0.0,0,22.526311,31.30654,242.0,17.8,5.066889,0.0,22.526311,0.0,0.0,0.0,0.0,0.0,0.0
3,2023-06-01 15:00:00,31.89367,846.0,14.8,7.89971,14.0,3,24.041977,31.30654,242.0,17.8,5.066889,0.0,22.526311,0.58713,604.0,-3.0,2.832821,14.0,1.515666
4,2023-06-02 12:00:00,30.76364,1026.0,20.4,1.815135,0.0,0,24.058988,30.76364,1026.0,20.4,1.815135,0.0,24.058988,0.0,0.0,0.0,0.0,0.0,0.0


### Combined as one dataframe

In [8]:
#Rename each columns
df_T = dfs['df_Texas'][['datetime', 'GHI_error', 'cloud_error', 'forecast', 'ta_error', 'rh_error']]
df_T = df_T.rename(columns={'GHI_error': 'GHI_error_Texas', 'cloud_error': 'cloud_error_Texas', 'ta_error': 'ta_error_Texas', 'rh_error': 'rh_error_Texas'})

df_I = dfs['df_Iowa'][['datetime', 'GHI_error', 'cloud_error', 'forecast', 'ta_error', 'rh_error']]
df_I = df_I.rename(columns={'GHI_error': 'GHI_error_Iowa', 'cloud_error': 'cloud_error_Iowa', 'ta_error': 'ta_error_Iowa', 'rh_error': 'rh_error_Iowa'})

df_N = dfs['df_Nevada'][['datetime', 'GHI_error', 'cloud_error', 'forecast', 'ta_error', 'rh_error']]
df_N = df_N.rename(columns={'GHI_error': 'GHI_error_Nevada', 'cloud_error': 'cloud_error_Nevada', 'ta_error': 'ta_error_Nevada', 'rh_error': 'rh_error_Nevada'})

df_S = dfs['df_Seattle'][['datetime', 'GHI_error', 'cloud_error', 'forecast', 'ta_error', 'rh_error']]
df_S = df_S.rename(columns={'GHI_error': 'GHI_error_Seattle', 'cloud_error': 'cloud_error_Seattle', 'ta_error': 'ta_error_Seattle', 'rh_error': 'rh_error_Seattle'})

# Custom suffixes can be provided to avoid column name conflicts
custom_suffixes = ('_dfT', '_dfI')  # You can choose appropriate suffixes

# Merging df_T and df_I with custom suffixes, on both 'datetime' and 'forecast'
combined_df = pd.merge(df_T, df_I, on=['datetime', 'forecast'], how='inner', suffixes=custom_suffixes)

# Merging with df_N and df_S
# Note: You may need to adjust suffixes again if there are still conflicts
combined_df = pd.merge(combined_df, df_N, on=['datetime', 'forecast'], how='inner')
combined_df = pd.merge(combined_df, df_S, on=['datetime', 'forecast'], how='inner')

combined_df.head()

Unnamed: 0,datetime,GHI_error_Texas,cloud_error_Texas,forecast,ta_error_Texas,rh_error_Texas,GHI_error_Iowa,cloud_error_Iowa,ta_error_Iowa,rh_error_Iowa,GHI_error_Nevada,cloud_error_Nevada,ta_error_Nevada,rh_error_Nevada,GHI_error_Seattle,cloud_error_Seattle,ta_error_Seattle,rh_error_Seattle
0,2023-06-01 12:00:00,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2023-06-01 12:00:00,10.0,-19.0,3,0.10763,-5.0,78.0,4.0,0.35763,4.3,13.0,0.0,0.93951,-2.0,1.0,-18.0,-1.87299,1.0
2,2023-06-01 15:00:00,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2023-06-01 15:00:00,13.0,-12.0,3,-0.06998,-1.6,-472.0,21.0,-0.19498,0.7,604.0,14.0,0.58713,-3.0,4.0,0.0,-1.16287,0.8
4,2023-06-02 12:00:00,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
#only 3-hour forecast
combined_df = combined_df[combined_df['forecast'] == 3]

In [10]:
combined_df.to_csv('./Result/combined_df.csv', index=False)

#### Data Split for each state

In [11]:
# For Texas
X_Texas = combined_df[['ta_error_Texas', 'rh_error_Texas', 'cloud_error_Texas']]
y_Texas = combined_df['GHI_error_Texas']
X_train_Texas, X_test_Texas, y_train_Texas, y_test_Texas = train_test_split(X_Texas, y_Texas, test_size=0.3, random_state=42)

# For Iowa
X_Iowa = combined_df[['ta_error_Iowa', 'rh_error_Iowa', 'cloud_error_Iowa']]
y_Iowa = combined_df['GHI_error_Iowa']
X_train_Iowa, X_test_Iowa, y_train_Iowa, y_test_Iowa = train_test_split(X_Iowa, y_Iowa, test_size=0.3, random_state=42)

# For Nevada
X_Nevada = combined_df[['ta_error_Nevada', 'rh_error_Nevada', 'cloud_error_Nevada']]
y_Nevada = combined_df['GHI_error_Nevada']
X_train_Nevada, X_test_Nevada, y_train_Nevada, y_test_Nevada = train_test_split(X_Nevada, y_Nevada, test_size=0.3, random_state=42)

# For Seattle
X_Seattle = combined_df[['ta_error_Seattle', 'rh_error_Seattle', 'cloud_error_Seattle']]
y_Seattle = combined_df['GHI_error_Seattle']
X_train_Seattle, X_test_Seattle, y_train_Seattle, y_test_Seattle = train_test_split(X_Seattle, y_Seattle, test_size=0.3, random_state=42)

#### Simple Regression

In [12]:
# Function to identify the most correlated variable and its value
def most_correlated(X, y):
    correlations = X.corrwith(y)
    max_corr_value = correlations.abs().max()
    max_corr_var = correlations.abs().idxmax()
    return max_corr_var, max_corr_value

# Function to train and return a simple linear regression model
def train_regression_model(X, y):
    model = LinearRegression()
    model.fit(X, y)
    return model

# For each state
# Texas
most_corr_var_texas, max_corr_value_texas = most_correlated(X_train_Texas, y_train_Texas)
print(f'Texas - Most Correlated Variable: {most_corr_var_texas}, Correlation Value: {max_corr_value_texas}')
model_texas = train_regression_model(X_train_Texas[[most_corr_var_texas]], y_train_Texas)

# Iowa
most_corr_var_iowa, max_corr_value_iowa = most_correlated(X_train_Iowa, y_train_Iowa)
print(f'Iowa - Most Correlated Variable: {most_corr_var_iowa}, Correlation Value: {max_corr_value_iowa}')
model_iowa = train_regression_model(X_train_Iowa[[most_corr_var_iowa]], y_train_Iowa)

# Nevada
most_corr_var_nevada, max_corr_value_nevada = most_correlated(X_train_Nevada, y_train_Nevada)
print(f'Nevada - Most Correlated Variable: {most_corr_var_nevada}, Correlation Value: {max_corr_value_nevada}')
model_nevada = train_regression_model(X_train_Nevada[[most_corr_var_nevada]], y_train_Nevada)

# Seattle
most_corr_var_seattle, max_corr_value_seattle = most_correlated(X_train_Seattle, y_train_Seattle)
print(f'Seattle - Most Correlated Variable: {most_corr_var_seattle}, Correlation Value: {max_corr_value_seattle}')
model_seattle = train_regression_model(X_train_Seattle[[most_corr_var_seattle]], y_train_Seattle)

Texas - Most Correlated Variable: ta_error_Texas, Correlation Value: 0.3926941819748447
Iowa - Most Correlated Variable: cloud_error_Iowa, Correlation Value: 0.48988769684754185
Nevada - Most Correlated Variable: ta_error_Nevada, Correlation Value: 0.3298627174281781
Seattle - Most Correlated Variable: ta_error_Seattle, Correlation Value: 0.4245345165187612


In [13]:
print(X_train_Texas.shape)
print(X_test_Texas.shape)

(127, 3)
(55, 3)


In [14]:
def train_regression_model_statsmodels(X, y):
    X = sm.add_constant(X)  # Adding a constant to include an intercept
    model = sm.OLS(y, X).fit()
    return model

# Function to evaluate the model and print p-values
def evaluate_model_statsmodels(model, X_test, y_test):
    X_test = sm.add_constant(X_test)  # Adding a constant for prediction
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    p_values = model.pvalues
    return r2, mse, rmse, mae, p_values

# Create a list to store the results for different states
results = []

# Texas
model_texas = train_regression_model_statsmodels(X_train_Texas[most_corr_var_texas], y_train_Texas)
r2_texas, mse_texas, rmse_texas, mae_texas, p_values_texas = evaluate_model_statsmodels(model_texas, X_test_Texas[most_corr_var_texas], y_test_Texas)
results.append({
    "State": "Texas",
    "Intercept": model_texas.params.iloc[0],
    "Coefficient": model_texas.params.iloc[1],
    "R^2": r2_texas,
    "MSE": mse_texas,
    "RMSE": rmse_texas,
    "MAE": mae_texas,
    "P-values": str(p_values_texas)
})

# Iowa
model_iowa = train_regression_model_statsmodels(X_train_Iowa[most_corr_var_iowa], y_train_Iowa)
r2_iowa, mse_iowa, rmse_iowa, mae_iowa, p_values_iowa = evaluate_model_statsmodels(model_iowa, X_test_Iowa[most_corr_var_iowa], y_test_Iowa)
results.append({
    "State": "Iowa",
    "Intercept": model_iowa.params.iloc[0],
    "Coefficient": model_iowa.params.iloc[1],
    "R^2": r2_iowa,
    "MSE": mse_iowa,
    "RMSE": rmse_iowa,
    "MAE": mae_iowa,
    "P-values": str(p_values_iowa)
})

# Nevada
model_nevada = train_regression_model_statsmodels(X_train_Nevada[most_corr_var_nevada], y_train_Nevada)
r2_nevada, mse_nevada, rmse_nevada, mae_nevada, p_values_nevada = evaluate_model_statsmodels(model_nevada, X_test_Nevada[most_corr_var_nevada], y_test_Nevada)
results.append({
    "State": "Nevada",
    "Intercept": model_nevada.params.iloc[0],
    "Coefficient": model_nevada.params.iloc[1],
    "R^2": r2_nevada,
    "MSE": mse_nevada,
    "RMSE": rmse_nevada,
    "MAE": mae_nevada,
    "P-values": str(p_values_nevada)
})

# Seattle
model_seattle = train_regression_model_statsmodels(X_train_Seattle[most_corr_var_seattle], y_train_Seattle)
r2_seattle, mse_seattle, rmse_seattle, mae_seattle, p_values_seattle = evaluate_model_statsmodels(model_seattle, X_test_Seattle[most_corr_var_seattle], y_test_Seattle)
results.append({
    "State": "Seattle",
    "Intercept": model_seattle.params.iloc[0],
    "Coefficient": model_seattle.params.iloc[1],
    "R^2": r2_seattle,
    "MSE": mse_seattle,
    "RMSE": rmse_seattle,
    "MAE": mae_seattle,
    "P-values": str(p_values_seattle)
})

# Create a DataFrame from the results
model_results_df = pd.DataFrame(results)

# Display the DataFrame
model_results_df = model_results_df.applymap(lambda x: f'{x:.1f}' if isinstance(x, (int, float)) else x)

model_results_df


  model_results_df = model_results_df.applymap(lambda x: f'{x:.1f}' if isinstance(x, (int, float)) else x)


Unnamed: 0,State,Intercept,Coefficient,R^2,MSE,RMSE,MAE,P-values
0,Texas,0.7,76.7,0.3,25548.2,159.8,85.6,const 0.957082\nta_error_Texas ...
1,Iowa,6.9,-4.7,0.1,42619.1,206.4,115.2,const 7.087835e-01\ncloud_error_...
2,Nevada,3.3,64.8,0.2,32252.8,179.6,113.2,const 0.886780\nta_error_Nevada ...
3,Seattle,91.7,61.5,-0.0,19791.0,140.7,78.3,const 4.588190e-08\nta_error_Sea...


In [15]:
model_results_df.to_csv('./Result/Simple Regression.csv', index=False)

#### Multiple Regression

In [29]:
# Define a function to select features based on correlation
def select_features(X, y, threshold=0.1):
    corr_matrix = X.corrwith(y).abs()
    selected_features = corr_matrix[corr_matrix > threshold].index.tolist()
    selected_corr_matrix = X[selected_features].corr().abs()
    upper_tri = selected_corr_matrix.where(np.triu(np.ones(selected_corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.75)]
    selected_features = [feature for feature in selected_features if feature not in to_drop]
    return selected_features

# Define a function to train a model using sklearn
def train_multiple_regression_model(X, y):
    model = LinearRegression()
    model.fit(X, y)
    return model

# Define a function to train a model using statsmodels
def train_regression_model_statsmodels(X, y):
    X = sm.add_constant(X)
    model = sm.OLS(y, X).fit()
    return model

# Define a function to evaluate the model and print p-values
def evaluate_model_statsmodels(model, X_test, y_test):
    X_test = sm.add_constant(X_test)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    p_values = model.pvalues
    return r2, mse, rmse, mae, p_values

# Define a function to print model coefficients
def print_model_coefficients(model, selected_features):
    print("Intercept:", model.params.iloc[0])
    for i, coeff in enumerate(model.params.iloc[1:]):
        print(f"Coefficient for {selected_features[i]}:", coeff)

# Initialize dictionaries to store selected features and models for each state
selected_features_dict = {}
state_models_dict = {}

# Process for each state
states = ['Texas', 'Iowa', 'Nevada', 'Seattle']
results = []

for state in states:
    X = combined_df[[f'ta_error_{state}', f'rh_error_{state}', f'cloud_error_{state}']]
    y = combined_df[f'GHI_error_{state}']

    X_train = globals()[f'X_train_{state}']
    y_train = globals()[f'y_train_{state}']
    X_test = globals()[f'X_test_{state}']
    y_test = globals()[f'y_test_{state}']
    
    # Select features and store them
    selected_features = select_features(X_train, y_train)
    selected_features_dict[state] = selected_features
    
    if selected_features:
        # Training using statsmodels
        model_statsmodels = train_regression_model_statsmodels(X_train[selected_features], y_train)
        state_models_dict[state] = model_statsmodels  # Store the model in the dictionary

        # Evaluate and print results for statsmodels model
        r2, mse, rmse, mae, p_values = evaluate_model_statsmodels(model_statsmodels, X_test[selected_features], y_test)
        results.append({
            "State": state,
            "R^2": r2,
            "MSE": mse,
            "RMSE": rmse,
            "MAE": mae,
            "P-values": str(p_values)
        })
        
        print(f"{state} Model (Statsmodels) Evaluation:")
        print(f"R^2: {r2}, MSE: {mse}, RMSE: {rmse}, MAE: {mae}, P-values: {p_values}")
        print_model_coefficients(model_statsmodels, selected_features)
        print("\n")
    else:
        print(f"No suitable features selected for {state}")

# Create a DataFrame from the results
model_results_df = pd.DataFrame(results)

# Display the DataFrame
model_results_df = model_results_df.applymap(lambda x: f'{x:.1f}' if isinstance(x, (int, float)) else x)

model_results_df


Texas Model (Statsmodels) Evaluation:
R^2: 0.27853229636576704, MSE: 26991.990115709625, RMSE: 164.29239214190542, MAE: 85.84070485602874, P-values: const                0.416223
ta_error_Texas       0.000014
cloud_error_Texas    0.000508
dtype: float64
Intercept: -10.628728095678934
Coefficient for ta_error_Texas: 69.97261719603048
Coefficient for cloud_error_Texas: -2.788853900830441


Iowa Model (Statsmodels) Evaluation:
R^2: 0.37531896634848505, MSE: 29967.21192886394, RMSE: 173.11040387239566, MAE: 118.66969339469303, P-values: const               2.705978e-01
ta_error_Iowa       1.051462e-07
cloud_error_Iowa    8.904803e-08
dtype: float64
Intercept: -18.949904821383758
Coefficient for ta_error_Iowa: 69.50126083380178
Coefficient for cloud_error_Iowa: -3.897024186049845


Nevada Model (Statsmodels) Evaluation:
R^2: 0.57683658576604, MSE: 17212.035686690193, RMSE: 131.1946480870702, MAE: 97.92563728882779, P-values: const                 0.915020
ta_error_Nevada       0.000042
clou

  model_results_df = model_results_df.applymap(lambda x: f'{x:.1f}' if isinstance(x, (int, float)) else x)


Unnamed: 0,State,R^2,MSE,RMSE,MAE,P-values
0,Texas,0.3,26992.0,164.3,85.8,const 0.416223\nta_error_Texas ...
1,Iowa,0.4,29967.2,173.1,118.7,const 2.705978e-01\nta_error_Iow...
2,Nevada,0.6,17212.0,131.2,97.9,const 0.915020\nta_error_Nevad...
3,Seattle,0.1,17055.4,130.6,69.9,const 0.000555\nta_error_Seat...


In [17]:
model_results_df.to_csv('./Result/Multiple Regression.csv', index=False)

In [39]:
import pandas as pd

# Define the data
data = {
    'Model': ['Texas', 'Iowa', 'Nevada', 'Seattle'],
    'R^2': [0.27853229636576704, 0.37531896634848505, 0.57683658576604, 0.1246491738074621],
    'MSE': [26991.990115709625, 29967.21192886394, 17212.035686690193, 17055.370858815062],
    'RMSE': [164.29239214190542, 173.11040387239566, 131.1946480870702, 130.59621303397378],
    'MAE': [85.84070485602874, 118.66969339469303, 97.92563728882779, 69.87083720615968],
    'Intercept': [-10.628728095678934, -18.949904821383758, -2.4375283226425744, 72.38835018141559],
    'Coefficient for ta_error': [69.97261719603048, 69.50126083380178, 69.95444815197565, 54.919386751118076],
    'Coefficient for cloud_error': [-2.788853900830441, -3.897024186049845, -4.042655105001386, -2.2369373491707116],
    'Coefficient for rh_error': [None, None, None, 0.45328860213527244],  # Add None for models without rh_error
}

p_values = {
    'Model': ['Texas', 'Iowa', 'Nevada', 'Seattle'],
    'const': [0.416223, 2.705978e-01, 0.915020, 0.000555],
}

# Create DataFrames
model_df = pd.DataFrame(data)
p_values_df = pd.DataFrame(p_values)

# Merge DataFrames on the 'Model' column
result_df = pd.merge(model_df, p_values_df, on='Model')

# Display the result DataFrame
result_df


Unnamed: 0,Model,R^2,MSE,RMSE,MAE,Intercept,Coefficient for ta_error,Coefficient for cloud_error,Coefficient for rh_error,const
0,Texas,0.278532,26991.990116,164.292392,85.840705,-10.628728,69.972617,-2.788854,,0.416223
1,Iowa,0.375319,29967.211929,173.110404,118.669693,-18.949905,69.501261,-3.897024,,0.270598
2,Nevada,0.576837,17212.035687,131.194648,97.925637,-2.437528,69.954448,-4.042655,,0.91502
3,Seattle,0.124649,17055.370859,130.596213,69.870837,72.38835,54.919387,-2.236937,0.453289,0.000555


#### Model Improvement

#### Feature selections changing thresholds of the correlation for each dataset

In [18]:
# Function to select features based on correlation
def select_features(X, y, threshold):
    corr_matrix = X.corrwith(y).abs()
    selected_features = corr_matrix[corr_matrix > threshold].index.tolist()
    selected_corr_matrix = X[selected_features].corr().abs()
    upper_tri = selected_corr_matrix.where(np.triu(np.ones(selected_corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.75)]
    selected_features = [feature for feature in selected_features if feature not in to_drop]
    return selected_features

# Function to train a model using statsmodels
def train_regression_model_statsmodels(X, y):
    X = sm.add_constant(X)
    model = sm.OLS(y, X).fit()
    return model

# Function to evaluate the model and print p-values
def evaluate_model_statsmodels(model, X_test, y_test):
    X_test = sm.add_constant(X_test)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    p_values = model.pvalues
    return r2, mse, rmse, mae, p_values

# Initialize an empty DataFrame to store results
results_df = pd.DataFrame(columns=['State', 'Threshold', 'R2', 'MSE', 'RMSE', 'MAE', 'Intercept', 'Coefficients', 'Selected Features'])

states = ['Texas', 'Iowa', 'Nevada', 'Seattle']
thresholds = np.arange(0, 1.1, 0.1)

for state in states:
    X = combined_df[[f'ta_error_{state}', f'rh_error_{state}', f'cloud_error_{state}']]
    y = combined_df[f'GHI_error_{state}']

    X_train = globals()[f'X_train_{state}']
    y_train = globals()[f'y_train_{state}']
    X_test = globals()[f'X_test_{state}']
    y_test = globals()[f'y_test_{state}']

    for threshold in thresholds:
        selected_features = select_features(X_train, y_train, threshold)

        if selected_features:
            model_statsmodels = train_regression_model_statsmodels(X_train[selected_features], y_train)
            r2, mse, rmse, mae, _ = evaluate_model_statsmodels(model_statsmodels, X_test[selected_features], y_test)

            intercept = model_statsmodels.params.iloc[0]
            coefficients = model_statsmodels.params.iloc[1:].to_dict()

            new_row = pd.DataFrame({
                'State': [state], 
                'Threshold': [threshold], 
                'R2': [r2], 
                'MSE': [mse], 
                'RMSE': [rmse], 
                'MAE': [mae], 
                'Intercept': [intercept],
                'Coefficients': [coefficients],
                'Selected Features': [', '.join(selected_features)]
            })

            if not new_row.isna().all(axis=1).any():
                results_df = pd.concat([results_df, new_row], ignore_index=True)
        else:
            new_row = pd.DataFrame({
                'State': [state], 
                'Threshold': [threshold], 
                'R2': [np.nan], 
                'MSE': [np.nan], 
                'RMSE': [np.nan], 
                'MAE': [np.nan], 
                'Intercept': [np.nan],
                'Coefficients': ['None'],
                'Selected Features': ['None']
            })

            if not new_row.isna().all(axis=1).any():
                results_df = pd.concat([results_df, new_row], ignore_index=True)

# Display the results DataFrame
results_df

  results_df = pd.concat([results_df, new_row], ignore_index=True)


Unnamed: 0,State,Threshold,R2,MSE,RMSE,MAE,Intercept,Coefficients,Selected Features
0,Texas,0.0,0.278532,26991.990116,164.292392,85.840705,-10.628728,"{'ta_error_Texas': 69.97261719603048, 'cloud_e...","ta_error_Texas, cloud_error_Texas"
1,Texas,0.1,0.278532,26991.990116,164.292392,85.840705,-10.628728,"{'ta_error_Texas': 69.97261719603048, 'cloud_e...","ta_error_Texas, cloud_error_Texas"
2,Texas,0.2,0.278532,26991.990116,164.292392,85.840705,-10.628728,"{'ta_error_Texas': 69.97261719603048, 'cloud_e...","ta_error_Texas, cloud_error_Texas"
3,Texas,0.3,0.278532,26991.990116,164.292392,85.840705,-10.628728,"{'ta_error_Texas': 69.97261719603048, 'cloud_e...","ta_error_Texas, cloud_error_Texas"
4,Texas,0.4,,,,,,,
5,Texas,0.5,,,,,,,
6,Texas,0.6,,,,,,,
7,Texas,0.7,,,,,,,
8,Texas,0.8,,,,,,,
9,Texas,0.9,,,,,,,


In [19]:
# Filter the DataFrame where 'Coefficient' is not 'None'
filtered_results_df = results_df[results_df['Selected Features'] != 'None']

# Format numeric values to represent only below .1
filtered_results_df = filtered_results_df.applymap(lambda x: f'{x:.1f}' if isinstance(x, (int, float)) else x)

filtered_results_df

  filtered_results_df = filtered_results_df.applymap(lambda x: f'{x:.1f}' if isinstance(x, (int, float)) else x)


Unnamed: 0,State,Threshold,R2,MSE,RMSE,MAE,Intercept,Coefficients,Selected Features
0,Texas,0.0,0.3,26992.0,164.3,85.8,-10.6,"{'ta_error_Texas': 69.97261719603048, 'cloud_e...","ta_error_Texas, cloud_error_Texas"
1,Texas,0.1,0.3,26992.0,164.3,85.8,-10.6,"{'ta_error_Texas': 69.97261719603048, 'cloud_e...","ta_error_Texas, cloud_error_Texas"
2,Texas,0.2,0.3,26992.0,164.3,85.8,-10.6,"{'ta_error_Texas': 69.97261719603048, 'cloud_e...","ta_error_Texas, cloud_error_Texas"
3,Texas,0.3,0.3,26992.0,164.3,85.8,-10.6,"{'ta_error_Texas': 69.97261719603048, 'cloud_e...","ta_error_Texas, cloud_error_Texas"
11,Iowa,0.0,0.4,29967.2,173.1,118.7,-18.9,"{'ta_error_Iowa': 69.50126083380178, 'cloud_er...","ta_error_Iowa, cloud_error_Iowa"
12,Iowa,0.1,0.4,29967.2,173.1,118.7,-18.9,"{'ta_error_Iowa': 69.50126083380178, 'cloud_er...","ta_error_Iowa, cloud_error_Iowa"
13,Iowa,0.2,0.4,29967.2,173.1,118.7,-18.9,"{'ta_error_Iowa': 69.50126083380178, 'cloud_er...","ta_error_Iowa, cloud_error_Iowa"
14,Iowa,0.3,0.4,29967.2,173.1,118.7,-18.9,"{'ta_error_Iowa': 69.50126083380178, 'cloud_er...","ta_error_Iowa, cloud_error_Iowa"
15,Iowa,0.4,0.4,29967.2,173.1,118.7,-18.9,"{'ta_error_Iowa': 69.50126083380178, 'cloud_er...","ta_error_Iowa, cloud_error_Iowa"
22,Nevada,0.0,0.6,17212.0,131.2,97.9,-2.4,"{'ta_error_Nevada': 69.95444815197565, 'cloud_...","ta_error_Nevada, cloud_error_Nevada"


In [20]:
filtered_results_df.to_csv('./Result/different thresholds of multiple regression.csv', index=False)

#### Regularizations

In [21]:
# Function to train a model using Lasso regression
def train_lasso_regression(X, y, alpha):
    model = Lasso(alpha=alpha)
    model.fit(X, y)
    return model

# Function to train a model using Ridge regression
def train_ridge_regression(X, y, alpha):
    model = Ridge(alpha=alpha)
    model.fit(X, y)
    return model

# Function to evaluate scikit-learn models
def evaluate_sklearn_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    return r2, mse, rmse, mae

# Initialize an empty DataFrame to store results
results_df = pd.DataFrame(columns=['State', 'Model', 'Alpha', 'R2', 'MSE', 'RMSE', 'MAE', 'Intercept', 'Coefficients', 'Selected Features'])

states = ['Texas', 'Iowa', 'Nevada', 'Seattle']
alphas = np.logspace(-3, 3, 7)  # Example alpha values

for state in states:
    X = combined_df[[f'ta_error_{state}', f'rh_error_{state}', f'cloud_error_{state}']]
    y = combined_df[f'GHI_error_{state}']

    X_train = globals()[f'X_train_{state}']
    y_train = globals()[f'y_train_{state}']
    X_test = globals()[f'X_test_{state}']
    y_test = globals()[f'y_test_{state}']

    for alpha in alphas:
        # Lasso Regression
        model_lasso = train_lasso_regression(X_train, y_train, alpha)
        r2, mse, rmse, mae = evaluate_sklearn_model(model_lasso, X_test, y_test)
        new_row = pd.DataFrame({
            'State': [state], 
            'Model': ['Lasso'], 
            'Alpha': [alpha],
            'R2': [r2], 
            'MSE': [mse], 
            'RMSE': [rmse], 
            'MAE': [mae], 
            'Intercept': [model_lasso.intercept_],
            'Coefficients': [model_lasso.coef_],
            'Selected Features': [', '.join(X.columns)]
        })
        results_df = pd.concat([results_df, new_row], ignore_index=True)

        # Ridge Regression
        model_ridge = train_ridge_regression(X_train, y_train, alpha)
        r2, mse, rmse, mae = evaluate_sklearn_model(model_ridge, X_test, y_test)
        new_row = pd.DataFrame({
            'State': [state], 
            'Model': ['Ridge'], 
            'Alpha': [alpha],
            'R2': [r2], 
            'MSE': [mse], 
            'RMSE': [rmse], 
            'MAE': [mae], 
            'Intercept': [model_ridge.intercept_],
            'Coefficients': [model_ridge.coef_],
            'Selected Features': [', '.join(X.columns)]
        })
        results_df = pd.concat([results_df, new_row], ignore_index=True)

# Display the results DataFrame
results_df = results_df.applymap(lambda x: f'{x:.1f}' if isinstance(x, (int, float)) else x)

results_df

  results_df = pd.concat([results_df, new_row], ignore_index=True)
  results_df = results_df.applymap(lambda x: f'{x:.1f}' if isinstance(x, (int, float)) else x)


Unnamed: 0,State,Model,Alpha,R2,MSE,RMSE,MAE,Intercept,Coefficients,Selected Features
0,Texas,Lasso,0.0,0.3,26663.3,163.3,85.4,-2.4,"[79.57147762631395, 1.7130328980640872, -2.872...","ta_error_Texas, rh_error_Texas, cloud_error_Texas"
1,Texas,Ridge,0.0,0.3,26663.1,163.3,85.4,-2.4,"[79.57474170493133, 1.7134908153147077, -2.872...","ta_error_Texas, rh_error_Texas, cloud_error_Texas"
2,Texas,Lasso,0.0,0.3,26666.7,163.3,85.4,-2.5,"[79.50597697521788, 1.70433008668691, -2.87223...","ta_error_Texas, rh_error_Texas, cloud_error_Texas"
3,Texas,Ridge,0.0,0.3,26665.1,163.3,85.4,-2.4,"[79.53891557916192, 1.708947362143435, -2.8724...","ta_error_Texas, rh_error_Texas, cloud_error_Texas"
4,Texas,Lasso,0.1,0.3,26700.4,163.4,85.5,-2.8,"[78.85177022910622, 1.6174043191342997, -2.868...","ta_error_Texas, rh_error_Texas, cloud_error_Texas"
5,Texas,Ridge,0.1,0.3,26684.8,163.4,85.5,-2.6,"[79.18244376390538, 1.6637409488259187, -2.870...","ta_error_Texas, rh_error_Texas, cloud_error_Texas"
6,Texas,Lasso,1.0,0.3,27046.8,164.5,85.7,-6.4,"[72.39339546215851, 0.7588568302706027, -2.832...","ta_error_Texas, rh_error_Texas, cloud_error_Texas"
7,Texas,Ridge,1.0,0.3,26876.5,163.9,85.5,-4.4,"[75.78804176514808, 1.2333887084318642, -2.855...","ta_error_Texas, rh_error_Texas, cloud_error_Texas"
8,Texas,Lasso,10.0,0.2,30797.3,175.5,90.3,-28.2,"[23.952537420208127, -4.95069228589986, -2.613...","ta_error_Texas, rh_error_Texas, cloud_error_Texas"
9,Texas,Ridge,10.0,0.2,28331.3,168.3,87.0,-16.0,"[53.15048259296573, -1.6288121435754075, -2.75...","ta_error_Texas, rh_error_Texas, cloud_error_Texas"


In [41]:
print(np.logspace(-3, 3, 7))

[1.e-03 1.e-02 1.e-01 1.e+00 1.e+01 1.e+02 1.e+03]


In [22]:
# Group by 'State' and find the index of the row with the minimum MAE and RMSE for each state
idx_min_mae = results_df.groupby('State')['MAE'].idxmin()
idx_min_rmse = results_df.groupby('State')['RMSE'].idxmin()
idx_max_r2 = results_df.groupby('State')['R2'].idxmax()


# Select the rows corresponding to the minimum MAE and RMSE
best_mae_rows = results_df.loc[idx_min_mae]
best_rmse_rows = results_df.loc[idx_min_rmse]
best_r2_rows = results_df.loc[idx_max_r2]

best_mae_rows

Unnamed: 0,State,Model,Alpha,R2,MSE,RMSE,MAE,Intercept,Coefficients,Selected Features
22,Iowa,Lasso,10.0,0.4,30743.3,175.3,117.2,-21.2,"[59.40901020114886, -0.6467009675904671, -3.94...","ta_error_Iowa, rh_error_Iowa, cloud_error_Iowa"
34,Nevada,Lasso,1.0,0.5,18536.1,136.1,100.5,-5.9,"[88.67957031964612, 6.6195069281435215, -4.190...","ta_error_Nevada, rh_error_Nevada, cloud_error_..."
52,Seattle,Lasso,100.0,0.1,17535.8,132.4,67.4,20.3,"[0.0, -3.2348742634562138, -2.512197174658608]","ta_error_Seattle, rh_error_Seattle, cloud_erro..."
12,Texas,Lasso,1000.0,-0.0,37791.0,194.4,100.2,27.7,"[0.0, -0.0, -0.0]","ta_error_Texas, rh_error_Texas, cloud_error_Texas"


In [23]:
best_mae_rows.to_csv('./Result/best_mae_rows.csv', index=False)

In [24]:
best_rmse_rows

Unnamed: 0,State,Model,Alpha,R2,MSE,RMSE,MAE,Intercept,Coefficients,Selected Features
14,Iowa,Lasso,0.0,0.4,29504.4,171.8,117.8,-8.9,"[77.42642402987507, 1.357220133245083, -3.9093...","ta_error_Iowa, rh_error_Iowa, cloud_error_Iowa"
36,Nevada,Lasso,10.0,0.6,18141.9,134.7,97.6,5.5,"[60.99198102292261, 0.0, -3.811399078485044]","ta_error_Nevada, rh_error_Nevada, cloud_error_..."
51,Seattle,Ridge,10.0,0.1,17026.4,130.5,68.9,64.3,"[46.7680231328916, -0.4022281437662174, -2.312...","ta_error_Seattle, rh_error_Seattle, cloud_erro..."
0,Texas,Lasso,0.0,0.3,26663.3,163.3,85.4,-2.4,"[79.57147762631395, 1.7130328980640872, -2.872...","ta_error_Texas, rh_error_Texas, cloud_error_Texas"


In [25]:
best_rmse_rows.to_csv('./Result/best_rmse_rows.csv', index=False)

In [26]:
best_r2_rows

Unnamed: 0,State,Model,Alpha,R2,MSE,RMSE,MAE,Intercept,Coefficients,Selected Features
14,Iowa,Lasso,0.0,0.4,29504.4,171.8,117.8,-8.9,"[77.42642402987507, 1.357220133245083, -3.9093...","ta_error_Iowa, rh_error_Iowa, cloud_error_Iowa"
36,Nevada,Lasso,10.0,0.6,18141.9,134.7,97.6,5.5,"[60.99198102292261, 0.0, -3.811399078485044]","ta_error_Nevada, rh_error_Nevada, cloud_error_..."
42,Seattle,Lasso,0.0,0.1,17055.4,130.6,69.9,72.4,"[54.91736322748216, 0.4530546493788998, -2.236...","ta_error_Seattle, rh_error_Seattle, cloud_erro..."
0,Texas,Lasso,0.0,0.3,26663.3,163.3,85.4,-2.4,"[79.57147762631395, 1.7130328980640872, -2.872...","ta_error_Texas, rh_error_Texas, cloud_error_Texas"


In [27]:
best_r2_rows.to_csv('./Result/best_r2_rows.csv', index=False)