In [61]:
import pandas as pd
from scipy.spatial import distance
from sklearn.manifold import TSNE
import plotly.express as px

## Clean data for energy profile comparison

In [62]:
# Load the annual generation table (thousands=',' parses comma-grouped numbers)
# and shorten the verbose column names
data = pd.read_csv('Data/annual_generation_state.csv', thousands=',').rename(
    columns={
        'TYPE OF PRODUCER': 'PRODUCER',
        'ENERGY SOURCE': 'SOURCE',
        'GENERATION (Megawatthours)': 'MWH',
    }
)

# Upper-case the text columns so spelling variants collapse into one value
# (the state column has both 'US-Total' and 'US-TOTAL')
for text_col in ('STATE', 'PRODUCER', 'SOURCE'):
    data[text_col] = data[text_col].str.upper()

# Keep only 2016 rows for the 'Total Electric Power Industry' producer type
is_2016 = data['YEAR'] == 2016
is_industry_total = data['PRODUCER'] == 'TOTAL ELECTRIC POWER INDUSTRY'

# One row per state, one column per energy source; missing combinations become 0
data_2016 = (
    data.loc[is_2016 & is_industry_total]
    .groupby(['STATE', 'SOURCE'])['MWH']
    .sum()
    .unstack('SOURCE', fill_value=0)
)

# Express every source as a percentage of the state's TOTAL column
# (x100 for readability), leaving the TOTAL column itself out of the result.
# Could try keeping TOTAL later to find states with similar production levels too.
state_totals = data_2016['TOTAL']
data_2016 = data_2016.drop(columns=['TOTAL']).div(state_totals, axis=0).mul(100)

## Calculate the Euclidean distance between the energy profiles to find those nearest to Iowa

In [63]:
# Compute the Euclidean distance between each row of data_2016 and Iowa's row.
# Iowa's profile is looked up once, outside the loop, because it never changes.
iowa_profile = data_2016.loc['IA']
distances = {
    state: distance.euclidean(iowa_profile, row)
    for state, row in data_2016.iterrows()
}

# Sort the states by distance from Iowa, closest first.
# Iowa is compared against its own row too, so it appears with distance 0.
sorted_states = sorted(distances.items(), key=lambda item: item[1])
print('Top 10 states with similar energy profiles to Iowa:')
# Slice instead of indexing with range(10): a table with fewer than 10 rows
# then prints what it has rather than raising IndexError.
for state, dist in sorted_states[:10]:
    print(f'State: {state}, Distance: {dist}')

Top 10 states with similar energy profiles to Iowa:
State: IA, Distance: 0.0
State: KS, Distance: 11.710686142757355
State: MN, Distance: 27.808705734674408
State: CO, Distance: 29.293183070381758
State: ND, Distance: 29.974189578520956
State: NE, Distance: 34.526524189715445
State: NM, Distance: 38.21544942264555
State: WI, Distance: 40.19425258545462
State: OH, Distance: 42.65703391629518
State: IN, Distance: 43.99489450834566


## Visualize the energy profiles in 3D using t-SNE (Euclidean metric) to reduce dimensionality

In [64]:
# Use t-SNE to embed the percentage profiles into three components
# (random_state is fixed so repeated runs use the same seed)
tsne = TSNE(n_components=3, random_state=42)
reduced_data_tsne = tsne.fit_transform(data_2016)

# Wrap the embedding in a dataframe and attach the state labels by position
reduced_df_tsne = pd.DataFrame(reduced_data_tsne, columns=['x', 'y', 'z']).assign(
    State=data_2016.index
)

# Color each point by the value in the COAL column of the percentage table
color_scale = data_2016['COAL']
axis_labels = {'x': 't-SNE 1', 'y': 't-SNE 2', 'z': 't-SNE 3', 'color': 'Coal %'}

# Create interactive plotly 3D scatter plot
fig = px.scatter_3d(
    reduced_df_tsne,
    x='x',
    y='y',
    z='z',
    text='State',
    hover_name='State',
    color=color_scale,
    color_continuous_scale='viridis',
    opacity=0.8,
    labels=axis_labels,
    title='t-SNE of Energy Sources Profiles in 2016',
)

# Small markers and label text, with the colorbar anchored near the top-left
fig.update_traces(marker={'size': 3}, textfont={'size': 5})
colorbar_layout = {'title': 'Coal %', 'yanchor': 'top', 'y': 0.95, 'xanchor': 'left', 'x': 0.05}
fig.update_layout(coloraxis_colorbar=colorbar_layout)
fig.show()