In [13]:
import pandas as pd
from scipy.spatial import distance
from sklearn.manifold import TSNE
import plotly.express as px
def _format_float_fixed(value):
    """Render a float with exactly 10 digits after the decimal point."""
    return '%.10f' % value


# Show floats in fixed-point notation (10 decimals) whenever pandas displays them.
pd.set_option('display.float_format', _format_float_fixed)

## Clean data for energy profile comparison

In [14]:
# Load the raw generation table (numbers are written with ',' thousands
# separators) and shorten the verbose column headers.
data = pd.read_csv('Data/annual_generation_state.csv', thousands=',').rename(
    columns={
        'TYPE OF PRODUCER': 'PRODUCER',
        'ENERGY SOURCE': 'SOURCE',
        'GENERATION (Megawatthours)': 'MWH',
    }
)

# Upper-case every label column so spelling variants collapse into one value
# (the state column contains both 'US-Total' and 'US-TOTAL').
for label_col in ('STATE', 'PRODUCER', 'SOURCE'):
    data[label_col] = data[label_col].str.upper()

# Keep only 2016 rows reported for the whole electric power industry.
is_2016 = data['YEAR'] == 2016
is_industry_total = data['PRODUCER'] == 'TOTAL ELECTRIC POWER INDUSTRY'
generation_2016 = data.loc[is_2016 & is_industry_total]

# One row per state, one column per energy source; combinations that do not
# occur in the data are filled with 0.
# NOTE(review): an aggregate row such as 'US-TOTAL' is presumably still present
# here and would be treated like a state downstream - confirm that is intended.
mwh_by_source = (
    generation_2016
    .groupby(['STATE', 'SOURCE'])['MWH']
    .sum()
    .unstack('SOURCE', fill_value=0)
)

# Express each source as a percentage of the state's 'TOTAL' column
# (multiplied by 100 for readability); the total itself stays in MWh.
total_mwh = mwh_by_source['TOTAL']
source_share_pct = mwh_by_source.drop(columns='TOTAL').div(total_mwh, axis=0) * 100

# Re-attach the total as the last column under a more descriptive name.
data_2016 = source_share_pct.assign(**{'TOTAL (MWH)': total_mwh})

## Calculate the Euclidean distance between the energy profiles to find those nearest to Iowa's
> Uses only each source's percentage share of total generation (13 sources); the total MWh produced is not added as an extra dimension.

In [15]:

# Create dataframe without the total column so the distance is computed on the
# percentage shares only
data_2016_totdrop = data_2016.drop(columns=['TOTAL (MWH)'])

# Look up Iowa's profile once instead of re-indexing the frame on every iteration
iowa_profile = data_2016_totdrop.loc['IA']

# Euclidean distance from Iowa to every row (Iowa itself comes out as 0.0)
distances = {
    state: distance.euclidean(iowa_profile, profile)
    for state, profile in data_2016_totdrop.iterrows()
}

# Sort ascending by distance, i.e. most similar profile first
sorted_states = sorted(distances.items(), key=lambda item: item[1])

# Print the 10 closest entries; slicing (rather than indexing 0..9) also works
# when the frame holds fewer than 10 rows
print('Top 10 states with similar energy profiles to Iowa:')
for state, dist in sorted_states[:10]:
    print(f'State: {state}, Distance: {dist}\n')

Top 10 states with similar energy profiles to Iowa:
State: IA, Distance: 0.0

State: KS, Distance: 11.710686142757355

State: MN, Distance: 27.808705734674408

State: CO, Distance: 29.293183070381758

State: ND, Distance: 29.974189578520956

State: NE, Distance: 34.526524189715445

State: NM, Distance: 38.21544942264555

State: WI, Distance: 40.19425258545462

State: OH, Distance: 42.65703391629518

State: IN, Distance: 43.99489450834566



## Calculate the Euclidean distance between the energy profiles to find those nearest to Iowa's
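> Uses the total MWh produced as an extra dimension. Total MWh is left in raw units, so it is orders of magnitude larger than the percentage columns and dominates the distance (note the distances in the millions below).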

In [16]:
# Compute the Euclidean distance between each state with reference to Iowa,
# this time including the 'TOTAL (MWH)' column as an extra dimension.
# NOTE(review): the total is in raw MWh while every other column is a
# percentage, so the result is essentially |difference in total MWh|.
# Consider standardising the columns first if the shares should still matter.

# Look up Iowa's profile once instead of re-indexing the frame on every iteration
iowa_profile = data_2016.loc['IA']

distances = {
    state: distance.euclidean(iowa_profile, profile)
    for state, profile in data_2016.iterrows()
}

# Sort ascending by distance, i.e. most similar profile first
sorted_states = sorted(distances.items(), key=lambda item: item[1])

# Print the 10 closest entries; slicing (rather than indexing 0..9) also works
# when the frame holds fewer than 10 rows
print('Top 10 states with similar energy profiles to Iowa:')
for state, dist in sorted_states[:10]:
    print(f'State: {state}, Distance: {dist}\n')

Top 10 states with similar energy profiles to Iowa:
State: IA, Distance: 0.0

State: CO, Distance: 25973.0165188908

State: MN, Distance: 5086246.00007602

State: OR, Distance: 5789506.000524245

State: AR, Distance: 6052552.000183591

State: KS, Distance: 6792516.000010095

State: WY, Distance: 7735877.0001553325

State: MS, Distance: 8488788.000489555

State: WI, Distance: 10574104.000076393

State: NV, Distance: 14605502.000263866



## Visualize the Euclidean distances using t-SNE to reduce dimensionality
> Without total mwh produced as an extra dimension

In [17]:
# Use t-SNE to reduce the percentage-share profiles to 3D
# (random_state is fixed so the embedding is reproducible)
tsne = TSNE(n_components=3, random_state=42)
reduced_data_tsne = tsne.fit_transform(data_2016_totdrop)

# Create a dataframe with the reduced data, labelled by state
reduced_df_tsne = pd.DataFrame(reduced_data_tsne, columns=['x', 'y', 'z'])
reduced_df_tsne['State'] = data_2016_totdrop.index

# Colour the points by coal share so the continuous colour scale and its
# 'Coal %' legend actually apply. `.to_numpy()` avoids index alignment between
# the state-indexed source frame and the range-indexed embedding frame.
# If the frame has no 'COAL' column the plot is drawn uncoloured.
coal_col = 'Coal %' if 'COAL' in data_2016_totdrop.columns else None
if coal_col is not None:
    reduced_df_tsne[coal_col] = data_2016_totdrop['COAL'].to_numpy()

# Create interactive plotly scatter plot
fig = px.scatter_3d(
    reduced_df_tsne, x='x', y='y', z='z', text='State', hover_name='State',
    color=coal_col, color_continuous_scale='viridis', opacity=0.8,
    labels={'x': 't-SNE 1', 'y': 't-SNE 2', 'z': 't-SNE 3'},
    title='t-SNE of Energy Sources Profiles in 2016 (Excluding Total MWH Produced)',
)

# Small markers and labels keep the state codes readable
fig.update_traces(marker=dict(size=3), textfont=dict(size=5))
fig.show()

## Visualize the Euclidean distances using t-SNE to reduce dimensionality
> With total mwh produced as an extra dimension

In [18]:
# Use t-SNE to reduce the profiles (percentage shares plus total MWh) to 3D
# (random_state is fixed so the embedding is reproducible)
# NOTE(review): 'TOTAL (MWH)' is in raw MWh while the other columns are
# percentages, so the embedding is likely driven almost entirely by the total.
# Consider standardising the columns before fitting.
tsne = TSNE(n_components=3, random_state=42)
reduced_data_tsne = tsne.fit_transform(data_2016)

# Create a dataframe with the reduced data, labelled by state
reduced_df_tsne = pd.DataFrame(reduced_data_tsne, columns=['x', 'y', 'z'])
reduced_df_tsne['State'] = data_2016.index

# Colour the points by coal share so the continuous colour scale and its
# 'Coal %' legend actually apply. `.to_numpy()` avoids index alignment between
# the state-indexed source frame and the range-indexed embedding frame.
# If the frame has no 'COAL' column the plot is drawn uncoloured.
coal_col = 'Coal %' if 'COAL' in data_2016.columns else None
if coal_col is not None:
    reduced_df_tsne[coal_col] = data_2016['COAL'].to_numpy()

# Create interactive plotly scatter plot (title no longer ends in a stray ')')
fig = px.scatter_3d(
    reduced_df_tsne, x='x', y='y', z='z', text='State', hover_name='State',
    color=coal_col, color_continuous_scale='viridis', opacity=0.8,
    labels={'x': 't-SNE 1', 'y': 't-SNE 2', 'z': 't-SNE 3'},
    title='t-SNE of Energy Sources Profiles in 2016 (Including Total MWH Produced)',
)

# Small markers and labels keep the state codes readable
fig.update_traces(marker=dict(size=3), textfont=dict(size=5))
fig.show()