# ANU ASTR4004 2025 - Week 8

Author: Dr Sven Buder (sven.buder@anu.edu.au)

In [None]:
try:
    %matplotlib inline
    %config InlineBackend.figure_format='retina'
except:
    pass

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from matplotlib import patches

# Presentation-friendly defaults: larger general font, slightly smaller legends
plt.rcParams.update({
    'font.size': 15,
    'legend.fontsize': 12,
})

## Intro to PCA

In [None]:
# Reproducible toy data set (seed fixed to 1): 200 two-dimensional points made
# by mixing standard-normal draws with a random 2x2 matrix.
rng = np.random.RandomState(1)
mixing_matrix = rng.rand(2, 2)       # drawn first to keep the random stream order
gaussian_draws = rng.randn(2, 200)
X = np.dot(mixing_matrix, gaussian_draws).T

In [None]:
# Fit a two-component PCA to the toy data; with whiten=True the transformed
# components are rescaled to unit variance.
from sklearn.decomposition import PCA
pca = PCA(whiten=True, n_components=2)
pca.fit(X)

In [None]:
# Two side-by-side panels: input data (left) and its PCA view (right)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 4.5))
fig.subplots_adjust(wspace=0.1, left=0.0625, right=0.95)

# define a function to draw vectors:
def draw_vector(v0, v1, ax=None):
    """Draw an arrow from point ``v0`` to point ``v1``.

    Parameters
    ----------
    v0, v1 : sequence of two numbers
        Start and end point of the arrow in data coordinates.
    ax : axes-like object, optional
        Object whose ``annotate`` method draws the arrow. If omitted, the
        current pyplot axes (``plt.gca()``) are used.
    """
    # Test against None explicitly so the choice does not depend on the
    # truthiness of whatever axes object is passed in.
    if ax is None:
        ax = plt.gca()
    arrow_style = dict(arrowstyle='->',
                       linewidth=2,
                       shrinkA=0, shrinkB=0)
    # An empty-text annotation: only the arrow from v0 (text position) to v1 is drawn.
    ax.annotate('', v1, v0, arrowprops=arrow_style)

# Left panel: the raw points plus one arrow per principal axis. Each arrow
# starts at the data mean and is 3 * sqrt(explained variance) long.
ax[0].scatter(X[:, 0], X[:, 1], alpha=0.2)
for variance, direction in zip(pca.explained_variance_, pca.components_):
    arrow_tip = pca.mean_ + direction * 3 * np.sqrt(variance)
    draw_vector(pca.mean_, arrow_tip, ax=ax[0])
ax[0].axis('equal');
ax[0].set(xlabel='x', ylabel='y', title='input')

# Right panel: the same points expressed in principal-component coordinates,
# with arrows of length 3 along each component axis.
X_pca = pca.transform(X)
ax[1].scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.2)
for arrow_tip in ([0, 3], [3, 0]):
    draw_vector([0, 0], arrow_tip, ax=ax[1])
ax[1].set(xlabel='component 1', ylabel='component 2',
          title='principal components',
          xlim=(-5, 5), ylim=(-3, 3.1))
ax[1].axis('equal')

plt.tight_layout()
plt.show()

## GALAH DR4

GALAH DR4 is not yet public (but it will be next Tuesday, 1st Oct 2024!).

You get exclusive access - but without knowing which star the measurements belong to.

### Reformatting of table columns

In [None]:
# These are all the elements measured by GALAH
all_elements = [
    'Li', 'C', 'N', 'O', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 
    'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Rb', 'Sr', 'Y', 'Zr', 'Mo', 'Ru', 'Ba', 'La', 'Ce', 'Nd', 'Sm', 'Eu'
]
print('Nr. of elements: ', len(all_elements))

try:

    # Read in the GALAH data table (not public yet)
    # NOTE(review): `Table` is not imported anywhere in the visible notebook -
    # presumably astropy.table.Table; confirm and add the import to the setup cell.
    data_df = Table.read('/Users/buder/GALAH_DR4/catalogs/galah_dr4_allstar_240705.fits')


    # Li is reported as A(Li). [Li/H] = A(Li) - A(Li)_sun. [Li/Fe] = [Li/H] - [Fe/H]
    data_df['li_h'] = data_df['a_li'] - 1.05
    data_df['li_fe'] = data_df['li_h'] - data_df['fe_h']
    # Both Li uncertainties are the mean of the lower and upper A(Li) errors
    data_df['e_li_fe'] = 0.5*(data_df['e_a_li_low']+data_df['e_a_li_upp'])
    data_df['e_li_h'] = 0.5*(data_df['e_a_li_low']+data_df['e_a_li_upp'])
    data_df['flag_li_h'] = data_df['flag_a_li']
    data_df['flag_li_fe'] = data_df['flag_a_li']

    # For ease: create [X/H] = [X/Fe] + [Fe/H] for every element, copying the
    # flag and uncertainty from the [X/Fe] columns. Iterate over `all_elements`:
    # the name `elements` is only defined in a later cell, so using it here
    # fails on a fresh kernel.
    for element in all_elements:
        if element not in ['Li','Fe']:
            name = element.lower()
            data_df[f'{name}_h'] = data_df[f'{name}_fe'] + data_df['fe_h']
            data_df[f'flag_{name}_h'] = data_df[f'flag_{name}_fe']
            data_df[f'e_{name}_h'] = data_df[f'e_{name}_fe']
except Exception as err:
    # Stay best-effort (students without the catalogue can continue), but show
    # the actual reason so that coding errors are not mistaken for missing data.
    print('You do not have access to GALAH DR4.')
    print(f'(reason: {type(err).__name__}: {err})')

### Preparing abundance data of main-sequence and turn-off stars

In [None]:
try:
    # Define the list of elements of interest (column names in the FITS file)
    elements = [
        'Li', # Li is not measured for a lot of stars
    #     'C','N', # C and N not measured in these stars
        'O','Na','Mg','Al','Si','K','Ca','Sc','Ti',
        'V','Cr','Mn','Fe','Co','Ni','Cu','Zn',
    #     'Rb', 'Sr', # Are measured only for a tiny fraction of MSTO stars
        'Y',
    #     'Zr', 'Mo', 'Ru', # Are measured only for a tiny fraction of MSTO stars
        'Ba',
    #     'Ce', 'Nd', 'Sm', 'Eu' # Are measured only for a tiny fraction of MSTO stars
    ]

    # Create [X/H] by adding [X/Fe] + [Fe/H]; the flag and uncertainty of
    # [X/H] are copied from the corresponding [X/Fe] columns.
    for element in elements:
        if element == 'Fe':
            continue
        name = element.lower()
        data_df[f'{name}_h'] = data_df[f'{name}_fe'] + data_df['fe_h']
        data_df[f'flag_{name}_h'] = data_df[f'flag_{name}_fe']
        data_df[f'e_{name}_h'] = data_df[f'e_{name}_fe']

    # Reset wrong flag_fe_h
    data_df['flag_fe_h'] = 0

    # Keep stars where every selected abundance is unflagged with an
    # uncertainty below 0.1, the stellar-parameter flag is 0, snr_px_ccd2 > 50,
    # 4250 < teff < 6500 and logg > 3.5.
    unflagged_stars = (
        np.all([data_df[f'flag_{element.lower()}_h'] == 0 for element in elements],axis=0) &
        np.all([data_df[f'e_{element.lower()}_h'] < 0.1 for element in elements],axis=0) &
        (data_df['flag_sp'] == 0) &
        (data_df['snr_px_ccd2'] > 50) &
        (data_df['teff'] < 6500) &
        (data_df['teff'] > 4250) &
        (data_df['logg'] > 3.5)
    )

    print('Remaining stars for '+str(len(elements))+' elements: ', len(data_df['teff'][unflagged_stars]))

    # Write only the [X/H] columns of the selected stars
    data_msto = data_df[[element.lower()+'_h' for element in elements]][unflagged_stars]
    data_msto.write('ASTR4004_2025/astr4004_2025_week8/data/galah_abundances_msto.csv',overwrite=True)

except Exception as err:
    # Best-effort cell: without the catalogue (`data_df`) there is nothing to
    # prepare. Report why instead of failing silently.
    print(f'Skipping preparation of the MSTO abundance table ({type(err).__name__}: {err})')

### Preparing abundance data of giant stars

In [None]:
try:
    # Define the list of elements of interest (column names in the FITS file)
    elements = [
    #     'Li', # Li is not measured for a lot of stars
        'C','N',
        'O','Na','Mg','Al','Si',
    #     'K', # let's neglect this for now
        'Ca','Sc','Ti',
        'V','Cr','Mn','Fe','Co','Ni','Cu','Zn',
    #     'Rb', 'Sr', # left out (comment inherited from the MSTO cell: measured only for a tiny fraction of stars)
        'Y',
    #     'Zr', 'Mo', 'Ru', # left out (comment inherited from the MSTO cell: measured only for a tiny fraction of stars)
        'Ba',
        'Ce', 
        'Nd', 
        'Sm', 
        'Eu' # This is the only r-process element, so let's include it.
    ]

    # Create [X/H] by adding [X/Fe] + [Fe/H]; the flag and uncertainty of
    # [X/H] are copied from the corresponding [X/Fe] columns.
    for element in elements:
        if element == 'Fe':
            continue
        name = element.lower()
        data_df[f'{name}_h'] = data_df[f'{name}_fe'] + data_df['fe_h']
        data_df[f'flag_{name}_h'] = data_df[f'flag_{name}_fe']
        data_df[f'e_{name}_h'] = data_df[f'e_{name}_fe']

    # Reset wrong flag_fe_h
    data_df['flag_fe_h'] = 0

    # Same quality cuts as for the MSTO sample, but selecting logg < 3.5.
    unflagged_stars = (
        np.all([data_df[f'flag_{element.lower()}_h'] == 0 for element in elements],axis=0) &
        np.all([data_df[f'e_{element.lower()}_h'] < 0.1 for element in elements],axis=0) &
        (data_df['flag_sp'] == 0) &
        (data_df['snr_px_ccd2'] > 50) &
        (data_df['teff'] < 6500) &
        (data_df['teff'] > 4250) &
        (data_df['logg'] < 3.5)
    )

    print('Remaining stars for '+str(len(elements))+' elements: ', len(data_df['teff'][unflagged_stars]))

    # NOTE(review): the variable keeps the name `data_msto` used by the MSTO
    # cell although it now holds the giant-star selection; kept unchanged in
    # case other cells refer to it.
    data_msto = data_df[[element.lower()+'_h' for element in elements]][unflagged_stars]
    data_msto.write('ASTR4004_2025/astr4004_2025_week8/data/galah_abundances_giants.csv',overwrite=True)

except Exception as err:
    # Best-effort cell: without the catalogue (`data_df`) there is nothing to
    # prepare. Report why instead of failing silently.
    print(f'Skipping preparation of the giant-star abundance table ({type(err).__name__}: {err})')

## Abundance dimensionality of main-sequence and turn-off stars (extensive)

### Overview of the Data

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Elements analysed for the main-sequence / turn-off (MSTO) sample. The cells
# below look up one lower-case [X/H] column per element ('li_h', ..., 'fe_h').
elements = [
    'Li',                        # Li is not measured for a lot of stars
    # 'C', 'N',                  # C and N not measured in these stars
    'O', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Sc', 'Ti',
    'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',
    # 'Rb', 'Sr',                # Are measured only for a tiny fraction of MSTO stars
    'Y',
    # 'Zr', 'Mo', 'Ru',          # Are measured only for a tiny fraction of MSTO stars
    'Ba',
    # 'Ce', 'Nd', 'Sm', 'Eu',    # Are measured only for a tiny fraction of MSTO stars
]

# Load the MSTO abundance table into the DataFrame `abundances`
abundances = pd.read_csv('data/galah_abundances_msto.csv')

In [None]:
# Overview of the data: a 5x5 grid with one [X/H] vs. [Fe/H] panel per element
# (axes are shared so all panels use the same ranges)
fig, axs = plt.subplots(nrows=5, ncols=5, figsize=(20, 20), sharex=True, sharey=True)
axs = axs.ravel()

for panel, element in zip(axs, elements):
    panel.scatter(abundances['fe_h'], abundances[f'{element.lower()}_h'], alpha=0.5, s=1)
    panel.set_title(f"[{element}/H] vs. [Fe/H]")
    panel.set_xlabel("[Fe/H]")
    panel.set_ylabel(f"[{element}/H]")

plt.tight_layout()
plt.show()


In [None]:
# Get an overview of the data
fig, axs = plt.subplots(5, 5, figsize=(20, 20), sharex=True, sharey=True)
axs = axs.ravel()

# Plot [X/Fe] vs [Fe/H], where [X/Fe] = [X/H] - [Fe/H]
for i, element in enumerate(elements):
    x_fe = abundances[f'{element.lower()}_h'] - abundances['fe_h']
    axs[i].scatter(abundances['fe_h'], x_fe, alpha=0.5, s=1)
    # The plotted ratio is relative to Fe, so label it [X/Fe]
    axs[i].set_title(f"[{element}/Fe] vs. [Fe/H]")
    axs[i].set_xlabel("[Fe/H]")
    axs[i].set_ylabel(f"[{element}/Fe]")

plt.tight_layout()
plt.show()


### Prepare Data

Before applying PCA, ensure the data is cleaned (i.e., missing values handled) and standardized:

In [None]:
from sklearn.preprocessing import StandardScaler

# Replace NaN values with column means (if necessary). Reassign instead of
# filling in place, so the step does not mutate the frame behind other
# references and the cell stays safe to re-run.
abundances = abundances.fillna(abundances.mean())

# Select one [X/H] column per entry of `elements`. 'Fe' is part of `elements`,
# so fe_h is included; remove 'Fe' from `elements` to leave it out of the PCA.
abundance_data = abundances[[f'{element.lower()}_h' for element in elements]]

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(abundance_data)

### Apply PCA

We will apply PCA and examine the explained variance:

In [None]:
from sklearn.decomposition import PCA

# Apply PCA, keeping len(elements) - 1 components, i.e. one fewer than the
# number of entries in `elements`.
# NOTE(review): if fe_h is among the input columns, the cumulative curve below
# stops short of 1 because one component is dropped - confirm this is intended.
pca = PCA(n_components=len(elements) - 1)
pca_result = pca.fit_transform(scaled_data)

# Explained variance
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# Plot the explained variance
plt.figure(figsize=(8, 6))
plt.plot(range(1, len(explained_variance) + 1), cumulative_variance, marker='o', linestyle='--')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Cumulative Explained Variance by Principal Components')
# Tick every second component, derived from the actual number of components
plt.xticks(list(range(1, len(explained_variance) + 1, 2)))
plt.grid(True)
plt.show()

print(f"Cumulative explained variance: {cumulative_variance}")

This plot helps you determine how many principal components (PCs) are needed to explain most of the variance.  

Typically, you'll want to retain PCs that explain 95% or more of the variance.

### Interpret Principal Components

Now, let's inspect the first few principal components and see how they correlate with the original element abundances:

In [None]:
# Loadings table: one row per element, one column per principal component
# (the transposed `pca.components_` matrix)
loadings = pd.DataFrame(
    pca.components_.T,
    index=[f'{el}_H' for el in elements],
    columns=[f'PC{i+1}' for i in range(pca.n_components_)],
)

# Show the loadings of the first 3 PCs
print(loadings.iloc[:, :3])

The loadings matrix shows how much each element contributes to the principal components.

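Because the data were standardized, you can interpret these as being proportional to the correlations between the PCs and the abundances (multiplying a loading by the square root of that PC's explained variance gives the correlation itself).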

### Understanding Chemical Dimensionality via Visualizations

#### Biplot of the First Two Principal Components

A biplot can help you understand how the first two PCs relate to the original abundances:

In [None]:
# Biplot: density of the first two principal components (both panels) with the
# loading directions overlaid on the right-hand panel
f, gs = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# 2D histograms of the PC1/PC2 scores; cmin=1 leaves empty bins undrawn
for panel, colormap in zip(gs, ('Greys_r', 'Greys')):
    panel.hist2d(pca_result[:, 0], pca_result[:, 1], bins=100, cmap=colormap, cmin=1)
    panel.set_xlabel('PC1')
    panel.set_ylabel('PC2')

# One arrow (scaled by a factor of 5) and one label per element, skipping Fe
for i, element in enumerate(elements):
    if element == 'Fe':
        continue
    tip_x = loadings.iloc[i, 0] * 5
    tip_y = loadings.iloc[i, 1] * 5
    gs[1].arrow(0, 0, tip_x, tip_y, color='C0', head_width=0.05)
    gs[1].text(tip_x, tip_y, f'{element}', color='C1')

gs[1].set_xlim(-5,5)
gs[1].set_ylim(-1.5,3.0)
plt.grid(True)
plt.tight_layout()
plt.show()

#### Heatmap of the Principal Component Loadings

A heatmap of the PCA loadings can reveal patterns, such as grouping of elements based on nucleosynthesis processes (e.g., α-elements, iron-peak elements):

In [None]:
import seaborn as sns

# Annotated heatmap of the loadings of (at most) the first 10 PCs
plt.figure(figsize=(15, 10))
first_pcs = loadings.iloc[:, :10]
sns.heatmap(first_pcs, cmap="coolwarm", annot=True)
plt.title('Heatmap of PCA Loadings (First 10 PCs)')
plt.show()

This heatmap will show which elements are most strongly associated with each PC.

#### Projection onto Principal Components

Finally, you can project stars onto the principal component space to see how they cluster:

In [None]:
# Stars in the PC1-PC2 plane, coloured by [Fe/H] (left) and [Li/Fe] (right)
f, gs = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
colourings = [
    (abundances['fe_h'], '[Fe/H]'),
    (abundances['li_h']-abundances['fe_h'], '[Li/Fe]'),
]
for ax, (colour_values, colour_label) in zip(gs, colourings):
    s = ax.scatter(pca_result[:, 0], pca_result[:, 1], s=5, c=colour_values, cmap='RdYlBu')
    plt.colorbar(s, ax=ax,label=colour_label)
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')

plt.tight_layout()
plt.show()

In [None]:
# Stars in the PC1-PC3 plane, coloured by [Ca/H] (left) and [O/Fe] (right).
# The y values are pca_result[:, 2], i.e. the third component, so the axis is
# labelled PC3.
f, gs = plt.subplots(1,2,figsize=(15,5))
ax = gs[0]
s = ax.scatter(pca_result[:, 0], pca_result[:, 2], s=5, c=abundances['ca_h'], cmap='RdYlBu')
plt.colorbar(s, ax=ax,label='[Ca/H]')
ax.set_xlabel('PC1')
ax.set_ylabel('PC3')

ax = gs[1]
s = ax.scatter(pca_result[:, 0], pca_result[:, 2], s=5, c=abundances['o_h']-abundances['fe_h'], cmap='RdYlBu')
plt.colorbar(s, ax=ax,label='[O/Fe]')
ax.set_xlabel('PC1')
ax.set_ylabel('PC3')

plt.tight_layout()
plt.show()

## Abundance dimensionality of main-sequence and turn-off stars

### Overview of the Data

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Full MSTO element list, kept for reference; it is replaced by the reduced
# list right below.
elements = [
    'Li',                        # Li is not measured for a lot of stars
    # 'C', 'N',                  # C and N not measured in these stars
    'O', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Sc', 'Ti',
    'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',
    # 'Rb', 'Sr',                # Are measured only for a tiny fraction of MSTO stars
    'Y',
    # 'Zr', 'Mo', 'Ru',          # Are measured only for a tiny fraction of MSTO stars
    'Ba',
    # 'Ce', 'Nd', 'Sm', 'Eu',    # Are measured only for a tiny fraction of MSTO stars
]

# Let's start with only Fe and the alpha-process elements Mg, Si, Ca, Ti, and the s-process elements Y, Ba
elements = ['Mg', 'Si', 'Ca', 'Ti', 'Fe', 'Y', 'Ba']

# Load the MSTO abundance table (lower-case [X/H] columns such as 'mg_h', 'fe_h')
abundances = pd.read_csv('data/galah_abundances_msto.csv')

In [None]:
# Overview of the data: a 5x5 grid with one [X/H] vs. [Fe/H] panel per element
# (axes are shared so all panels use the same ranges)
fig, axs = plt.subplots(nrows=5, ncols=5, figsize=(20, 20), sharex=True, sharey=True)
axs = axs.ravel()

for panel, element in zip(axs, elements):
    panel.scatter(abundances['fe_h'], abundances[f'{element.lower()}_h'], alpha=0.5, s=1)
    panel.set_title(f"[{element}/H] vs. [Fe/H]")
    panel.set_xlabel("[Fe/H]")
    panel.set_ylabel(f"[{element}/H]")

plt.tight_layout()
plt.show()


In [None]:
# Get an overview of the data
fig, axs = plt.subplots(5, 5, figsize=(20, 20), sharex=True, sharey=True)
axs = axs.ravel()

# Plot [X/Fe] vs [Fe/H], where [X/Fe] = [X/H] - [Fe/H]
for i, element in enumerate(elements):
    x_fe = abundances[f'{element.lower()}_h'] - abundances['fe_h']
    axs[i].scatter(abundances['fe_h'], x_fe, alpha=0.5, s=1)
    # The plotted ratio is relative to Fe, so label it [X/Fe]
    axs[i].set_title(f"[{element}/Fe] vs. [Fe/H]")
    axs[i].set_xlabel("[Fe/H]")
    axs[i].set_ylabel(f"[{element}/Fe]")

plt.tight_layout()
plt.show()


### Prepare Data

Before applying PCA, ensure the data is cleaned (i.e., missing values handled) and standardized:

In [None]:
from sklearn.preprocessing import StandardScaler

# Replace NaN values with column means (if necessary). Reassign instead of
# filling in place, so the step does not mutate the frame behind other
# references and the cell stays safe to re-run.
abundances = abundances.fillna(abundances.mean())

# Select one [X/H] column per entry of `elements`. 'Fe' is part of `elements`,
# so fe_h is included; remove 'Fe' from `elements` to leave it out of the PCA.
abundance_data = abundances[[f'{element.lower()}_h' for element in elements]]

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(abundance_data)

### Apply PCA

We will apply PCA and examine the explained variance:

In [None]:
from sklearn.decomposition import PCA

# Fit the PCA with len(elements) - 1 components (one fewer than the number of
# entries in `elements`) and project the standardized data onto them
pca = PCA(n_components=len(elements) - 1)
pca_result = pca.fit_transform(scaled_data)

# Fraction of the variance explained per component, and its running total
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# Cumulative explained variance against the number of components kept
plt.figure(figsize=(8, 6))
component_numbers = range(1, len(explained_variance) + 1)
plt.plot(component_numbers, cumulative_variance, linestyle='--', marker='o')
plt.title('Cumulative Explained Variance by Principal Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()

print(f"Cumulative explained variance: {cumulative_variance}")

This plot helps you determine how many principal components (PCs) are needed to explain most of the variance.  

Typically, you'll want to retain PCs that explain 95% or more of the variance.

### Interpret Principal Components

Now, let's inspect the first few principal components and see how they correlate with the original element abundances:

In [None]:
# Loadings table: one row per element, one column per principal component
# (the transposed `pca.components_` matrix)
loadings = pd.DataFrame(
    pca.components_.T,
    index=[f'{el}_H' for el in elements],
    columns=[f'PC{i+1}' for i in range(pca.n_components_)],
)

# Show the loadings of the first 3 PCs
print(loadings.iloc[:, :3])

The loadings matrix shows how much each element contributes to the principal components.

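Because the data were standardized, you can interpret these as being proportional to the correlations between the PCs and the abundances (multiplying a loading by the square root of that PC's explained variance gives the correlation itself).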

### Understanding Chemical Dimensionality via Visualizations

#### Biplot of the First Two Principal Components

A biplot can help you understand how the first two PCs relate to the original abundances:

In [None]:
# Biplot: density of the first two principal components (both panels) with the
# loading directions overlaid on the right-hand panel
f, gs = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# 2D histograms of the PC1/PC2 scores; cmin=1 leaves empty bins undrawn
for panel, colormap in zip(gs, ('Greys_r', 'Greys')):
    panel.hist2d(pca_result[:, 0], pca_result[:, 1], bins=100, cmap=colormap, cmin=1)
    panel.set_xlabel('PC1')
    panel.set_ylabel('PC2')

# One arrow (scaled by a factor of 5) and one label per element, skipping Fe
for i, element in enumerate(elements):
    if element == 'Fe':
        continue
    tip_x = loadings.iloc[i, 0] * 5
    tip_y = loadings.iloc[i, 1] * 5
    gs[1].arrow(0, 0, tip_x, tip_y, color='C0', head_width=0.05)
    gs[1].text(tip_x, tip_y, f'{element}', color='C1')

gs[1].set_xlim(-5,5)
gs[1].set_ylim(-1.5,3.0)
plt.grid(True)
plt.tight_layout()
plt.show()

#### Heatmap of the Principal Component Loadings

A heatmap of the PCA loadings can reveal patterns, such as grouping of elements based on nucleosynthesis processes (e.g., α-elements, iron-peak elements):

In [None]:
import seaborn as sns

# Heatmap of the loadings for the first few PCs (at most 10; fewer if the PCA
# has fewer components)
shown_loadings = loadings.iloc[:, :10]
plt.figure(figsize=(15, 10))
sns.heatmap(shown_loadings, annot=True, cmap="coolwarm")
# Take the number in the title from what is actually displayed
plt.title(f'Heatmap of PCA Loadings (First {shown_loadings.shape[1]} PCs)')
plt.show()

This heatmap will show which elements are most strongly associated with each PC.

#### Projection onto Principal Components

Finally, you can project stars onto the principal component space to see how they cluster:

In [None]:
# Stars in the PC1-PC2 plane, coloured by [Fe/H] (left) and [Ba/Fe] (right)
f, gs = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
colourings = [
    (abundances['fe_h'], '[Fe/H]'),
    (abundances['ba_h']-abundances['fe_h'], '[Ba/Fe]'),
]
for ax, (colour_values, colour_label) in zip(gs, colourings):
    s = ax.scatter(pca_result[:, 0], pca_result[:, 1], s=5, c=colour_values, cmap='RdYlBu')
    plt.colorbar(s, ax=ax,label=colour_label)
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')

plt.tight_layout()
plt.show()

In [None]:
# Stars in the PC1-PC3 plane, coloured by [Ca/H] (left) and [Ca/Fe] (right).
# The y values are pca_result[:, 2], i.e. the third component, so the axis is
# labelled PC3.
f, gs = plt.subplots(1,2,figsize=(15,5))
ax = gs[0]
s = ax.scatter(pca_result[:, 0], pca_result[:, 2], s=5, c=abundances['ca_h'], cmap='RdYlBu')
plt.colorbar(s, ax=ax,label='[Ca/H]')
ax.set_xlabel('PC1')
ax.set_ylabel('PC3')

ax = gs[1]
s = ax.scatter(pca_result[:, 0], pca_result[:, 2], s=5, c=abundances['ca_h']-abundances['fe_h'], cmap='RdYlBu')
plt.colorbar(s, ax=ax,label='[Ca/Fe]')
ax.set_xlabel('PC1')
ax.set_ylabel('PC3')

plt.tight_layout()
plt.show()

## Abundance dimensionality of giant stars

### Overview of the Data

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Elements requested for the giant-star sample; the table is expected to hold
# one lower-case [X/H] column per element ('c_h', ..., 'fe_h').
elements = [
#     'Li', # Li is not measured for a lot of stars
    'C','N',
    'O','Na','Mg','Al','Si','K','Ca','Sc','Ti',
    'V','Cr','Mn','Fe','Co','Ni','Cu','Zn',
#     'Rb', 'Sr', # not used here
    'Y',
#     'Zr', 'Mo', 'Ru', # not used here
    'Ba',
    'Ce', 
    'Nd', 
    'Sm', 
#     'Eu' # This is the only r-process element, but not well measured
]


# Load the giant-star abundance table and keep stars with [Fe/H] > -1.
# .copy() gives an independent frame, so later edits do not act on a slice of
# the table that was read.
abundances = pd.read_csv('data/galah_abundances_giants.csv')
abundances = abundances[abundances['fe_h'] > -1].copy()

# The preparation cell earlier in this notebook writes the giants table
# without K, so a requested element may have no column. Keep only elements
# that are present, instead of hitting a KeyError in the cells below.
missing_elements = [el for el in elements if f'{el.lower()}_h' not in abundances.columns]
if missing_elements:
    print('No [X/H] column in the giants table for:', missing_elements, '- these elements are skipped')
elements = [el for el in elements if el not in missing_elements]

In [None]:
# Show the giant-star table as the cell output (a bare last expression is
# displayed by the notebook)
abundances

In [None]:
# Overview of the data: a 5x5 grid with one [X/H] vs. [Fe/H] panel per element
# (axes are shared so all panels use the same ranges)
fig, axs = plt.subplots(nrows=5, ncols=5, figsize=(20, 20), sharex=True, sharey=True)
axs = axs.ravel()

for panel, element in zip(axs, elements):
    panel.scatter(abundances['fe_h'], abundances[f'{element.lower()}_h'], alpha=0.5, s=1)
    panel.set_title(f"[{element}/H] vs. [Fe/H]")
    panel.set_xlabel("[Fe/H]")
    panel.set_ylabel(f"[{element}/H]")

plt.tight_layout()
plt.show()


In [None]:
# Get an overview of the data
fig, axs = plt.subplots(5, 5, figsize=(20, 20), sharex=True, sharey=True)
axs = axs.ravel()

# Plot [X/Fe] vs [Fe/H], where [X/Fe] = [X/H] - [Fe/H]
for i, element in enumerate(elements):
    x_fe = abundances[f'{element.lower()}_h'] - abundances['fe_h']
    axs[i].scatter(abundances['fe_h'], x_fe, alpha=0.5, s=1)
    # The plotted ratio is relative to Fe, so label it [X/Fe]
    axs[i].set_title(f"[{element}/Fe] vs. [Fe/H]")
    axs[i].set_xlabel("[Fe/H]")
    axs[i].set_ylabel(f"[{element}/Fe]")

plt.tight_layout()
plt.show()


### Prepare Data

Before applying PCA, ensure the data is cleaned (i.e., missing values handled) and standardized:

In [None]:
from sklearn.preprocessing import StandardScaler

# Replace NaN values with column means (if necessary). `abundances` is the
# result of a boolean filter here, so an in-place fill may raise pandas'
# SettingWithCopyWarning; reassigning the filled frame avoids that and keeps
# the cell safe to re-run.
abundances = abundances.fillna(abundances.mean())

# Select one [X/H] column per entry of `elements`. 'Fe' is part of `elements`,
# so fe_h is included; remove 'Fe' from `elements` to leave it out of the PCA.
abundance_data = abundances[[f'{element.lower()}_h' for element in elements]]

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(abundance_data)

### Apply PCA

We will apply PCA and examine the explained variance:

In [None]:
from sklearn.decomposition import PCA

# Fit the PCA with len(elements) - 1 components (one fewer than the number of
# entries in `elements`) and project the standardized data onto them
pca = PCA(n_components=len(elements) - 1)
pca_result = pca.fit_transform(scaled_data)

# Fraction of the variance explained per component, and its running total
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# Cumulative explained variance against the number of components kept
plt.figure(figsize=(8, 6))
component_numbers = range(1, len(explained_variance) + 1)
plt.plot(component_numbers, cumulative_variance, linestyle='--', marker='o')
plt.title('Cumulative Explained Variance by Principal Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()

print(f"Cumulative explained variance: {cumulative_variance}")

This plot helps you determine how many principal components (PCs) are needed to explain most of the variance.  

Typically, you'll want to retain PCs that explain 95% or more of the variance.

### Interpret Principal Components

Now, let's inspect the first few principal components and see how they correlate with the original element abundances:

In [None]:
# Loadings table: one row per element, one column per principal component
# (the transposed `pca.components_` matrix)
loadings = pd.DataFrame(
    pca.components_.T,
    index=[f'{el}_H' for el in elements],
    columns=[f'PC{i+1}' for i in range(pca.n_components_)],
)

# Show the loadings of the first 3 PCs
print(loadings.iloc[:, :3])

The loadings matrix shows how much each element contributes to the principal components.

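Because the data were standardized, you can interpret these as being proportional to the correlations between the PCs and the abundances (multiplying a loading by the square root of that PC's explained variance gives the correlation itself).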

### Understanding Chemical Dimensionality via Visualizations

#### Biplot of the First Two Principal Components

A biplot can help you understand how the first two PCs relate to the original abundances:

In [None]:
# Biplot: density of the first two principal components (both panels) with the
# loading directions overlaid on the right-hand panel
f, gs = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# 2D histograms of the PC1/PC2 scores; cmin=1 leaves empty bins undrawn
for panel, colormap in zip(gs, ('Greys_r', 'Greys')):
    panel.hist2d(pca_result[:, 0], pca_result[:, 1], bins=100, cmap=colormap, cmin=1)
    panel.set_xlabel('PC1')
    panel.set_ylabel('PC2')

# One arrow (scaled by a factor of 5) and one label per element, skipping Fe
for i, element in enumerate(elements):
    if element == 'Fe':
        continue
    tip_x = loadings.iloc[i, 0] * 5
    tip_y = loadings.iloc[i, 1] * 5
    gs[1].arrow(0, 0, tip_x, tip_y, color='C0', head_width=0.05)
    gs[1].text(tip_x, tip_y, f'{element}', color='C1')

gs[1].set_xlim(-5,5)
gs[1].set_ylim(-1.5,3.0)
plt.grid(True)
plt.tight_layout()
plt.show()

#### Heatmap of the Principal Component Loadings

A heatmap of the PCA loadings can reveal patterns, such as grouping of elements based on nucleosynthesis processes (e.g., α-elements, iron-peak elements):

In [None]:
import seaborn as sns

# Heatmap of the loadings for the first few PCs (at most 10; fewer if the PCA
# has fewer components)
shown_loadings = loadings.iloc[:, :10]
plt.figure(figsize=(15, 10))
sns.heatmap(shown_loadings, annot=True, cmap="coolwarm")
# Take the number in the title from what is actually displayed
plt.title(f'Heatmap of PCA Loadings (First {shown_loadings.shape[1]} PCs)')
plt.show()

This heatmap will show which elements are most strongly associated with each PC.

#### Projection onto Principal Components

Finally, you can project stars onto the principal component space to see how they cluster:

In [None]:
# Stars in the PC1-PC2 plane, coloured by [Fe/H] (left) and [Ba/Fe] (right)
f, gs = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
colourings = [
    (abundances['fe_h'], '[Fe/H]'),
    (abundances['ba_h']-abundances['fe_h'], '[Ba/Fe]'),
]
for ax, (colour_values, colour_label) in zip(gs, colourings):
    s = ax.scatter(pca_result[:, 0], pca_result[:, 1], s=5, c=colour_values, cmap='RdYlBu')
    plt.colorbar(s, ax=ax,label=colour_label)
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')

plt.tight_layout()
plt.show()

In [None]:
# Stars in the PC1-PC3 plane, coloured by [Al/H] (left) and [Al/Fe] (right).
# The y values are pca_result[:, 2], i.e. the third component, so the axis is
# labelled PC3.
f, gs = plt.subplots(1,2,figsize=(15,5))
ax = gs[0]
s = ax.scatter(pca_result[:, 0], pca_result[:, 2], s=5, c=abundances['al_h'], cmap='RdYlBu')
plt.colorbar(s, ax=ax,label='[Al/H]')
ax.set_xlabel('PC1')
ax.set_ylabel('PC3')

ax = gs[1]
s = ax.scatter(pca_result[:, 0], pca_result[:, 2], s=5, c=abundances['al_h']-abundances['fe_h'], cmap='RdYlBu')
plt.colorbar(s, ax=ax,label='[Al/Fe]')
ax.set_xlabel('PC1')
ax.set_ylabel('PC3')

plt.tight_layout()
plt.show()