In [2]:
import pandas as pd
from stepmix.stepmix import StepMix
from sklearn.metrics import rand_score

In [2]:
# Goal: determine if latent class analysis (LCA) using stepmix can differentiate between abiotic and biotic worlds
# using atmospheric chemical reaction network topological metrics

In [4]:
# import exoplanet data; see Fisher et al 2025 for more info
abiotic_flux=pd.read_csv('Archean Earth flux network metrics, no life.csv') #typical abiotic case
biotic_flux=pd.read_csv('Archean Earth flux network metrics, with life.csv') #biotic case
abiotic_steady_state=pd.read_csv('Archean Earth steady state network metrics, no life.csv') #weird abiotic case
anomalous_high_flux=pd.read_csv('Archean Earth agnostic high flux network metrics, no life.csv') # second weird abiotic case

In [5]:
# first, let's analyze the typical abiotic case and the biotic case

In [12]:
exo_combined=pd.concat([abiotic_flux,biotic_flux])
exo_data=exo_combined[['Mean degree','Average shortest path length','CH4 abundance']]

In [13]:
model = StepMix(n_components=2, measurement="categorical",verbose=1)
model.fit(exo_data)

Fitting StepMix...


Initializations (n_init) : 100%|█| 1/1 [00:00<00:00,  9.58it/s, max_LL=-1.58e+4,

MODEL REPORT
    Measurement model parameters
          model_name                            categorical        
          class_no                                        0       1
          param variable                                           
          pis   Average shortest path length_10      0.0132  0.3288
                Average shortest path length_11      0.0717  0.1488
                Average shortest path length_12      0.2362  0.8100
                Average shortest path length_13      0.9998  0.1595
                Average shortest path length_14      0.2606  0.0000
                Average shortest path length_4       0.0000  0.0009
                Average shortest path length_5       0.0000  0.0014
                Average shortest path length_6       0.0000  0.0017
                Average shortest path length_7       0.0000  0.0068
                Average shortest path length_8       0.0020  0.1090
                Average shortest path length_9       0.0090  0.4101
  




In [14]:
#calculate predictions based on LCA weights and stats
exo_data['exo_predict']=model.predict(exo_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exo_data['exo_predict']=model.predict(exo_data)


In [15]:
#Now let's see how accurate the predictions turned out to be, first using crosstabs
exo_crosstabs=pd.crosstab(exo_data['exo_predict'],exo_combined['Has life?'])
print(exo_crosstabs)

Has life?       0     1
exo_predict            
0             466  1984
1            4210    41


In [16]:
#And then by calculating the Rand score
rand=rand_score(exo_data['exo_predict'],exo_combined['Has life?'])
print(rand)

0.8601074021030499


In [6]:
#Not bad! Let's make it a little more interesting by including the weird steady state abiotic cases

In [3]:
exo_combined=pd.concat([abiotic_flux,biotic_flux,abiotic_steady_state,anomalous_high_flux])
exo_data=exo_combined[['Mean degree','Average shortest path length','CH4 abundance']]

NameError: name 'abiotic_flux' is not defined

In [8]:
model = StepMix(n_components=2, measurement="categorical",verbose=1)
model.fit(exo_data)

Fitting StepMix...


Initializations (n_init) : 100%|█| 1/1 [00:00<00:00,  2.00it/s, max_LL=-2.13e+4,

MODEL REPORT
    Measurement model parameters
          model_name                            categorical        
          class_no                                        0       1
          param variable                                           
          pis   Average shortest path length_10      0.0809  0.3495
                Average shortest path length_11      0.2566  0.0502
                Average shortest path length_12      0.1541  0.9192
                Average shortest path length_13      0.9016  0.0573
                Average shortest path length_14      0.2472  0.0000
                Average shortest path length_4       0.0009  0.0000
                Average shortest path length_5       0.0014  0.0000
                Average shortest path length_6       0.0016  0.0000
                Average shortest path length_7       0.0016  0.0059
                Average shortest path length_8       0.0041  0.1208
                Average shortest path length_9       0.0079  0.4707
  




In [9]:
# Generate predictions
exo_data['exo_predict']=model.predict(exo_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exo_data['exo_predict']=model.predict(exo_data)


In [11]:
# Now let's see how we did

exo_crosstabs=pd.crosstab(exo_data['exo_predict'],exo_combined['Has life?'])

print(exo_crosstabs)

rand=rand_score(exo_data['exo_predict'],exo_combined['Has life?'])

print(rand)


Has life?       0     1
exo_predict            
0            1624  2690
1            3770    33
0.6750268816828889


In [12]:
# Hmmmm...not as good. But we know from Fisher et al 2025 that CH4 abundance can actually be confounding when
# comparing the abiotic steady state case and the biotic case. Let's try using a different metric

In [13]:
exo_data=exo_combined[['Mean degree','Average shortest path length','Clustering coefficient']]
model = StepMix(n_components=2, measurement="categorical",verbose=1)
model.fit(exo_data)

Fitting StepMix...


Initializations (n_init) : 100%|█| 1/1 [00:00<00:00,  3.32it/s, max_LL=-1.22e+4,

MODEL REPORT
    Measurement model parameters
          model_name                            categorical        
          class_no                                        0       1
          param variable                                           
          pis   Average shortest path length_10      0.1339  0.0000
                Average shortest path length_11      0.8675  1.0000
                Average shortest path length_12      0.9592  0.0000
                Average shortest path length_13      0.0000  1.0000
                Average shortest path length_9       0.0088  0.0000
                Mean degree_11                       0.0196  0.0000
                Mean degree_12                       0.2208  0.0000
                Mean degree_13                       0.7596  0.1832
                Mean degree_14                       0.0000  0.8168
    Class weights
        Class 1 : 0.53
        Class 2 : 0.47
    Fit for 2 latent classes
    Estimation method             : 1-step
  




In [14]:
exo_data['exo_predict']=model.predict(exo_data)

exo_crosstabs=pd.crosstab(exo_data['exo_predict'],exo_combined['Has life?'])

print(exo_crosstabs)

rand=rand_score(exo_data['exo_predict'],exo_combined['Has life?'])

print(rand)

Has life?       0     1
exo_predict            
0            4081   258
1            1313  2465
0.6877915901332854


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exo_data['exo_predict']=model.predict(exo_data)


In [5]:
# Well, that's a little better
# Next step: investigating relationships between spectral features and network metrics

exo_data=pd.read_csv('exo_data.csv')
exo_spec=exo_data[['Mean degree','Average shortest path length','CH4 abundance','CFOS']]
model = StepMix(n_components=2, measurement="continuous",verbose=1)
model.fit(exo_spec)

Fitting StepMix...


Initializations (n_init) : 100%|█| 1/1 [00:00<00:00, 31.90it/s, max_LL=-3.31e+4,

MODEL REPORT
    Measurement model parameters
          model_name                               gaussian_diag              
          class_no                                             0             1
          param       variable                                                
          covariances Average shortest path length  2.072000e-01  6.494000e-01
                      CFOS                          5.039984e+08  1.671135e+08
                      CH4 abundance                 1.688378e+30  1.119297e+19
                      Mean degree                   8.450000e-02  3.108000e-01
          means       Average shortest path length  9.196400e+00  9.607500e+00
                      CFOS                          2.859084e+04  1.437408e+04
                      CH4 abundance                 1.988731e+14  1.339298e+09
                      Mean degree                   1.088890e+01  1.100540e+01
    Class weights
        Class 1 : 0.28
        Class 2 : 0.72
    Fit for 2 latent 




In [5]:
exo_spec['exo_predict']=model.predict(exo_spec)

exo_crosstabs=pd.crosstab(exo_spec['exo_predict'],exo_spec['CFOS'])

print(exo_crosstabs)

rand=rand_score(exo_spec['exo_predict'],exo_spec['CFOS'])

print(rand)

CFOS         9483.728434   9684.319246   9699.614554   9719.139278   \
exo_predict                                                           
0                       0             0             1             2   
1                       1             3             0             0   

CFOS         9721.905443   9723.534760   9726.792440   9735.097210   \
exo_predict                                                           
0                       0             0             0             0   
1                       1             1             1             1   

CFOS         9738.529733   9750.914025   ...  58988.833876  58988.834087  \
exo_predict                              ...                               
0                       0             0  ...             1             0   
1                       1             1  ...             0             1   

CFOS         58988.834310  58988.834635  58988.834817  58988.845803  \
exo_predict                                           

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exo_spec['exo_predict']=model.predict(exo_spec)


In [6]:
# Well, that's underwhelming--let's try with the spectral variance, instead

exo_spec=exo_data[['Mean degree','Average shortest path length','CH4 abundance','Spectral variance']]
model = StepMix(n_components=2, measurement="continuous",verbose=1)
model.fit(exo_spec)

Fitting StepMix...


Initializations (n_init) : 100%|█| 1/1 [00:00<00:00, 38.30it/s, max_LL=-3.05e+4,

MODEL REPORT
    Measurement model parameters
          model_name                               gaussian_diag              
          class_no                                             0             1
          param       variable                                                
          covariances Average shortest path length  2.072000e-01  6.493000e-01
                      CH4 abundance                 1.688604e+30  1.119960e+19
                      Mean degree                   8.450000e-02  3.108000e-01
                      Spectral variance             9.292790e+05  3.102234e+05
          means       Average shortest path length  9.196400e+00  9.607400e+00
                      CH4 abundance                 1.989004e+14  1.339917e+09
                      Mean degree                   1.088890e+01  1.100540e+01
                      Spectral variance             8.164094e+02  2.125341e+02
    Class weights
        Class 1 : 0.28
        Class 2 : 0.72
    Fit for 2 latent 




In [7]:
exo_spec['exo_predict']=model.predict(exo_spec)

exo_crosstabs=pd.crosstab(exo_spec['exo_predict'],exo_spec['Spectral variance'])

print(exo_crosstabs)

rand=rand_score(exo_spec['exo_predict'],exo_spec['Spectral variance'])

print(rand)

Spectral variance  31.524045    31.886747    31.904119    32.116366    \
exo_predict                                                             
0                            0            1            1            0   
1                            1            1            0            1   

Spectral variance  32.309558    32.770728    32.835443    32.884138    \
exo_predict                                                             
0                            0            0            1            1   
1                            1            1            0            0   

Spectral variance  33.103554    33.120066    ...  2138.160053  2138.160054  \
exo_predict                                  ...                             
0                            1            0  ...            1            1   
1                            0            1  ...            0            0   

Spectral variance  2138.160054  2138.160058  2138.160061  2138.160066  \
exo_predict                 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exo_spec['exo_predict']=model.predict(exo_spec)
