In [2]:
import sklearn
import pandas as pd
import numpy
import xgboost
import sklearn.model_selection as ms
from sklearn import metrics
import xgboost as xgb

In [5]:
def XGB_accuracy(X, Y): 
	# Split for training and testing
	x_train, x_test, y_train, y_test = ms.train_test_split(X, Y, test_size=0.2, random_state=0)

	# Fit the decision tree
	classifier = xgb.XGBClassifier(objective="binary:logistic", min_child_weight=10,eval_metric=["merror", "mlogloss"], max_depth= 5, n_estimators=1000,random_state=42) 
	# classifier = xgb.XGBClassifier(objective="multi:softprob", min_child_wight=10, max_depth=3, n_estimators=500) # For test purposes
	classifier = classifier.fit(x_train, y_train, verbose=False) 
	# Predictions
	y_pred = classifier.predict(x_test)
	return metrics.accuracy_score(y_test, y_pred)


In [6]:
def XGBRF_accuracy(X, Y): 
	# Split for training and testing
	x_train, x_test, y_train, y_test = ms.train_test_split(X, Y, test_size=0.2, random_state=0)
	# Fit the decision tree
	regressor = xgb.XGBRFRegressor(min_child_weight=10,objective='reg:squarederror',eval_metric=["merror", "mlogloss"], max_depth= 5, n_estimators=1000,random_state=42) 
	# classifier = xgb.XGBClassifier(objective="multi:softprob", min_child_wight=10, max_depth=3, n_estimators=500) # For test purposes
	regressor= regressor.fit(x_train, y_train, verbose=False) 
	# Predictions
	y_pred = regressor.predict(x_test)
	return regressor.score(y_test, y_pred)

In [11]:
# Data from Fisher et al 2025
abiotic_flux=pd.read_csv('Archean Earth flux network metrics, no life.csv') # Abiotic case
abiotic_steady_state=pd.read_csv('Archean Earth steady state network metrics, no life.csv') # Weird steady state abiotic case
biotic_flux=pd.read_csv('Archean Earth flux network metrics, with life.csv') # Biotic case
anomalous_high_flux=pd.read_csv('Archean Earth agnostic high flux network metrics, no life.csv') # Weird high flux abiotic case
exo_combined=pd.concat([abiotic_flux,abiotic_steady_state,biotic_flux,anomalous_high_flux])


In [9]:
exo_metrics=['CH4 abundance','Mean degree','Average shortest path length','Node betweenness centrality','Clustering coefficient']
exo_target=list(exo_combined['Has life?'])

for metric in exo_metrics:
	accuracy=XGB_accuracy(exo_combined[metric],exo_target)
	print("The accuracy of "+ metric+" is "+str(accuracy))


The accuracy of CH4 abundance is 0.8854679802955665
The accuracy of Mean degree is 0.9451970443349754
The accuracy of Average shortest path length is 0.9433497536945813
The accuracy of Node betweenness centrality is 0.8971674876847291
The accuracy of Clustering coefficient is 0.8195812807881774


In [8]:
# Looking promising!
# Now let's try with spectral data

exo_data=pd.read_csv('exo_data.csv')
exo_metrics=['CH4 abundance','Mean degree','Average shortest path length','Node betweenness centrality','Clustering coefficient']
exo_spec=exo_data[['CH4 abundance','Mean degree','Average shortest path length','Node betweenness centrality','Clustering coefficient']]
exo_target=list(exo_data['CFOS'])

for metric in exo_metrics:
	accuracy=XGBRF_accuracy(exo_spec[metric],exo_target)
	print("The R2 of "+ metric+" is "+str(accuracy))


The R2 of CH4 abundance is -1.0852696895599365
The R2 of Mean degree is -0.07874584197998047
The R2 of Average shortest path length is -0.1583791971206665
The R2 of Node betweenness centrality is -0.2837355136871338
The R2 of Clustering coefficient is -0.16504335403442383


In [7]:
# not looking great...let's try with spectral variance

exo_data=pd.read_csv('exo_data.csv')
exo_metrics=['CH4 abundance','Mean degree','Average shortest path length','Node betweenness centrality','Clustering coefficient']
exo_spec=exo_data[['CH4 abundance','Mean degree','Average shortest path length','Node betweenness centrality','Clustering coefficient']]
exo_target=list(exo_data['Spectral variance'])

for metric in exo_metrics:
	accuracy=XGBRF_accuracy(exo_spec[metric],exo_target)
	print("The R2 of "+ metric+" is "+str(accuracy))

The R2 of CH4 abundance is -0.8667113780975342
The R2 of Mean degree is -0.10064923763275146
The R2 of Average shortest path length is -0.09990060329437256
The R2 of Node betweenness centrality is -0.2825467586517334
The R2 of Clustering coefficient is -0.17543518543243408
