In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Load the data. The file is expected to contain one "subject" column and one
# "visitcode" column; every remaining column is treated as a metabolome profile.
raw_df = pd.read_csv("/path/to/raw/metabolomics/file.csv")

# Identifier columns are set aside now and re-attached to the PC scores later.
id_columns = ['subject', 'visitcode']

# Take the identifiers from the frame that is already in memory instead of
# reading the CSV a second time. Selecting with a boolean mask keeps the
# columns in the order they appear in the file.
subject_visitcode_df = raw_df.loc[:, raw_df.columns.isin(id_columns)]

# Feature matrix: everything except the identifier columns
# (raises KeyError if either identifier column is missing from the file).
df = raw_df.drop(id_columns, axis=1)

# Apply log2 transformation to the data (adding 1 to avoid log(0))
df_transformed = np.log2(df + 1)

# Number of principal components requested. PCA cannot extract more components
# than min(n_samples, n_features), so the request is capped for small datasets
# instead of letting PCA raise.
N_COMPONENTS = 20
n_components = min(N_COMPONENTS, *df_transformed.shape)
if n_components < N_COMPONENTS:
    print(f"Requested {N_COMPONENTS} components but the data only supports {n_components}; using {n_components}.")

# Perform PCA
pca = PCA(n_components=n_components)
pc_scores = pca.fit_transform(df_transformed)

# Component numbers (1-based) and labels are derived from the fitted model so
# the column names and plot axes always match the number of PCs computed.
component_numbers = range(1, pca.n_components_ + 1)
pc_labels = ['PC' + str(i) for i in component_numbers]

# Save the PCs to a new dataframe - pc_df
pc_df = pd.DataFrame(data=pc_scores, columns=pc_labels)

# Variance explained by each PC and the running (cumulative) total
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

for i, cum_var in enumerate(cumulative_variance, start=1):
    print(f"Cumulative variance explained by PC{i}: {cum_var}")

# Plot the individual (bars) and cumulative (step line) explained variance
plt.figure(figsize=(10, 7))
plt.bar(component_numbers, explained_variance, alpha=0.5, align='center', label='individual explained variance')
plt.step(component_numbers, cumulative_variance, where='mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.xticks(component_numbers)
plt.legend(loc='best')
plt.tight_layout()
plt.show()

# Add the subject and visitcode columns back in front of the PC scores.
# Both frames carry the default row index from the same read, so rows line up.
pc_df = pd.concat([subject_visitcode_df, pc_df], axis=1)

# Save the pc_df as a csv file
pc_df.to_csv("/out/pcs.csv", index=False)


# for github: the next cell is a generalized version of this analysis with placeholder metadata column names

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Load the data. The dataframe may have multiple metadata columns
# (e.g., sample IDs) alongside the metabolome peaks.
raw_df = pd.read_csv("/path/to/raw/metabolomics/file.csv")

# Define metadata columns to remove (e.g., sample IDs).
# These entries are placeholders and must be replaced with real column names.
metadata_columns = ["{meta_data1}", "{meta_data2}", "...", "{meta_dataN}"]

# Take the metadata from the frame that is already in memory instead of
# reading the CSV a second time. Selecting with a boolean mask keeps the
# columns in the order they appear in the file.
metadata_df = raw_df.loc[:, raw_df.columns.isin(metadata_columns)]

# Feature matrix: everything except the metadata columns
# (raises KeyError if any listed metadata column is missing from the file).
df = raw_df.drop(metadata_columns, axis=1)

# Apply log2 transformation to the data (adding 1 to avoid log(0))
df_transformed = np.log2(df + 1)

# Number of principal components requested. PCA cannot extract more components
# than min(n_samples, n_features), so the request is capped for small datasets
# instead of letting PCA raise.
N_COMPONENTS = 20
n_components = min(N_COMPONENTS, *df_transformed.shape)
if n_components < N_COMPONENTS:
    print(f"Requested {N_COMPONENTS} components but the data only supports {n_components}; using {n_components}.")

# Perform PCA
pca = PCA(n_components=n_components)
pc_scores = pca.fit_transform(df_transformed)

# Component numbers (1-based) and labels are derived from the fitted model so
# the column names and plot axes always match the number of PCs computed.
component_numbers = range(1, pca.n_components_ + 1)
pc_labels = ['PC' + str(i) for i in component_numbers]

# Saving the PCs to a new dataframe - pc_df
pc_df = pd.DataFrame(data=pc_scores, columns=pc_labels)

# Variance explained by each PC and the running (cumulative) total
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

for i, cum_var in enumerate(cumulative_variance, start=1):
    print(f"Cumulative variance explained by PC{i}: {cum_var}")

# Plot the individual (bars) and cumulative (step line) explained variance
plt.figure(figsize=(10, 7))
plt.bar(component_numbers, explained_variance, alpha=0.5, align='center', label='individual explained variance')
plt.step(component_numbers, cumulative_variance, where='mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.xticks(component_numbers)
plt.legend(loc='best')
plt.tight_layout()
plt.show()

# Add the metadata columns back in front of the PC scores.
# Both frames carry the default row index from the same read, so rows line up.
pc_df = pd.concat([metadata_df, pc_df], axis=1)

# Saving the pc_df as a CSV file
pc_df.to_csv("/out/pcs.csv", index=False)
