In [None]:
# Re-loading and preprocessing the dataset as per the notebook's initial steps
import pandas as pd

# Load the first dataset.
# The first column of the TSV holds the row identifiers, so it is read straight
# into the index; after the transpose those identifiers become the column
# labels and every remaining TSV column becomes one row.
# Reading the identifiers as the index (rather than transposing them together
# with the counts and peeling them off as a header row afterwards) keeps them
# out of the data block, so the transpose can neither upcast the identifiers
# nor degrade the counts to `object` dtype.
data_path = '/mnt/data/GSE218462_raw_counts_GRCh38.p13_NCBI.tsv'
df = pd.read_csv(data_path, sep='\t', index_col=0).T

# Convert all data to numeric for PCA analysis.
# NOTE: errors='coerce' turns any unparseable entry into NaN, and fillna(0)
# then replaces it with 0 -- malformed values are zeroed, not reported.
df = df.apply(pd.to_numeric, errors='coerce').fillna(0)

# Standardize every column of df, mirroring the initial notebook.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on df, then apply it to the very same data.
scaler = StandardScaler().fit(df)
scaled_data = scaler.transform(df)

# Wrap the scaled values back into a DataFrame carrying df's row/column labels.
scaled_df = pd.DataFrame(scaled_data, index=df.index, columns=df.columns)

# Label each row of scaled_df: 0 if its index value is one of the listed
# unedited samples, 1 otherwise (same setup as the initial notebook).
unedited = [
    'GSM6745632',
    'GSM6745633',
    'GSM6745634',
    'GSM6745635',
    'GSM6745636',
    'GSM6745637',
]
# isin() yields True for unedited rows; invert it and cast so the flag is 0/1.
scaled_df['Edited'] = (~scaled_df.index.isin(unedited)).astype('int64')

# Principal component analysis of the standardized data, following the notebook.
from sklearn.decomposition import PCA

# The 'Edited' flag is a label, not a feature: work on a frame without it.
pca_df = scaled_df.drop('Edited', axis=1)

# No n_components is passed, so the number of components is left to
# scikit-learn's default rather than being capped here.
pca = PCA()
pca_result = pca.fit_transform(pca_df)

# Share of variance explained by each component, and its running total.
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = explained_variance_ratio.cumsum()

# Gather the component scores into a DataFrame with columns PC1, PC2, ...
# and append the 'Edited' flag as the last column. The flag is attached by
# position (plain array), since principalDf uses a default integer index.
principalDf = pd.DataFrame(
    pca_result,
    columns=[f'PC{k}' for k in range(1, pca_result.shape[1] + 1)],
).assign(Edited=scaled_df['Edited'].to_numpy())

# Cell output: a preview of the scores and the cumulative explained variance
# of the first (up to) ten components, for the plotting steps that follow.
(principalDf.head(), cumulative_variance_ratio[:10])