In [None]:
# Import libraries. 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from kneed import KneeLocator

# Dimension reduction tools
from sklearn.decomposition import PCA as PCA
from sklearn.manifold import TSNE
import umap 

In [None]:
# Colour palette: one fixed hex colour per population label, so every plot
# that uses this mapping draws a given population in the same colour.

# Oryx datasets
custom_palette = dict(
    EAD_A="#cada45",  # green
    EAD_B="#d4a2e1",  # purple
    EEP="#55e0c6",    # blue
    USA="#f0b13c",    # orange
)

# For Galapagos datasets: TODO - no palette is defined in this cell yet.

In [None]:
# Load the inputs: population metadata and the covariance matrix.

# Population table: tab-separated, column names taken from the first row.
population_names = pd.read_csv(
    'input_files/oryx_pop_info_sorted_46_final.txt',
    sep='\t',
    header=0,
)

# Covariance matrix: space-separated values with no header row.
filename = 'input_files/oryx_6xyh_1K.cov'
cov_mat = pd.read_csv(
    filename,
    sep=' ',
    header=None,
)

# Data_Struct is a second name for the same DataFrame object as
# population_names (no copy is made), so changes made through one name are
# visible through the other.
Data_Struct = population_names

# Label passed to plot_scree as the plot title / output file stem.
filenameforplot = 'SO_6x'

In [None]:
# Function to plot the scree plot
def plot_scree(explained_variance,filename_title,elbow_point):
    """Save a scree plot of per-component explained variance as a PDF.

    The figure is written to ``'<filename_title>.pdf'`` in the current working
    directory and then closed, so nothing is displayed in the notebook.

    Parameters
    ----------
    explained_variance : iterable of numbers
        Variance explained by each component, in component order. Plotted
        against component numbers 1..N.
    filename_title : str
        Used both as the plot title and as the stem of the output PDF name.
    elbow_point : number or None
        x position of the elbow. A red dashed vertical line and an
        "Elbow point: ..." label are drawn there. When ``None`` (no elbow
        was found) the marker and label are skipped instead of raising
        ``TypeError`` on ``elbow_point + 0.1``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``explained_variance`` is empty. This is checked before a figure
        is created, so no open figure is left behind.
    """
    # Materialise once: the values are needed for len(), plotting and max().
    explained_variance = list(explained_variance)
    if not explained_variance:
        raise ValueError('explained_variance must contain at least one value')

    # Explicit figure/axes instead of the implicit pyplot "current figure",
    # so this function never draws on a figure that was opened elsewhere.
    fig, ax = plt.subplots(figsize=(8, 5))
    try:
        component_numbers = range(1, len(explained_variance) + 1)
        ax.plot(component_numbers, explained_variance, marker='o', linestyle='--')

        if elbow_point is not None:
            ax.axvline(x=elbow_point, color='r', linestyle='--')
            # Label sits just right of the line, at 90% of the tallest value.
            ax.text(elbow_point + 0.1, max(explained_variance) * 0.9,
                    f'Elbow point: {elbow_point}', color='red',
                    verticalalignment='center', fontsize=18)

        ax.set_title(f'{filename_title}', fontsize=18)
        ax.set_xlabel('Number of Components', fontsize=18)
        ax.set_ylabel('Variance Explained', fontsize=18)
        # grid(True) switches the grid on; a bare grid() call only toggles the
        # current state, which depends on the active style settings.
        ax.grid(True)
        # Adjust tick label size
        ax.tick_params(axis='both', labelsize=18)

        # Save the plot as a PDF file
        pdf_filename = f'{filename_title}.pdf'
        fig.savefig(pdf_filename, format='pdf')
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)

# Function to find the elbow point
def find_elbow_point(explained_variance, sensitivity=1.0):
    """Locate the elbow of a scree curve with kneed's ``KneeLocator``.

    Parameters
    ----------
    explained_variance : iterable of numbers
        Explained variance per component, in component order. Component
        numbers 1..N are used as the x values.
    sensitivity : float, default 1.0
        Passed to ``KneeLocator`` as its ``S`` argument.

    Returns
    -------
    The ``elbow`` attribute of the fitted ``KneeLocator``.
    NOTE(review): per the kneed documentation this is the x value of the
    elbow, or ``None`` when none is detected - callers should handle ``None``.
    """
    variances = list(explained_variance)
    component_numbers = range(1, len(variances) + 1)

    # The curve is modelled as convex and decreasing, and interpolated with a
    # polynomial fit before the elbow is searched for.
    locator = KneeLocator(
        component_numbers,
        variances,
        curve='convex',
        direction='decreasing',
        S=sensitivity,
        interp_method='polynomial',
    )
    return locator.elbow


# Calculate PCA from the covariance matrix loaded above.

# Convert the covariance DataFrame to a plain NumPy array.
cov_mat_np = cov_mat.to_numpy()

# Eigen-decomposition.
# np.linalg.eigh is NumPy's solver for symmetric matrices and always returns
# real eigenvalues/eigenvectors. The general solver np.linalg.eig can return
# complex arrays, which would leak into the sorting, the explained-variance
# table and the scree plot. The matrix is averaged with its transpose first so
# the result does not depend on which triangle eigh reads.
# NOTE(review): assumes cov_mat is a square, (near-)symmetric covariance
# matrix - confirm for the input file. Eigenvector signs are arbitrary and may
# differ from those a previous np.linalg.eig run produced (mirrored axes).
cov_symmetric = (cov_mat_np + cov_mat_np.T) / 2
eigen_vals, eigen_vecs = np.linalg.eigh(cov_symmetric)

# (|eigenvalue|, eigenvector) pairs, largest magnitude first.
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i]) for i in range(len(eigen_vals))]
eigen_pairs.sort(key=lambda k: k[0], reverse=True)

# Eigenvectors as columns, in the magnitude order established above.
feature_vector = np.column_stack([vector for _, vector in eigen_pairs])
# Covariance matrix multiplied by the ordered eigenvectors.
principal_components = cov_mat_np.dot(feature_vector)

# Eigenvalues/eigenvectors sorted from largest to smallest eigenvalue.
# This ordering uses the signed values, whereas eigen_pairs above uses
# magnitudes; the two agree when no eigenvalue is negative.
idx = eigen_vals.argsort()[::-1]
eigenValues = eigen_vals[idx]
eigenVectors = eigen_vecs[:, idx]

# Fraction of the eigenvalue total carried by each component, and its running sum.
expl_pre = eigenValues / eigenValues.sum()
expl = np.cumsum(expl_pre)

# Per-component and cumulative explained variance in percent, indexed by
# component number starting at 1.
expl_df = pd.DataFrame(
    {
        'explained_variance': expl_pre * 100,
        'cumulative_expl': expl * 100,
    },
    index=np.arange(1, eigenVectors.shape[0] + 1),
)

# Find the elbow point of the scree curve.
elbow_point = find_elbow_point(expl_df['explained_variance'])
print("Optimal number of principal components:", elbow_point)

# Save the scree plot; plot_scree writes '<filenameforplot>.pdf'.
plot_scree(expl_df['explained_variance'], filenameforplot, elbow_point)

In [None]:
#to generate FigS1
# Converts 'panelA.pdf' to PNG: every item returned by convert_from_path is
# saved as 'panelA_page<i>.png', with i starting at 0.
# NOTE(review): no code visible in this section writes 'panelA.pdf' (the scree
# plot above is saved under the name held in filenameforplot) - presumably the
# file is produced elsewhere; confirm it exists before running this cell.
from pdf2image import convert_from_path

# presumably one image per PDF page - verify against the pdf2image docs
images = convert_from_path('panelA.pdf')
for i, image in enumerate(images):
    image.save(f'panelA_page{i}.png', 'PNG')