In [None]:
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Feature family mapping

In [None]:
# Read the feature-family table (indexed by its "Feature" column), keep the three
# columns used in this notebook and store "Family" as a categorical.
feature_family_df = (
    pd.read_excel("Supplementary_Table_S2.xlsx", index_col="Feature")
    .loc[:, ["Feature_ID", "Family", "Type"]]
    .astype({"Family": "category"})
)
# Cell output: the family categories found in the table.
feature_family_df.Family.cat.categories

In [None]:
# Explicit ordering of the family categories.  reorder_categories expects this list to
# hold exactly the categories already present in the column.
family_order = [
    'Aggregation', 'LowFolding', 'AlphaHelix', 'BetaSheet',
    'Burial', 'Disorder', 'Hydrophobicity', 'Membrane', 'Turn',
    'NucleicAcidBinding', 'Phenomenological',
    'Secondary_Structure_Full', 'Secondary_Structure_Ext', 'Geometry',
    'Charge', 'AlphaFold_Disorder',
    'RNABinding',
]
feature_family_df["Family"] = feature_family_df["Family"].cat.reorder_categories(family_order)

# Lookup table: feature name (the table index) -> Feature_ID.
feature_mapping_dict = dict(feature_family_df["Feature_ID"].items())

# Load the results of the features vs iterations analysis

In [None]:
# Tab-separated results table without a header row, so columns are numbered 0, 1, 2, ...
res = pd.read_csv("results.txt", sep="\t", header=None)

In [None]:
# For every row of `res`, count the non-missing entries from column 6 onward
# (those columns hold the selected feature names).
sel_ft = [len(res.iloc[row, 6:].dropna()) for row in range(len(res))]

## Plot the results

In [None]:
import matplotlib
matplotlib.rcParams.update({'font.size': 14})

# Three panels built from `res`: column 0 is plotted as the iteration, columns 2 and 3 as
# the train/test AUROC and column 4 as the size of the feature pool.
fig, ax = plt.subplots(1, 3, figsize=(14, 4))

# Panels 0 and 1 show the same two AUROC series against different x columns.
for panel, x_column, x_label in ((ax[0], 0, "Iteration"), (ax[1], 4, "Size of feature pool")):
    panel.set_xlabel(x_label)
    panel.set_ylabel("AUROC")
    panel.scatter(res[x_column], res[2], s=10, lw=3, label="Train")
    panel.scatter(res[x_column], res[3], s=10, lw=3, label="Test")
ax[1].legend(loc=0)

# Panel 2: number of selected features per iteration.
ax[2].set_xlabel("Iteration")
ax[2].set_ylabel("Number of selected features")
ax[2].scatter(res[0], sel_ft, s=10, lw=3)

fig.tight_layout()
# Save and close as two separate statements: the former `savefig(...), close()` tuple
# was an expression, so the notebook displayed `(None, None)` as the cell output.
fig.savefig("Rec_Ft_Elim.pdf", bbox_inches="tight")
plt.close(fig)

In [None]:
# Training-set table; the first CSV column is used as the row index.
X = pd.read_csv("../DATASETS/TrainSet_data.csv", index_col=0)
# First 82 columns of the table.
# NOTE(review): presumably the physico-chemical features, going by the variable name — confirm.
X_phys_chem = X.iloc[:, 0:82]

In [None]:
# Correlation clustermap input: a relabelled copy of X whose column names are translated
# through feature_mapping_dict, followed by the pairwise correlation of all columns.
X_new = X.set_axis(X.columns.map(feature_mapping_dict), axis=1)
corrmat1 = X_new.corr()

In [None]:
import seaborn as sns
# Clustered heatmap of the full correlation matrix (complete linkage, colour scale
# fixed to [-1, 1]).  `fig` holds the object returned by sns.clustermap.
fig = sns.clustermap(
    corrmat1,
    method="complete",
    cmap='Spectral_r',
    annot=False,
    yticklabels=True,
    xticklabels=True,
    annot_kws={"size": 6},
    vmin=-1,
    vmax=1,
    figsize=(25, 25),
)
fig.savefig("Clustermap_New.pdf")
plt.show()

## Correlation matrices between iterations

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
import plotly.graph_objects as go
from matplotlib.backends.backend_pdf import PdfPages

# Enlarge seaborn's font scaling for the figures drawn below (the stacked-area cells
# later call sns.reset_defaults()).
sns.set(font_scale=1.4)

# Function to compute correlation matrix and plot using seaborn
def plot_correlation_matrix(matrix, iteration_labels, selected_features):
    """Save one clustered correlation heatmap per pair of consecutive iterations.

    For each i, the features selected at iteration i (rows) are correlated with the
    features selected at iteration i + 1 (columns); the clustermap of that block is
    appended as a page of "Corr_Matrices_Iterations_New.pdf".

    Parameters
    ----------
    matrix : pandas.DataFrame
        Data table whose column labels contain every name in `selected_features`.
    iteration_labels : sequence
        One label per iteration; used for the page titles and to set the number of pages.
    selected_features : sequence of list
        `selected_features[i]` lists the column labels selected at iteration i.

    Notes
    -----
    Reads the module-level `res` table: the column-2 values of rows i and i + 1 are
    printed and appended to each page title.
    """
    with PdfPages("Corr_Matrices_Iterations_New.pdf") as pdf:
        for i in range(len(iteration_labels) - 1):
            features_i = selected_features[i]
            features_j = selected_features[i + 1]

            # Correlate over the de-duplicated union of both feature sets.  Concatenating
            # the two column selections instead yields repeated column labels whenever a
            # feature is selected in both iterations, and a label-based .loc on repeated
            # labels returns every match, i.e. duplicated rows/columns in the heatmap.
            pooled_features = list(dict.fromkeys([*features_i, *features_j]))
            corr_matrix = matrix.loc[:, pooled_features].corr().loc[features_i, features_j]

            score_i, score_j = res.iloc[i, 2], res.iloc[i + 1, 2]
            print(score_i, score_j)

            sns.clustermap(corr_matrix, cmap='coolwarm', annot=False, fmt=".2f",
                           yticklabels=True, xticklabels=True, figsize=(30, 30))
            # Pure f-string formatting: combining an f-string with the % operator breaks
            # as soon as an interpolated label contains a '%' character.
            plt.title(
                f'Correlation Matrix: Iteration {iteration_labels[i]} to {iteration_labels[i + 1]} '
                f'{score_i:.2f} {score_j:.2f}',
                fontsize=32)
            pdf.savefig()
            # Close each figure once it is saved so they do not pile up in memory.
            plt.close()




# Example usage
matrix_path = "../DATASETS/TrainSet_data.csv"
iteration_labels = list(res[0])

# Training table with missing values replaced by 1.0 and columns relabelled
# through feature_mapping_dict.
matrix = pd.read_csv(matrix_path, index_col=0).fillna(1.0)
matrix.columns = matrix.columns.map(feature_mapping_dict)

# Per row of `res`: the non-missing entries from column 6 onward, first as stored in the
# file and then translated through feature_mapping_dict.
selected_per_row = [res.iloc[row, 6:].dropna() for row in range(len(res))]
selected_features = [list(names) for names in selected_per_row]
selected_features_mapped = [list(names.map(feature_mapping_dict)) for names in selected_per_row]

plot_correlation_matrix(matrix, iteration_labels, selected_features_mapped)

## Correlation Graph of features vs iterations

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
import plotly.graph_objects as go
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
from ete3 import Tree

# Function to plot clustering tree with arrows based on correlation
def plot_clustering_tree_with_arrows(matrix, iteration_labels, selected_features, correlation_threshold=0.8):
    """Build a graph linking strongly correlated features of consecutive iterations.

    Nodes are named "<iteration label>_<feature>".  An edge joins a feature of
    iteration i to a feature of iteration i + 1 when the absolute value of their
    correlation exceeds `correlation_threshold`; the signed correlation is stored as
    the edge's "weight".  Despite the name, nothing is drawn here: the graph is only
    built and returned.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Data table whose column labels contain every name in `selected_features`.
    iteration_labels : sequence
        One label per iteration, used as the node-name prefix.
    selected_features : sequence of list
        `selected_features[i]` lists the column labels selected at iteration i.
    correlation_threshold : float, default 0.8
        Strict lower bound on |correlation| for an edge to be added.

    Returns
    -------
    networkx.Graph
        Undirected graph holding only the nodes that take part in at least one edge.
    """
    G = nx.Graph()
    print(len(iteration_labels))

    for i in range(len(iteration_labels) - 1):
        print(i)
        features_i = selected_features[i]
        features_j = selected_features[i + 1]

        # Correlate over the de-duplicated union of both feature sets.  Concatenating the
        # two column selections instead repeats the label of every feature selected in
        # both iterations; the scalar lookup below then returns a Series/DataFrame and
        # the `if` raises "truth value ... is ambiguous".
        pooled_features = list(dict.fromkeys([*features_i, *features_j]))
        correlation_matrix = matrix.loc[:, pooled_features].corr().loc[features_i, features_j]

        for feature_i in features_i:
            for feature_j in features_j:
                correlation = correlation_matrix.loc[feature_i, feature_j]

                # A NaN correlation compares False, so it never creates an edge.
                if abs(correlation) > correlation_threshold:
                    G.add_edge(f'{iteration_labels[i]}_{feature_i}', f'{iteration_labels[i+1]}_{feature_j}',
                               weight=correlation)

    return G

# Example usage

matrix_path = "../DATASETS/TrainSet_data.csv"
iteration_labels = list(res[0])

# Training table with missing values replaced by 1.0 and columns relabelled
# through feature_mapping_dict.
matrix = pd.read_csv(matrix_path, index_col=0).fillna(1.0)
matrix.columns = matrix.columns.map(feature_mapping_dict)

# Per row of `res`: the non-missing entries from column 6 onward, first as stored in the
# file and then translated through feature_mapping_dict.
selected_per_row = [res.iloc[row, 6:].dropna() for row in range(len(res))]
selected_features = [list(names) for names in selected_per_row]
selected_features_mapped = [list(names.map(feature_mapping_dict)) for names in selected_per_row]

# Graph of strongly correlated features between consecutive iterations.
G = plot_clustering_tree_with_arrows(matrix, iteration_labels, selected_features_mapped)

In [None]:
# Add every selected feature as a node so that features without any edge still appear
# in the graph.  Node names use the same "<iteration label>_<feature>" scheme as the
# edges built from `iteration_labels`; a separate 0-based counter only matches those
# names when the labels themselves happen to be 0, 1, 2, ...
# The loop variable is deliberately not called `sel_ft`, which holds the per-iteration
# feature counts used by the results figure.
for iteration_label, iteration_features in zip(iteration_labels, selected_features_mapped):
    for ft in iteration_features:
        # add_node leaves an already-present node untouched, so no membership test is needed.
        G.add_node(f"{iteration_label}_{ft}")

In [None]:
# Cell output: the node view of G, for a quick visual check of the node names.
G.nodes

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the graph into a directed graph before computing the layout.
G = nx.DiGraph(G)

# Seeded spring layout; below, only the node order of `pos` is used to build the grid.
pos = nx.spring_layout(G, seed=42)

# Bucket the nodes by the part of their name before the first underscore.
grouped_nodes = {}
for node, position in pos.items():
    grouped_nodes.setdefault(node.split('_')[0], []).append((node, position))

# Order the buckets by the integer value of their prefix (the second key element would
# only be consulted if two prefixes had the same integer value).
sorted_groups = sorted(grouped_nodes.items(), key=lambda x: (int(x[0]), x[1]))

# One column per bucket, spaced x_increment apart.
x_increment = 0.1
x_offsets = {prefix: column * x_increment for column, (prefix, members) in enumerate(sorted_groups)}

# Inside a column the nodes are spaced y_increment apart and centred on y = 0.5.
y_increment = 0.2
y_offsets = {}
for prefix, members in sorted_groups:
    num_nodes = len(members)
    for slot, (node, position) in enumerate(members):
        y_offsets[node] = (slot - (num_nodes - 1) / 2) * y_increment + 0.5

# Final (x, y) per node, plus the prefix of each node in the same order.
new_positions = {}
node_colors = []
for prefix, members in sorted_groups:
    for node, position in members:
        new_positions[node] = np.array([x_offsets[prefix], y_offsets[node]])
        node_colors.append(prefix)

# Encode the prefixes as integers with LabelEncoder.
label_encoder = LabelEncoder()
numeric_colors = label_encoder.fit_transform(node_colors)

In [None]:
# Draw the graph on the precomputed grid positions and write it to a PDF.
plt.figure(figsize=(40, 40))
nx.draw(G, new_positions, with_labels=True, arrows=False, node_size=800, font_size=11, font_color="black",node_color="skyblue",
        font_weight="bold", width=1, alpha=0.7, edge_color="gray")

# Save and close as two separate statements: the former `savefig(...), close()` tuple
# was an expression, so the notebook displayed `(None, None)` as the cell output.
plt.savefig("Features_vs_Iteration_Graph_corr_08.pdf", bbox_inches="tight")
plt.close()

In [None]:
# Per iteration, keep only the selected features that have a row in the feature-family table.
# NOTE(review): this cell used to filter against `families_df.Feature`, but no `families_df`
# is defined anywhere in this notebook, so it raised NameError on a fresh kernel.  The
# feature names are the index of `feature_family_df` (it is loaded with
# index_col="Feature"), which is used here instead — confirm this is the intended table.
known_features = set(feature_family_df.index)

# The loop variable is deliberately not called `sel_ft`, which holds the per-iteration
# feature counts used by the results figure.
new_selected_features = [
    [feature for feature in iteration_features if feature in known_features]
    for iteration_features in selected_features
]

In [None]:
# Lookup table built from the feature-family table: Feature_ID -> Family.
families = dict(zip(feature_family_df['Feature_ID'], feature_family_df['Family']))

## Stacked Area Chart

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

sns.reset_defaults()
# Selected features (as Feature_IDs) for every iteration
iteration_data = selected_features_mapped

# Work on a copy of the feature-family table
feature_families_df = feature_family_df.copy()

# Feature_ID -> family lookup, then translate every selected feature into its family
feature_mapping = dict(zip(feature_families_df['Feature_ID'], feature_families_df['Family']))
mapped_iteration_data = [[feature_mapping[feature] for feature in iteration] for iteration in iteration_data]

# Count the features of each family at each iteration
iteration_counts = {}

for i, iteration in enumerate(mapped_iteration_data, start=1):
    # Register the iteration even when it selected nothing, so it still gets a row
    # (all zeros after fillna) and the rows stay aligned with the iteration numbers.
    family_counts = iteration_counts.setdefault(f'Iteration {i}', {})
    for family in iteration:
        family_counts[family] = family_counts.get(family, 0) + 1

# Rows = iterations, columns = families; a family absent from an iteration counts as 0
stacked_data = pd.DataFrame(iteration_counts).T.fillna(0)

# Plot the stacked area chart
fig, ax = plt.subplots()

# Number the rows 1..n from the data itself; a hard-coded range of 15 raises a
# length-mismatch error for any other number of iterations.
stacked_data.index = np.arange(1, len(stacked_data) + 1)
# One tab20 colour per family column
colors = plt.cm.tab20.colors[:stacked_data.shape[1]]
stacked_data.plot(kind='area', stacked=True, color=colors, ax=ax)
# Customize the plot
ax.set_title('Stacked Area Chart of Feature Families')
ax.set_xlabel('Iteration')
ax.set_ylabel('Count')
ax.legend(title='Family', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.savefig("stacked_area_chart_feature_families_vs_iteration.pdf",bbox_inches="tight")
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

sns.reset_defaults()
# Express each iteration's family counts as a percentage of that iteration's total
stacked_data_percentage=stacked_data.div(stacked_data.sum(axis=1), axis=0) * 100

# Plot the stacked area chart
fig, ax = plt.subplots()

# Number the rows 1..n from the data itself; a hard-coded range of 15 raises a
# length-mismatch error for any other number of iterations.
stacked_data_percentage.index = np.arange(1, len(stacked_data_percentage) + 1)
# One tab20 colour per family column
colors = plt.cm.tab20.colors[:stacked_data_percentage.shape[1]]
stacked_data_percentage.plot(kind='area', stacked=True, color=colors, ax=ax)
# Customize the plot
ax.set_title('Stacked Area Chart of Feature Families')
ax.set_xlabel('Iteration')
ax.set_ylabel('Percentage')
ax.legend(title='Family', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.savefig("stacked_area_chart_feature_families_vs_iteration_percentage.pdf",bbox_inches="tight")
plt.show()

In [None]:
# Pair each family column of the stacked chart with the colour it was drawn in.
family_colors = {family: color for family, color in zip(stacked_data.columns, colors)}