In [90]:
################################################################################
# find functional similarity (according to targeted genes) for antidepressants present in LINCS data using
# the Jaccard algorithm

# author: Ximena Fernandez
# mail:   xfdzciencias@gmail.com

################################################################################

In [91]:
################################################################################
# import modules 
################################################################################

import pandas as pd
import numpy as np
import altair as alt

In [92]:
################################################################################
# data extraction and splitting
################################################################################

## load the antidepressant (ad) - gene network table (long format) into a DataFrame
df = pd.read_csv(
    '/Users/ximenafernandezsanchez/Documents/Neurociencias/9no_semestre/PP III/Shared/filtered_long_format_with_symbols.csv'
)

## split the whole ad-gene network by the sign stored in the 'value' column:
##   value ==  1 -> top network (upregulated genes)
##   value == -1 -> bottom network (downregulated genes)
top_df = df.loc[df['value'].eq(1)]
bottom_df = df.loc[df['value'].eq(-1)]

In [93]:
################################################################################
# Jaccard similarity calculation for top network
################################################################################

## create a binary representation of targeted genes for each antidepressant (rows: ad, columns: genes);
## crosstab returns occurrence counts, so a repeated ad-gene row would yield values > 1 and distort the
## set sizes below -> clip to 1 so that every cell is a 0/1 membership flag
adjacency_matrix = pd.crosstab(top_df['Target'], top_df['Source']).clip(upper=1)

## creates a Jaccard similarity matrix, J(A, B) = |A ∩ B| / |A ∪ B|, vectorized over all antidepressant pairs
### number of genic targets shared per antidepressant pair (rows: ad, columns: ad)
intersection = adjacency_matrix.dot(adjacency_matrix.T)

### number of unique genic targets per antidepressant pair: |A ∪ B| = |A| + |B| - |A ∩ B|
genes_per_ad = adjacency_matrix.sum(axis=1).to_numpy()
union = genes_per_ad[:, None] + genes_per_ad[None, :] - intersection

### Jaccard similarity index per antidepressant pair (rows: ad, columns: ad)
jaccard_matrix_values = intersection / union

## fills the main diagonal with value = 1 (Jaccard similarity for each ad compared to itself);
## the fill is done on an explicit copy because the array behind DataFrame.values is not guaranteed to be
## writable (it is read-only when pandas Copy-on-Write is active)
jaccard_values = jaccard_matrix_values.to_numpy(dtype=float, copy=True)
np.fill_diagonal(jaccard_values, 1.0)

## creates a df object with the Jaccard similarity matrix; index and columns each get their own copy of the
## antidepressant labels so that naming the axes does not rename adjacency_matrix's index as a side effect
jaccard_top_matrix = pd.DataFrame(
    jaccard_values,
    index=adjacency_matrix.index.copy(),
    columns=adjacency_matrix.index.copy(),
)
jaccard_top_matrix.index.name = 'node_2'
jaccard_top_matrix.columns.name = 'node_1'

## export Jaccard similarity matrix to a csv file for further analysis
jaccard_top_matrix.to_csv('jaccard_top_matrix.csv')

print(jaccard_top_matrix)

node_1           alaproclate  amitriptyline  amoxapine  bifemelane  bupropion  \
node_2                                                                          
alaproclate         1.000000       0.007194   0.003509    0.003636   0.003676   
amitriptyline       0.007194       1.000000   0.016949    0.000000   0.025000   
amoxapine           0.003509       0.016949   1.000000    0.003390   0.081181   
bifemelane          0.003636       0.000000   0.003390    1.000000   0.014337   
bupropion           0.003676       0.025000   0.081181    0.014337   1.000000   
clomipramine        0.000000       0.010753   0.032258    0.014599   0.061776   
desipramine         0.018587       0.017668   0.000000    0.017921   0.029304   
dibenzepin          0.000000       0.000000   0.049430    0.000000   0.003817   
dosulepin           0.032154       0.015152   0.002941    0.027950   0.031447   
doxepin             0.000000       0.003597   0.021505    0.026119   0.000000   
duloxetine          0.000000

In [94]:
################################################################################
# data visualization for top network
################################################################################

## converts Jaccard df to long format (one row per antidepressant pair) to plot heatmap;
## melt() puts the id column ('node_2', the row labels) first and the former column labels second, so the
## output columns are named through melt itself -- renaming them by position would swap the two labels
jaccard_long_df = jaccard_top_matrix.reset_index().melt(
    id_vars='node_2',
    var_name='node_1',
    value_name='Jaccard',
)

## rounds Jaccard similarity values to 3 decimals so the number printed in each cell stays readable
jaccard_long_df['Jaccard'] = jaccard_long_df['Jaccard'].round(3)

## creates and displays a heatmap object
### x/y encoding shared by the colored cells and the text labels
base = alt.Chart(jaccard_long_df).encode(
    alt.X('node_1:O', title=None),
    alt.Y('node_2:O', title=None)
)

### colored cells, one per antidepressant pair
heatmap = base.mark_rect().encode(
    alt.Color('Jaccard:Q', scale=alt.Scale(scheme='viridis'), legend=alt.Legend(title=None)),
    tooltip=['node_1:N', 'node_2:N', 'Jaccard:Q']
).properties(
    title='Jaccard similarity of antidepressants based on targeted genes for top network',
    width=1000,
    height=900
)

### numeric label per cell; label color switches at 0.5 (black above, white otherwise)
text = base.mark_text(baseline='middle').encode(
    text='Jaccard:Q',
    color=alt.condition(
        alt.datum.Jaccard > 0.5,
        alt.value('black'),
        alt.value('white')
    )
)

heatmap + text

In [95]:
################################################################################
# Jaccard similarity calculation for bottom network
################################################################################

## create a binary representation of targeted genes for each antidepressant (rows: ad, columns: genes);
## crosstab returns occurrence counts, so a repeated ad-gene row would yield values > 1 and distort the
## set sizes below -> clip to 1 so that every cell is a 0/1 membership flag
adjacency_matrix = pd.crosstab(bottom_df['Target'], bottom_df['Source']).clip(upper=1)

## creates a Jaccard similarity matrix, J(A, B) = |A ∩ B| / |A ∪ B|, vectorized over all antidepressant pairs
### number of genic targets shared per antidepressant pair (rows: ad, columns: ad)
intersection = adjacency_matrix.dot(adjacency_matrix.T)

### number of unique genic targets per antidepressant pair: |A ∪ B| = |A| + |B| - |A ∩ B|
genes_per_ad = adjacency_matrix.sum(axis=1).to_numpy()
union = genes_per_ad[:, None] + genes_per_ad[None, :] - intersection

### Jaccard similarity index per antidepressant pair (rows: ad, columns: ad)
jaccard_matrix_values = intersection / union

## fills the main diagonal with value = 1 (Jaccard similarity for each ad compared to itself);
## the fill is done on an explicit copy because the array behind DataFrame.values is not guaranteed to be
## writable (it is read-only when pandas Copy-on-Write is active)
jaccard_values = jaccard_matrix_values.to_numpy(dtype=float, copy=True)
np.fill_diagonal(jaccard_values, 1.0)

## creates a df object with the Jaccard similarity matrix; index and columns each get their own copy of the
## antidepressant labels so that naming the axes does not rename adjacency_matrix's index as a side effect
jaccard_bottom_matrix = pd.DataFrame(
    jaccard_values,
    index=adjacency_matrix.index.copy(),
    columns=adjacency_matrix.index.copy(),
)
jaccard_bottom_matrix.index.name = 'node_2'
jaccard_bottom_matrix.columns.name = 'node_1'

## export Jaccard similarity matrix to a csv file for further analysis
jaccard_bottom_matrix.to_csv('jaccard_bottom_matrix.csv')

print(jaccard_bottom_matrix)

node_1           alaproclate  amitriptyline  amoxapine  bifemelane  bupropion  \
node_2                                                                          
alaproclate         1.000000       0.013514   0.021661    0.003497   0.036101   
amitriptyline       0.013514       1.000000   0.022305    0.077220   0.125000   
amoxapine           0.021661       0.022305   1.000000    0.003831   0.023438   
bifemelane          0.003497       0.077220   0.003831    1.000000   0.068273   
bupropion           0.036101       0.125000   0.023438    0.068273   1.000000   
clomipramine        0.313901       0.025180   0.042802    0.014925   0.018727   
desipramine         0.043636       0.014545   0.031496    0.015267   0.035019   
dibenzepin          0.013986       0.068182   0.019231    0.018939   0.018939   
dosulepin           0.058621       0.075540   0.014388    0.401961   0.091603   
doxepin             0.237918       0.269531   0.006536    0.283951   0.000000   
duloxetine          0.000000

In [96]:
################################################################################
# data visualization for bottom network
################################################################################

## converts Jaccard df to long format (one row per antidepressant pair) to plot heatmap;
## melt() puts the id column ('node_2', the row labels) first and the former column labels second, so the
## output columns are named through melt itself -- renaming them by position would swap the two labels
jaccard_long_df = jaccard_bottom_matrix.reset_index().melt(
    id_vars='node_2',
    var_name='node_1',
    value_name='Jaccard',
)

## rounds Jaccard similarity values to 3 decimals so the number printed in each cell stays readable
jaccard_long_df['Jaccard'] = jaccard_long_df['Jaccard'].round(3)

## creates and displays a heatmap object
### x/y encoding shared by the colored cells and the text labels
base = alt.Chart(jaccard_long_df).encode(
    alt.X('node_1:O', title=None),
    alt.Y('node_2:O', title=None)
)

### colored cells, one per antidepressant pair
heatmap = base.mark_rect().encode(
    alt.Color('Jaccard:Q', scale=alt.Scale(scheme='viridis'), legend=alt.Legend(title=None)),
    tooltip=['node_1:N', 'node_2:N', 'Jaccard:Q']
).properties(
    title='Jaccard similarity of antidepressants based on targeted genes for bottom network',
    width=1000,
    height=900
)

### numeric label per cell; label color switches at 0.5 (black above, white otherwise)
text = base.mark_text(baseline='middle').encode(
    text='Jaccard:Q',
    color=alt.condition(
        alt.datum.Jaccard > 0.5,
        alt.value('black'),
        alt.value('white')
    )
)

heatmap + text