# The geometry of hidden representations in protein language models

### Import libraries

In [2]:
import sys
sys.path.insert(0,'src/')

import numpy as np
import os

import plotly.graph_objects as go
from plotly.graph_objects import Layout

#from intrinsic_dimension import block_analysis, save_ID_results, plot_curve_ID
from neighborhood_overlap import mapping, get_data, overlap_label, overlap_layer, plot_no, update_figure



## 1. Intrinsic Dimension

### Download ProteinNet data and estimate ID

In [6]:
# Model to analyse; it must be a key of `mapping`
# (e.g. 'esm1b', 'esm1v', 'ProtBert', 'ProtT5', 'esm2-...').
model = 'esm1b'

# Mapping entry for the selected model.
# NOTE(review): the name `map` shadows the Python builtin; it is kept because
# the plotting cell further down reads it.
map = mapping[model]

# One empty list per representation id listed in map[2]; meant to collect
# the ID estimates of that representation.
reps_id = {rep_id: [] for rep_id in map[2]}

# Input/output locations depend on which machine the notebook runs on.
# NOTE(review): res_path is only assigned when device is not 'orfeo'.
device = 'lucrezia'
if device != 'orfeo':
    input_path = '/Users/lucreziavaleriani/Desktop/'
    res_path = '/Users/lucreziavaleriani/Desktop/'
else:
    input_path = '/Users/lucreziavaleriani/Desktop/mount_orfeo/data_repo/pdist/' + model


In [7]:
# Load the 'pdist_pnet_rep<rep>.npy' file of every representation from
# input_path.
# NOTE: every ID-estimation step below is currently commented out, so this
# loop only loads each matrix and reps_id keeps its empty lists.
for rep in reps_id:
    rep_file = f'pdist_pnet_rep{rep!s}.npy'
    dist_mat = np.load(os.path.join(input_path, rep_file))

    # Disabled: 2NN estimate.
    # d = data.Data(dist_mat)
    # print('start')
    # id_twoNN_noisy, _, r_noisy = d.compute_id_2NN()

    # Disabled: block analysis, optional saving of the results, and storing
    # the mean ID in reps_id.
    #dim,std,n_point = block_analysis(dist_mat, blocks=list(range(1, 21)), fraction=0.9)
    #esm1b = save_ID_results('id_0.csv',dim,std,n_point)

    #reps_id[rep].append(np.mean(dim))
    


start


### Plot curve(s)

In [24]:
# Empty figure with a transparent plot background for the ID curve(s).
layout = Layout(plot_bgcolor='rgba(0,0,0,0)')
fig = go.Figure(layout=layout)

# NOTE(review): plot_curve_ID is not imported in the visible cells (its import
# from intrinsic_dimension is commented out at the top of the notebook), so
# this call raises NameError unless that import is restored.
n_reps = len(map[2])
plot_curve_ID(fig, n_reps / (n_reps - 1), reps_id.values(), map[1], map[0])

# x axis: fixed ticks every 0.2, labelled 'relative depth'.
fig.update_xaxes(
    showline=True,
    linewidth=1,
    linecolor='black',
    showticklabels=True,
    tickmode='array',
    tickvals=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
    range=[-0.01, 1.05],
    title='relative depth',
    ticks='outside',
)

# y axis: even ticks from 0 to 24, labelled 'ID'.
fig.update_yaxes(
    showline=True,
    linewidth=1,
    linecolor='black',
    showticklabels=True,
    tickmode='array',
    tickvals=list(range(0, 26, 2)),
    title='ID',
    range=[0, 24],
    ticks='outside',
)

# Overall size and font, with a horizontal legend centred above the plot.
fig.update_layout(
    width=900,
    height=700,
    font={'size': 20},
    legend={
        'orientation': "h",
        'yanchor': "top",
        'y': 1.12,
        'xanchor': "center",
        'x': 0.5,
        'font': {'size': 25},
    },
)
# Disabled export of the figure to PNG:
#pio.write_image(fig, '/u/area/lvaleriani/scratch/esm2/curve/svgplots/pnet_esm1.png',scale=5,width=900, height=700)

SyntaxError: unexpected EOF while parsing (2246470315.py, line 19)

## 2. Neighborhood Overlap

### Neighborhood rearrangements - ProteinNet

In [3]:
# Root folder of the ProteinNet neighborhood-overlap data.
path_pnet = '/Users/lucreziavaleriani/Desktop/mount_orfeo/data_repo/no/pnet/'

# Models to compare; each must be a key of `mapping`
# (e.g. 'esm1b', 'esm1v', 'ProtBert', 'ProtT5', 'esm2-...').
model_pnet = ['esm2-35M','esm2-8M']

# For every model store its mapping entry, the index of its last
# representation (number of entries in map[2] minus one) and the path prefix
# of its representation files.
d_model_pnet = {}
for model in model_pnet:
    model_map = mapping[model]
    d_model_pnet[model] = {
        'map': model_map,
        'nlayer': len(model_map[2]) - 1,
        'path': path_pnet + model + '/rep',
    }


In [4]:
# Call get_data once per model with its path prefix and 'nlayer' value, and
# keep the returned object under the 'ds' key.
for model in model_pnet:
    entry = d_model_pnet[model]
    entry['ds'] = get_data(entry['path'], entry['nlayer'], ng=20)

In [None]:
# Per model, evaluate overlap_layer (k=10) for three index pairings and store
# each series as a numpy array:
#   'overlap_first': (l, 0)       for l = 0 .. nlayer
#   'overlap_last' : (l, nlayer)  for l = 0 .. nlayer
#   'overlap_next' : (l, l + 1)   for l = 0 .. nlayer - 1
for model in model_pnet:
    entry = d_model_pnet[model]
    nlayer = entry['nlayer']
    model_ds = entry['ds']

    entry['overlap_first'] = np.array(
        [overlap_layer(model_ds, layer, 0, k=10) for layer in range(nlayer + 1)]
    )
    entry['overlap_last'] = np.array(
        [overlap_layer(model_ds, layer, nlayer, k=10) for layer in range(nlayer + 1)]
    )
    entry['overlap_next'] = np.array(
        [overlap_layer(model_ds, layer, layer + 1, k=10) for layer in range(nlayer)]
    )

In [None]:
# One row of three panels on a transparent background: overlap with the first
# index, with the last index, and between consecutive indices.
fig = go.Figure(layout=Layout(plot_bgcolor='rgba(0,0,0,0)'))
fig = fig.set_subplots(1, 3, horizontal_spacing=0.1, vertical_spacing=0.02)

# NOTE(review): 'overlap_first' and 'overlap_last' are both built over
# range(nlayer + 1), yet plot_no receives nlayer-1 for the former and nlayer
# for the latter — confirm against plot_no that this is intended.
for model in model_pnet:
    entry = d_model_pnet[model]
    nlayer = entry['nlayer']
    model_map = entry['map']

    plot_no(
        fig, nlayer - 1, entry['overlap_first'],
        model_map[1], model_map[0], r'$\chi^{first}$', 1, 1, True,
    )
    plot_no(
        fig, nlayer, entry['overlap_last'],
        model_map[1], model_map[0], r'$\chi^{last}$', 1, 2, False,
    )
    plot_no(
        fig, nlayer - 1, entry['overlap_next'],
        model_map[1], model_map[0], r'$\chi^{l,l+1}$', 1, 3, False,
    )

update_figure(fig, w=900, h=500)

### Remote Homology - Superfamily

In [None]:
# Root folder of the remote-homology (superfamily) data.
path_sp = '/Users/lucreziavaleriani/Desktop/mount_orfeo/data_repo/no/rh_sp/'

# Models to compare; each must be a key of `mapping`
# (e.g. 'esm1b', 'esm1v', 'ProtBert', 'ProtT5', 'esm2-...').
model_sp = ['esm2-35M','esm2-8M']

# For every model store its mapping entry, the index of its last
# representation (number of entries in map[2] minus one), the path prefix of
# its representation files and the folder passed to get_data as label path.
d_model_sp = {}
for model in model_sp:
    model_map = mapping[model]
    d_model_sp[model] = {
        'map': model_map,
        'nlayer': len(model_map[2]) - 1,
        'path': path_sp + model + '/rep',
        # HARD CODED: the label path is the data root, shared by all models.
        'label_path': path_sp,
    }

In [None]:
# Call get_data once per model with its path prefix, 'nlayer' value and label
# path, and keep the returned object under the 'ds' key.
for model in model_sp:
    entry = d_model_sp[model]
    entry['ds'] = get_data(entry['path'], entry['nlayer'], entry['label_path'], ng=20)

0
here
1
here
2
here
3
here
4
here
5
here
6
here
7
here
8
here
9
here
10
here
11
here
12
here
0
here
1
here
2
here
3
here
4
here
5
here
6
here


In [None]:
# Per model, run overlap_label (k=10) on the loaded data and store the result
# as a numpy array under 'overlap_label'.
for model in model_sp:
    entry = d_model_sp[model]
    entry['overlap_label'] = np.array(overlap_label(entry['ds'], k=10))

In [None]:
# Single-panel figure on a transparent background showing the label overlap
# of every model.
fig = go.Figure(layout=Layout(plot_bgcolor='rgba(0,0,0,0)'))
fig = fig.set_subplots(1, 1, horizontal_spacing=0.1, vertical_spacing=0.02)

for model in model_sp:
    entry = d_model_sp[model]
    model_map = entry['map']
    plot_no(
        fig, entry['nlayer'], entry['overlap_label'],
        model_map[1], model_map[0], r'$\chi^{gt}$',
    )

update_figure(fig, w=600, h=500)