In [1]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive (Colab only); the working directory
# chosen later in the notebook lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tqdm
from scipy.spatial import KDTree
import ast
import seaborn as sns
import time
import os
from multiprocessing import Pool
import requests
import ast
from matplotlib.colors import LogNorm
import pickle
import scipy.sparse as sp
from scipy.signal import convolve2d

In [3]:
# Switch to the project folder on the mounted Drive so that the relative
# './data/...' paths and the %run of hist_functions.py below resolve from here.
os.chdir('/content/drive/MyDrive/BRI Analysis')

In [4]:
# Execute hist_functions.py in this notebook's namespace. The helpers called
# further down (calculate_means, create_histogram*, pickler_write) are not
# defined in the notebook itself, so they presumably come from this script.
%run hist_functions.py

### Set up the plotting parameters

In [5]:
# --- BRI quantities -------------------------------------------------------
# Short names (presumably the matching DataFrame column names) for the nine
# BRI components: x/y/z for each of the N, A and C entries.
bri_names_1d = [
    'x(N)', 'y(N)', 'z(N)',
    'x(A)', 'y(A)', 'z(A)',
    'x(C)', 'y(C)', 'z(C)',
]

# LaTeX labels, in the same order as bri_names_1d.
bri_names_1d_latex = [
    '$x_{BRI}(C_{i-1}N_{i})$',
    '$y_{BRI}(C_{i-1}N_{i})$',
    '$z_{BRI}(C_{i-1}N_{i})$',
    '$x_{BRI}(N_{i}A_{i})$',
    '$y_{BRI}(N_{i}A_{i})$',
    '$z_{BRI}(N_{i}A_{i})$',
    '$x_{BRI}(A_{i}C_{i})$',
    '$y_{BRI}(A_{i}C_{i})$',
    '$z_{BRI}(A_{i}C_{i})$',
]

# One identical parameter triple per 1D quantity.
# NOTE(review): looks like (bin width, lower bound, upper bound) - confirm
# against hist_functions.py.
bri_input_parameters_1d = [(0.01, -2.0, 2.0) for _ in bri_names_1d]

# Index pairs into the 1D lists: every x/y, x/z and y/z combination within
# each group of three.
bri_pairs_2d = [
    (0, 1), (0, 2), (1, 2),
    (3, 4), (3, 5), (4, 5),
    (6, 7), (6, 8), (7, 8),
]

bri_names_2d = [(bri_names_1d[a], bri_names_1d[b]) for a, b in bri_pairs_2d]
bri_names_2d_latex = [
    (bri_names_1d_latex[a], bri_names_1d_latex[b]) for a, b in bri_pairs_2d
]
# Every 2D histogram shares the same six-value parameter tuple.
bri_input_parameters_2d = [
    (0.01, 0.01, -2.0, 2.0, -2.0, 2.0) for _ in bri_pairs_2d
]


# --- TRIN quantities ------------------------------------------------------
# Short names (presumably DataFrame column names) and their LaTeX labels,
# listed in matching order.
trin_names_1d = ['x(AN)', 'x(AC)', 'y(AC)']
trin_names_1d_latex = [
    '$x_{TRIN}(A_{i}N_{i})$',
    '$x_{TRIN}(A_{i}C_{i})$',
    '$y_{TRIN}(A_{i}C_{i})$',
]

# One identical parameter triple per 1D quantity.
# NOTE(review): looks like (bin width, lower bound, upper bound) - confirm
# against hist_functions.py.
trin_input_parameters_1d = [(0.01, -3.0, 3.0) for _ in trin_names_1d]

# All unordered pairs of the three 1D quantities.
trin_pairs_2d = [(0, 1), (0, 2), (1, 2)]

trin_names_2d = [(trin_names_1d[a], trin_names_1d[b]) for a, b in trin_pairs_2d]
trin_names_2d_latex = [
    (trin_names_1d_latex[a], trin_names_1d_latex[b]) for a, b in trin_pairs_2d
]
# Every 2D histogram shares the same six-value parameter tuple.
trin_input_parameters_2d = [
    (0.01, 0.01, -3.0, 3.0, -3.0, 3.0) for _ in trin_pairs_2d
]

# --- Angle/length (AL) quantities -----------------------------------------
# Column names: three lengths, three angles, three tau values.
# ('tau(CN)' is used as a DataFrame column further down in this notebook.)
al_names_1d = [
    'length(N)', 'length(A)', 'length(C)',
    'angle(N)', 'angle(A)', 'angle(C)',
    'tau(NA)', 'tau(AC)', 'tau(CN)',
]

# LaTeX labels, in the same order as al_names_1d.
al_names_1d_latex = [
    '$len(C_{i-1}N_{i})$', '$len(N_{i}A_{i})$', '$len(A_{i}C_{i})$',
    '$angle(C_{i-1}N_{i}A_{i})$', '$angle(N_{i}A_{i}C_{i})$', '$angle(A_{i}C_{i}N_{i+1})$',
    '$tau(C_{i-1}N_{i}A_{i},N_{i}A_{i}C_{i})$', '$tau(N_{i}A_{i}C_{i},A_{i}C_{i}N_{i+1})$',
    '$tau(A_{i-1}C_{i-1}N_{i},C_{i-1}N_{i}A_{i})$',
]

# One parameter triple per 1D quantity.
# NOTE(review): looks like (bin width, lower bound, upper bound) - confirm
# against hist_functions.py.
al_input_parameters_1d = [
    (0.01, 0.0, 3.0), (0.01, 0.0, 3.0), (0.01, 0.0, 3.0),              # lengths
    (0.1, 0.0, 180.0), (0.1, 0.0, 180.0), (0.1, 0.0, 180.0),           # angles
    (0.1, -180.0, 180.0), (0.1, -180.0, 180.0), (0.1, -180.0, 180.0),  # taus
]

# Index pairs into the 1D lists; quantities are only paired within their own
# group (length/length, angle/angle, tau/tau).
al_pairs_2d = [
    (0, 1), (0, 2), (1, 2),
    (3, 4), (3, 5), (4, 5),
    (6, 7), (6, 8), (7, 8),
]

al_names_2d = []
al_names_2d_latex = []
al_input_parameters_2d = []
for first, second in al_pairs_2d:
    al_names_2d.append((al_names_1d[first], al_names_1d[second]))
    al_names_2d_latex.append((al_names_1d_latex[first], al_names_1d_latex[second]))

    # Build the 2D tuple from the two 1D tuples instead of repeating the
    # numbers by hand. The layout (p1[0], p2[0], p1[1], p1[2], p2[1], p2[2])
    # reproduces the previously hard-coded values exactly, and it removes the
    # old fallback branch, which no pair in al_pairs_2d could ever reach.
    first_step, first_lo, first_hi = al_input_parameters_1d[first]
    second_step, second_lo, second_hi = al_input_parameters_1d[second]
    al_input_parameters_2d.append(
        (first_step, second_step, first_lo, first_hi, second_lo, second_hi)
    )


In [6]:
# Directory holding the per-batch tables that the histogram loop reads with
# pd.read_parquet. Note the trailing slash: code that appends "/" + name to
# this value ends up with a (harmless) double slash.
inv_dir = './data/bri_computations/'
# Snapshot of the directory contents; os.listdir returns names in arbitrary
# order, so do not rely on the batches being sorted.
invariants = os.listdir(inv_dir)

# Echo the listing as a quick sanity check that the Drive folder is visible.
print(invariants)

['batch_0.parquet', 'batch_1.parquet', 'batch_2.parquet', 'batch_3.parquet', 'batch_4.parquet', 'batch_5.parquet', 'batch_6.parquet', 'batch_7.parquet', 'batch_8.parquet', 'batch_9.parquet', 'batch_10.parquet', 'batch_11.parquet', 'batch_12.parquet', 'batch_13.parquet', 'batch_14.parquet', 'batch_15.parquet', 'batch_16.parquet', 'batch_17.parquet', 'batch_18.parquet', 'batch_19.parquet', 'batch_20.parquet', 'batch_21.parquet', 'batch_22.parquet', 'batch_23.parquet', 'batch_24.parquet', 'batch_25.parquet', 'batch_26.parquet', 'batch_27.parquet', 'batch_28.parquet', 'batch_29.parquet', 'batch_30.parquet', 'batch_31.parquet', 'batch_32.parquet', 'batch_33.parquet', 'batch_34.parquet', 'batch_35.parquet', 'batch_36.parquet', 'batch_37.parquet', 'batch_38.parquet', 'batch_39.parquet', 'batch_40.parquet', 'batch_41.parquet', 'batch_42.parquet', 'batch_43.parquet', 'batch_44.parquet', 'batch_45.parquet', 'batch_46.parquet', 'batch_47.parquet', 'batch_48.parquet', 'batch_49.parquet', 'batch_50

In [7]:
# @title Get list with all chains

# Table of cleaned chains; only 'chain_id' and 'pdb_id' are touched here.
clean_output_file = './data/cleaned_connective_chains.csv'
final_clean_df = pd.read_csv(clean_output_file)

# Missing chain ids become the literal string 'NA', then the whole column is
# forced to str so that it has a single, uniform dtype.
final_clean_df['chain_id'] = final_clean_df['chain_id'].fillna('NA').astype(str)

# Distinct PDB entries present in the cleaned chain table.
pdb_ids = final_clean_df['pdb_id'].unique()

In [8]:
# @title Get meta data

meta_data = pd.read_csv('./data/PDB727K_webscrape_meta_data.csv')

# Keep X-ray diffraction entries only; the resolution cut is applied below.
is_xrd = meta_data['Method'] == 'X-RAY DIFFRACTION'
meta_data_xrd = meta_data[is_xrd].copy()

# 'Resolution' is stored as bracketed text, e.g. '[2.55]'. Drop the brackets
# and, where several comma-separated values are listed, keep the first one.
resolution_text = (
    meta_data_xrd['Resolution']
    .astype(str)
    .str.strip('[]')
    .str.split(',')
    .str[0]
)

# Anything that does not parse as a number (e.g. an empty string) becomes NaN
# and therefore fails the <= 2 comparison below.
meta_data_xrd['Resolution'] = pd.to_numeric(resolution_text, errors='coerce')

# Entries with resolution <= 2 (the bound is inclusive, despite the 'lt' in
# the variable names).
meta_data_xrd_lt_2A = meta_data_xrd[meta_data_xrd['Resolution'].le(2)].copy()
pdb_ids_lt_2A = meta_data_xrd_lt_2A['pdb_id'].unique()

In [None]:
# @title Generate all of the corresponding histograms

# (result-key prefix, quantity names, input parameters) for each family.
# The order matters: it fixes the key order of the dictionary written per file.
histogram_specs_2d = [
    ('al', al_names_2d, al_input_parameters_2d),
    ('bri', bri_names_2d, bri_input_parameters_2d),
    ('trin', trin_names_2d, trin_input_parameters_2d),
]
histogram_specs_1d = [
    ('al', al_names_1d, al_input_parameters_1d),
    ('bri', bri_names_1d, bri_input_parameters_1d),
    ('trin', trin_names_1d, trin_input_parameters_1d),
]


def _collect_histograms(batch_data, batch_means):
    """Build every histogram for one batch and return them in a single dict.

    Parameters
    ----------
    batch_data : DataFrame
        Rows of the batch after filtering; feeds the standard histograms.
    batch_means : DataFrame
        Output of calculate_means for the same batch; feeds the "_mean"
        histograms.

    Returns
    -------
    dict
        Keys '<family>_2d' / '<family>_1d' (standard) and
        '<family>_2d_mean' / '<family>_1d_mean' (means), where <family> is
        'al', 'bri' or 'trin'. A group of keys is left out when its input
        frame is empty, so the dict may be empty.
    """
    results = {}

    # --- 1. Standard histograms ---
    if len(batch_data) > 0:
        for prefix, names, params in histogram_specs_2d:
            results[f'{prefix}_2d'] = create_histogram2d(batch_data, names, params)
        for prefix, names, params in histogram_specs_1d:
            results[f'{prefix}_1d'] = create_histogram(batch_data, names, params)

    # --- 2. Mean histograms (the "_mean" suffix keeps the keys distinct) ---
    if len(batch_means) > 0:
        for prefix, names, params in histogram_specs_2d:
            results[f'{prefix}_2d_mean'] = create_histogram2d_means(batch_means, names, params)
        for prefix, names, params in histogram_specs_1d:
            results[f'{prefix}_1d_mean'] = create_histogram_means(batch_means, names, params)

    return results


# Two full passes over the batches: first restricted to the X-ray entries with
# resolution <= 2, then over every cleaned PDB entry.
for should_restrict in [True, False]:

    pdb_ids_list = pdb_ids_lt_2A if should_restrict else pdb_ids

    for batch_file in tqdm.tqdm(os.listdir(inv_dir)):

        try:
            # os.path.join avoids the double slash that inv_dir + "/" + name
            # produced (inv_dir already ends with '/').
            bri_al_data = pd.read_parquet(os.path.join(inv_dir, batch_file))
            i_name = batch_file.split('.')[0]

            bri_al_data = bri_al_data[bri_al_data['pdb_id'].isin(pdb_ids_list)].copy()

            # Means are computed BEFORE the tau(CN) shift below, i.e. on the
            # unshifted column.
            bri_al_data_mean = calculate_means(bri_al_data)

            # Hack for 'tau(CN)': move every value down by one row.
            # NOTE(review): the shift is applied to the whole filtered frame,
            # so the first row of each pdb entry receives the last value of
            # the preceding entry - confirm this is intended, or shift per
            # group instead.
            bri_al_data['tau(CN)'] = bri_al_data['tau(CN)'].shift(1)

            # Everything for this file goes into one dictionary.
            file_results = _collect_histograms(bri_al_data, bri_al_data_mean)

            # --- Write ONCE per file ---
            if file_results:
                # Saves as "{filename}_data.pkl" (or _restrict.pkl)
                pickler_write(f"{i_name}_data", file_results, restrict=should_restrict)

        except Exception as e:
            # Deliberate best effort: report the failing batch and move on so
            # that a single bad file does not abort the whole run.
            print(f"Skipping {batch_file} due to error: {e}")
            continue

100%|██████████| 146/146 [37:07<00:00, 15.25s/it]
100%|██████████| 146/146 [1:02:33<00:00, 25.71s/it]
