In [9]:
import pandas as pd
from Bio import Entrez
from Bio import ExPASy
from Bio import SwissProt
Entrez.email = "user@gmail.com"

## Load in dataset

In [7]:
cmut = pd.read_csv('../data/cancermuts_MZF1.csv')
target_protein_ddgs = pd.read_csv('../data/energies.csv')

### Get SCAN domain location

In [10]:
def fetch_uniprot_record(uniprot_id):
    with ExPASy.get_sprot_raw(uniprot_id) as handle:
        return SwissProt.read(handle)

def get_domain_indices(uniprot_record):
    domain_indices = {}
    for feature in uniprot_record.features:
        if feature.type == 'DOMAIN':
            domain_indices[feature.qualifiers['note']] = [feature.location.start, feature.location.end]
    return domain_indices

In [15]:
target_protein_uniprot = fetch_uniprot_record('P28698')
domains = get_domain_indices(target_protein_uniprot)

start_scan, end_scan = domains['SCAN box']

ExactPosition(43)

### Select indices from cancer muts

In [31]:
# Select subset of cmut, which contains only mutations in the SCAN box
cmut_scan = cmut[(cmut['aa_position'] >= start_scan) & (cmut['aa_position'] <= end_scan)]

# Drop all rows with missing values in alt_aa
cmut_scan = cmut_scan.dropna(subset=['alt_aa'])

# Reset index
cmut_scan = cmut_scan.reset_index(drop=True)

# Count number of rows
n_mutations = cmut_scan.shape[0]

### Match mutations from ddgs with cancermut

In [86]:
cancer_mutations = {}

# Iterate over each row in cmut_scan
for row in cmut_scan.itertuples():
    # Fetch the wildtype and mutant amino acids
    wt_aa = row.ref_aa
    mut_aa = row.alt_aa
    # Fetch the position of the mutation
    position = row.aa_position

    # Fetch the mutation energy from target_protein_ddgs
    ddg_row = target_protein_ddgs[(target_protein_ddgs['Residue #'] == position)]
    ddg = ddg_row[mut_aa].values[0]

    # Store the mutation energy in the dictionary
    cancer_mutations[f'{wt_aa}{position}{mut_aa}'] = ddg

cancer_mutations

{'R44C': 0.83912,
 'R44H': 1.80213,
 'L45M': -0.0755,
 'R46C': 0.98232,
 'R46H': 0.86302,
 'F47S': 2.79973,
 'R48Q': 0.77259,
 'R48W': 0.51392,
 'R51H': 0.67379,
 'G57R': 1.61078,
 'G57W': 3.20923,
 'P58L': 0.52546,
 'A63T': 0.39603,
 'C69Y': -1.68801,
 'R70H': 0.44432,
 'R70L': -0.12486,
 'Q71R': -0.19218,
 'R74C': 2.07742,
 'R74H': 0.57923,
 'P75T': 2.5787,
 'R78C': 2.45512,
 'M83V': 0.71591,
 'L86V': 0.68414,
 'E90G': 1.2914,
 'G94S': 0.28357,
 'A95T': 0.2576,
 'P97L': 2.90791,
 'E99K': 1.39013,
 'E99Q': 0.92478,
 'I100N': 3.00949,
 'A102T': 0.04224,
 'G106W': 0.13791,
 'R108L': 1.37251,
 'E113G': 1.42655,
 'E113K': 0.85259,
 'E114K': -0.36507,
 'L118P': 4.80013,
 'R123C': 0.95158,
 'R124L': 0.77423,
 'E125K': -0.42361}

In [81]:
# Check how many mutations are above the threshold absolute 2.0 kcal/mol 
n_mutations_stabilizing = sum([1 for ddg in cancer_mutations.values() if ddg > 2.0])
n_mutations_neutral = sum([1 for ddg in cancer_mutations.values() if abs(ddg) <= 2.0])
n_mutations_destabilizing = sum([1 for ddg in cancer_mutations.values() if ddg < -2.0])

print(f'Number of mutations: {n_mutations}')
print(f'Number of stabilizing mutations: {n_mutations_stabilizing}')
print(f'Number of neutral mutations: {n_mutations_neutral}')
print(f'Number of destabilizing mutations: {n_mutations_destabilizing}')


Number of mutations: 40
Number of stabilizing mutations: 8
Number of neutral mutations: 32
Number of destabilizing mutations: 0


### Select stabilizing or destabilizing mutations

In [43]:
amino_acids = target_protein_ddgs.columns[3:].values

In [85]:
# Keep only rows with values above 2 or below -2 in amino_acids columns
target_protein_ddgs_thresholded = target_protein_ddgs[target_protein_ddgs[amino_acids].apply(lambda x: x.abs() > 2).any(axis=1)]
target_protein_ddgs_thresholded

Unnamed: 0,WT residue type,chain ID,Residue #,G,A,V,L,I,M,F,...,T,C,Y,N,Q,D,E,K,R,H
3,F,A,47,3.42244,2.13592,1.77024,-0.02192,1.80926,0.20279,-0.00280,...,2.66995,2.25383,0.32704,2.69600,1.77852,3.38536,2.65121,1.35347,1.58603,2.20909
4,R,A,48,1.31879,0.88199,1.20100,0.17514,0.74971,-0.12878,0.80316,...,1.03416,0.78517,0.64717,0.07568,0.77259,0.95081,0.66792,0.05241,-0.00373,1.14795
5,C,A,49,0.34146,0.27020,0.86087,-0.62206,0.42207,-0.60659,-0.40163,...,0.22744,-0.01246,-0.42019,-0.12575,-0.10587,0.20885,0.01121,-0.25369,-0.39015,0.07064
6,F,A,50,4.76570,3.98788,3.85820,1.94493,3.02370,1.51393,-0.01452,...,3.40208,3.78489,1.69770,4.24162,3.85825,5.22674,4.78152,3.55783,4.60678,3.38200
10,E,A,54,1.81854,1.15818,1.96924,0.13339,1.50806,0.18300,0.44957,...,0.71204,1.18795,0.65055,0.66940,0.53608,0.19198,-0.05606,0.79324,1.11500,1.29404
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,D,A,120,0.44441,-0.32840,-0.16155,-0.90871,-0.67685,-1.21087,-0.81166,...,-0.36068,0.11618,-0.73722,-0.46974,-0.38918,-0.00106,-0.52250,-0.59462,-0.88843,-0.06133
77,G,A,121,0.00000,-0.96237,-0.69805,-1.39964,-1.28458,-2.05061,-1.18047,...,-0.40916,-0.07043,-1.17149,-0.62107,-1.26949,-0.02438,-0.70323,-1.19639,-1.13339,-0.47717
78,L,A,122,3.26117,1.69376,2.74528,-0.00846,1.33506,0.04320,-0.25410,...,2.67839,2.34500,0.95772,1.94982,2.73183,3.64249,2.82995,1.93052,1.93091,1.78189
79,R,A,123,0.85712,0.70247,1.84200,0.35544,1.48981,-0.05607,0.04893,...,1.71708,0.95158,0.13445,1.06764,0.50416,2.19407,1.36348,0.18090,0.00258,0.34266
