# Proteins

Standard name - systematic name

STE24 - YJR117W

RCE1  - YMR274C

RAM1  - YDL090C

These three proteins do not form complexes with each other.


Found three useful proteins:

- YDR388W (Cytoskeleton one): community 12; betweenness, information, PCA; general partition

- YLR025W (Endosomal one): community 8; information, subgraph, PCA; general partition

- YMR037C (DNA binding transcription): community 0; betweenness, information; SA partition

In [1]:
import networkx as nx

In [2]:
'''
STE24 - YJR117W
RCE1  - YMR274C
RAM1  - YDL090C
'''
# Proteins that we focus on
def print_protein(str):
    """Return a ``STANDARD(SYSTEMATIC)`` display label for a protein.

    Args:
        str: Systematic name of the protein, e.g. ``'YJR117W'``. The
            parameter name shadows the builtin ``str``; it is kept so that
            existing callers (including keyword callers) keep working.

    Returns:
        ``'STE24(YJR117W)'``, ``'RCE1(YMR274C)'`` or ``'RAM1(YDL090C)'`` for
        the three target proteins. Any other name is returned unchanged, so
        the result can always be concatenated into a message instead of
        being ``None``.
    """
    standard_names = {
        'YJR117W': 'STE24',
        'YMR274C': 'RCE1',
        'YDL090C': 'RAM1',
    }
    standard = standard_names.get(str)
    if standard is None:
        # Unknown protein: fall back to the systematic name itself.
        return str
    return f'{standard}({str})'

In [3]:
# Build the graph from the weighted edge list in cleanData.txt.
# Lines starting with "#" are treated as comments; node labels stay strings.
G = nx.read_weighted_edgelist(
    "cleanData.txt",
    comments="#",
    nodetype=str,
)

In [4]:
# Systematic names of the three target proteins
all_proteins = [
    'YJR117W',  # STE24
    'YMR274C',  # RCE1
    'YDL090C',  # RAM1
]

# First set

In [5]:
# Findings from each partition
general = {
    'YER095W', 'YBR010W', 'YLR025W', 'YBL037W',
    'YGL045W', 'YDR477W', 'YLR113W', 'YDR388W',
}
SA = {
    'YGR152C', 'YFL026W', 'YLR113W', 'YMR037C',
    'YNL098C', 'YNL298W', 'YKL178C',
}

# First set: the union of both partitions' findings.
# Number of chosen adjacent communities: n = 4 (general), n = 3 (SA);
# only one representative is chosen per community.
first_set = general | SA

print(first_set)

{'YBR010W', 'YLR113W', 'YGR152C', 'YNL298W', 'YMR037C', 'YLR025W', 'YNL098C', 'YBL037W', 'YDR388W', 'YGL045W', 'YFL026W', 'YER095W', 'YDR477W', 'YKL178C'}


In [6]:
# Proteins of the previous proposal, used to check overlap with the new findings
previous = {
    'YDL100C', 'YJL154C', 'YDR072C', 'YLL013C', 'YNL064C',
    'YBL037W', 'YOL147C', 'YGL013C', 'YLR113W', 'YOR065W',
    'YDR388W', 'YOL013C', 'YPL106C', 'YDR477W',
}

# A protein overlaps when it is in either partition's findings
# and also in the previous proposal.
overlap_proteins = (general | SA) & previous
n_overlap = len(overlap_proteins)

print(f'There are {n_overlap} as the previous proposal: {overlap_proteins}')

There are 4 as the previous proposal: {'YLR113W', 'YBL037W', 'YDR477W', 'YDR388W'}


# Second set

In [7]:
# Adjacent communities directly connected to the target proteins,
# one representative per community (n = 4 in general, n = 3 in SA).
general_direct_adjacent = {
    'YOR065W', 'YPL106C', 'YDR477W', 'YDR072C', 'Q0130', 'YLL013C',
    'YGR281W', 'YBL037W', 'YNL064C', 'YLR113W', 'YAL058W', 'YDR388W',
}

# Ranked adjacent communities with two representatives each
# (n = 4 in general, n = 3 in SA); only the second representatives
# are listed here.
general_second_representatives = {
    'YHL007C', 'YML032C', 'YOL062C', 'YLR113W', 'YCL008C', 'YLR262C',
    'YNL030W', 'YHL027W', 'YDL077C', 'YBL016W', 'YOL012C',
}
SA_second_representatives = {
    'YKL209C', 'YFL026W', 'YJR092W', 'YJL128C', 'YIL033C', 'YLR319C',
    'YMR037C', 'YNL098C', 'YNL298W', 'YGR040W', 'YKL178C',
}

# More ranked communities: the general partition is widened to n = 7, while
# SA is left as is because expanding it makes no sense.
# One representative per community.
# NOTE(review): the original note here also said "n = 4 in general, n = 3 in
# SA", which conflicts with n = 7 — confirm which value was actually used.
more_ranked = {
    'YMR250W', 'YER095W', 'YBR010W', 'YLR025W', 'YBL037W', 'YPL106C',
    'YGR281W', 'YGL026C', 'YGL045W', 'YNL064C', 'YDR477W', 'YDR072C',
    'YLR113W', 'YNR001C', 'YAL058W', 'YDR388W',
}

# Closer focus on STE24 ('YJR117W'): second representatives of the
# adjacent communities directly linked to STE24.
STE24_second_representatives = [
    'YMR186W',
    'Q0130',
    'YJR121W',
    'YGL187C',
    'YNL064C',
]


In [8]:
# Proposed second set, drawn from
# general_direct_adjacent + more_ranked + STE24_second_representatives.
# NOTE(review): the literal looks like that union with everything already in
# `previous` and `first_set` removed — confirm before regenerating it.
second_set = {
    'YGR281W',
    'YGL026C',
    'YNR001C',
    'YMR250W',
    'Q0130',
    'YAL058W',
    'YGL187C',
    'YJR121W',
    'YMR186W',
}

# Union results and check shortest paths to the targets

In [9]:
# Everything proposed so far, merged into one set used to filter new findings.
# NOTE: the name `filter` shadows the builtin; it is kept because other cells
# may refer to it.
proposed = [previous, first_set, second_set]
filter = set().union(*proposed)

# Result collections to merge. Alternative, wider selection:
# current_found = [general_direct_adjacent, general_second_representatives, SA_second_representatives, more_ranked]
current_found = [SA_second_representatives]

# Keep only the proteins that have not been proposed before.
current_set = {
    candidate
    for found in current_found
    for candidate in found
    if candidate not in filter
}

print(f'Current set of potein: {current_set}')

Current set of potein: {'YLR319C', 'YGR040W', 'YKL209C', 'YJR092W', 'YJL128C', 'YIL033C'}


In [10]:
# For every target protein, print a shortest path to each protein in current_set
for target_protein in all_proteins:
    header = ''.join(('- ', print_protein(target_protein), ':'))
    print(header)
    for candidate in current_set:
        # weight=None: edge weights are ignored, every edge counts as one step
        route = nx.shortest_path(
            G,
            source=target_protein,
            target=candidate,
            weight=None,
            method='dijkstra',
        )
        print(f"Shortest path to {candidate}: {route}")
    print()

- STE24(YJR117W):
Shortest path to YLR319C: ['YJR117W', 'YBL032W', 'YLR319C']
Shortest path to YGR040W: ['YJR117W', 'YAL005C', 'YGR040W']
Shortest path to YKL209C: ['YJR117W', 'YKL209C']
Shortest path to YJR092W: ['YJR117W', 'YAR042W', 'YJR092W']
Shortest path to YJL128C: ['YJR117W', 'YAL005C', 'YJL128C']
Shortest path to YIL033C: ['YJR117W', 'YAL005C', 'YIL033C']

- RCE1(YMR274C):
Shortest path to YLR319C: ['YMR274C', 'YBL032W', 'YLR319C']
Shortest path to YGR040W: ['YMR274C', 'YBL032W', 'YGR040W']
Shortest path to YKL209C: ['YMR274C', 'YKL209C']
Shortest path to YJR092W: ['YMR274C', 'YCL030C', 'YJR092W']
Shortest path to YJL128C: ['YMR274C', 'YBL032W', 'YJL128C']
Shortest path to YIL033C: ['YMR274C', 'YBR164C', 'YIL033C']

- RAM1(YDL090C):
Shortest path to YLR319C: ['YDL090C', 'YBR023C', 'YLR319C']
Shortest path to YGR040W: ['YDL090C', 'YAL005C', 'YGR040W']
Shortest path to YKL209C: ['YDL090C', 'YKL209C']
Shortest path to YJR092W: ['YDL090C', 'YAR042W', 'YJR092W']
Shortest path to YJ