In [13]:
%%time
# Import necessary Libraries
import os
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import pickle
# --- Configuration ---------------------------------------------------------
# boolVal is a manual switch: True rebuilds both pickle caches from the raw
# files, False loads the caches written by an earlier run.
boolVal = True
attedDataName = 'Overlapping_GenesMR.p'
bioGRIDfileName = 'BioGRID_with_ATTEDMR.p'
# One spelling of the data folder for every read and existence check.
# NOTE(review): the original mixed an absolute C:/Users/... path with this
# relative one; they are assumed to point at the same directory - confirm.
dataDir = os.path.join('..', '..', 'project_data')
attedDir = os.path.join(dataDir, 'Ath-mB.v17-08.G20819-S16033.rma_combat.mrgeo.d')
attedpath = attedDir + os.sep
path = os.path.join(dataDir, 'BIOGRID-ORGANISM-Arabidopsis_thaliana_Columbia-3.5.181.tab2.txt')

# Reading in BioGRID as DataFrame. This happens on both branches because later
# cells read bioGRID_DF directly; the `with` block closes the handle.
with open(path, "r") as bioGRID_file:
    bioGRID_DF = pd.read_csv(bioGRID_file, sep='\t')
# Simplified DataFrame to only include interactions
simplebGRID = bioGRID_DF[['Entrez Gene Interactor A', 'Entrez Gene Interactor B']]

if boolVal:
    # Identifying Organisms Present in BioGRID
    OrganismTypesA = sorted(bioGRID_DF['Organism Interactor A'].unique())
    OrganismTypesB = sorted(bioGRID_DF['Organism Interactor B'].unique())
    OrganismTypes = sorted(set(OrganismTypesA) | set(OrganismTypesB))
    # Use NCBI's Taxonomy Name/ID Status Report - Plug in OrganismTypes and get .txt
    organismIDs = pd.read_csv(os.path.join(dataDir, 'tax_report.txt'), sep='\t')
    organismIDs = organismIDs.drop(columns=['|', '|.1', '|.2', 'code', 'primary taxid'])

    # Categorize into different subsets based on organism ID. We know '3702' is Arabidopsis
    mask1 = bioGRID_DF['Organism Interactor A'] == 3702
    mask2 = bioGRID_DF['Organism Interactor B'] == 3702
    onlyArabDF = bioGRID_DF[mask1 & mask2]    # both interactors are Arabidopsis
    # Exactly one interactor is Arabidopsis. The previous `~mask1 | ~mask2`
    # also matched rows with no Arabidopsis partner, overlapping noArabDF.
    oneArabDF = bioGRID_DF[mask1 ^ mask2]
    noArabDF = bioGRID_DF[~mask1 & ~mask2]    # neither interactor is Arabidopsis

    # Get list of genes so I can import the necessary ATTED Data.
    # Note that the ATTED data has one text file per Entrez gene ID.
    # WholeData:
    wholeGenesA = sorted(bioGRID_DF['Entrez Gene Interactor A'].unique())
    wholeGenesB = sorted(bioGRID_DF['Entrez Gene Interactor B'].unique())
    wholeGenes = sorted(set(wholeGenesA) | set(wholeGenesB))
    # Only Arabidopsis Subset
    ArabGenesA = sorted(onlyArabDF['Entrez Gene Interactor A'].unique())
    ArabGenesB = sorted(onlyArabDF['Entrez Gene Interactor B'].unique())
    ArabGenes = sorted(set(ArabGenesA) | set(ArabGenesB))

    # Get a list of all genes in ATTED, taken from the first column of one
    # per-gene file (814630) and sorted by gene ID.
    atted = pd.read_csv(os.path.join(attedDir, '814630'), sep='\t', header=None)
    atted = atted.sort_values(by=0)
    attedGenes = atted[0].tolist()

    # Overlapping genes: BioGRID genes that have a per-gene ATTED file.
    overlapGenes = [
        gene for gene in wholeGenes
        if os.path.exists(os.path.join(attedDir, '{}'.format(gene)))
    ]

    # Read in only the overlapping genes, one column of values per gene.
    # NOTE(review): columns are aligned with attedGenes purely by sorted
    # position, which assumes every per-gene file lists the same genes as the
    # reference file - confirm.
    DF = {0: attedGenes}
    for x in overlapGenes:
        tempAtted = pd.read_csv(os.path.join(attedDir, '{}'.format(x)), sep='\t', header=None)
        DF[x] = tempAtted.sort_values(by=0)[1].tolist()
    attedData = (
        pd.DataFrame(DF, dtype='float64')
        .astype({0: 'int'})
        .set_index(0)
    )
    with open(attedDataName, 'wb') as cacheFile:
        pickle.dump(attedData, cacheFile)

    # Add ATTED MR Values to BioGRID.
    # .copy() makes genesGRID an independent frame, so adding a column neither
    # touches simplebGRID nor triggers a SettingWithCopyWarning.
    genesGRID = simplebGRID.copy()
    overlapSet = set(attedData.columns)
    # One scalar lookup per interaction (row = gene A, column = gene B); NaN
    # when either gene is outside the overlapping set. Collecting the values
    # first and assigning the column once avoids a DataFrame write per row.
    mrValues = [
        attedData.at[geneA, geneB] if (geneA in overlapSet and geneB in overlapSet) else np.nan
        for geneA, geneB in zip(genesGRID['Entrez Gene Interactor A'],
                                genesGRID['Entrez Gene Interactor B'])
    ]
    genesGRID['MR'] = pd.Series(mrValues, index=genesGRID.index, dtype='float64')
    with open(bioGRIDfileName, 'wb') as cacheFile:
        pickle.dump(genesGRID, cacheFile)
    # Kept from the original. The assignment at the top of this cell resets the
    # flag on every re-run, so this only matters to code that reads boolVal later.
    boolVal = False
else:
    # Load data using pickle. Only load cache files written by this notebook:
    # unpickling an untrusted file can execute arbitrary code.
    with open(attedDataName, 'rb') as cacheFile:
        attedData = pickle.load(cacheFile)
    with open(bioGRIDfileName, 'rb') as cacheFile:
        genesGRID = pickle.load(cacheFile)
# Initializes Networkx Graph
genesG = nx.Graph()
# Adds one edge per interaction, storing the MR value under the 'weight'
# attribute. Zipping the three columns avoids three label-based lookups per row
# and no longer depends on genesGRID having a default 0..n-1 index.
# As before, a repeated gene pair simply overwrites the earlier edge.
# NOTE(review): rows whose MR is NaN still become edges with a NaN weight;
# weighted path lengths through such edges are unlikely to be meaningful -
# confirm whether these edges should be dropped or given a default weight.
genesG.add_weighted_edges_from(
    zip(
        genesGRID['Entrez Gene Interactor A'],
        genesGRID['Entrez Gene Interactor B'],
        genesGRID['MR'],
    ),
    weight='weight',
)

Wall time: 7min 15s


In [14]:
%%time
# Sanity check on one gene pair. With no heuristic and the default weight key,
# astar_path_length returns the summed 'weight' attribute (the MR value) along
# the cheapest path, not the number of hops.
# NOTE(review): the recorded output of this cell is 1.24, not the expected 13 -
# the expectation or the graph weights need to be re-checked.
gA, gB = 825075, 818903
pathLength = nx.astar_path_length(genesG, gA, gB)
print(f'Distance between gene {gA} and gene {gB} is {pathLength}.')
print('This should output 13.')

Distance between gene 825075 and gene 818903 is 1.24.
This should output 13.
Wall time: 977 µs


## Input ABC transporters and LTPs
---
List of ABC transporters can be found at:
https://www.arabidopsis.org/browse/genefamily/ABC_merged.jsp
List of LTPs: use the general search bar (top right) with the query 'LTP':
https://www.arabidopsis.org/servlets/Search?type=general&search_action=detail&method=1&show_obsolete=F&name=LTP&sub_type=gene&SEARCH_EXACT=4&SEARCH_CONTAINS=1

The gene lists were converted from TAIR_ID to Entrez gene IDs using DAVID: https://david.ncifcrf.gov/conversion.jsp

In [22]:
# Collect every systematic name that appears on either side of an interaction,
# then merge the two lists into one sorted list without duplicates.
sysNamesA = bioGRID_DF['Systematic Name Interactor A'].unique().tolist()
sysNamesB = bioGRID_DF['Systematic Name Interactor B'].unique().tolist()
sysNames = sorted(set(sysNamesA) | set(sysNamesB))

In [23]:
# Display the full sorted list of systematic names. The first entry in the
# recorded output is '-', presumably a placeholder for a missing name (verify).
sysNames

['-',
 'AT1G01010',
 'AT1G01030',
 'AT1G01040',
 'AT1G01050',
 'AT1G01060',
 'AT1G01070',
 'AT1G01090',
 'AT1G01100',
 'AT1G01120',
 'AT1G01140',
 'AT1G01160',
 'AT1G01183',
 'AT1G01200',
 'AT1G01210',
 'AT1G01230',
 'AT1G01240',
 'AT1G01250',
 'AT1G01260',
 'AT1G01290',
 'AT1G01300',
 'AT1G01320',
 'AT1G01340',
 'AT1G01350',
 'AT1G01360',
 'AT1G01370',
 'AT1G01380',
 'AT1G01440',
 'AT1G01470',
 'AT1G01480',
 'AT1G01500',
 'AT1G01510',
 'AT1G01520',
 'AT1G01530',
 'AT1G01540',
 'AT1G01560',
 'AT1G01580',
 'AT1G01610',
 'AT1G01620',
 'AT1G01630',
 'AT1G01640',
 'AT1G01650',
 'AT1G01700',
 'AT1G01720',
 'AT1G01740',
 'AT1G01760',
 'AT1G01780',
 'AT1G01790',
 'AT1G01800',
 'AT1G01820',
 'AT1G01880',
 'AT1G01910',
 'AT1G01920',
 'AT1G01960',
 'AT1G01970',
 'AT1G01990',
 'AT1G02010',
 'AT1G02040',
 'AT1G02065',
 'AT1G02080',
 'AT1G02090',
 'AT1G02130',
 'AT1G02140',
 'AT1G02150',
 'AT1G02160',
 'AT1G02170',
 'AT1G02205',
 'AT1G02210',
 'AT1G02220',
 'AT1G02230',
 'AT1G02280',
 'AT1G02340',
