In [48]:
%%time
# Import necessary Libraries
import os
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import pickle # used to store cleaned data
import numpy.ma as ma # masking for numpy Arrays
boolVal = False
attedDataName = 'Overlapping_GenesMR.p'
bioGRIDfileName = 'BioGRID_with_ATTEDMR.p'
if boolVal:
    # Reading in BioGRID as DataFrame
    path = os.path.join('C:\\Users\\ysman\\OneDrive\\Desktop\\project_data\\BIOGRID-ORGANISM-Arabidopsis_thaliana_Columbia-3.5.181.tab2.txt')
    bioGRID_file = open(path, "r") 
    testFile = open(os.path.join('C:\\Users\\ysman\\OneDrive\\Desktop\\project_directory\\data\\test.txt'), 'r')
    bioGRID_DF = pd.read_csv(bioGRID_file, sep = '\t')
    # Simplified DataFrame to only include interactions
    simplebGRID = bioGRID_DF[['Entrez Gene Interactor A','Entrez Gene Interactor B']]
    # Identifying Organisms Present in BioGRID
    OrganismTypesA = list(bioGRID_DF['Organism Interactor A'].unique())
    OrganismTypesB = list(bioGRID_DF['Organism Interactor B'].unique())
    OrganismTypesA.sort()
    OrganismTypesB.sort()
    OrganismTypes = list(set([*OrganismTypesA, *OrganismTypesB]))
    OrganismTypes.sort()
    # Use NCBI's Taxonomy Name/ID Status Report - Plug in OrganismTypes and get .txt
    organismIDs = pd.read_csv('../../project_data/tax_report.txt', sep = '\t')
    organismIDs.drop(columns = ['|','|.1','|.2', 'code', 'primary taxid'],inplace = True)
    # Categorize into different subsets based on organism ID. We know '3702' is Arabidopsis
    mask1 = bioGRID_DF['Organism Interactor A'] == 3702
    mask2 = bioGRID_DF['Organism Interactor B'] == 3702
    onlyArabDF = bioGRID_DF[mask1& mask2]
    oneArabDF = bioGRID_DF[~mask1|~mask2]
    noArabDF = bioGRID_DF[~mask1 & ~mask2]
    # Get list of genes so I can import the necessary ATTED Data. Note that the ATTED data has a text file by Entrez gene ID
    # WholeData:
    wholeGenesA = list(bioGRID_DF['Entrez Gene Interactor A'].unique())
    wholeGenesB = list(bioGRID_DF['Entrez Gene Interactor B'].unique())
    wholeGenesA.sort()
    wholeGenesB.sort()
    wholeGenes = list(set([*wholeGenesA, *wholeGenesB]))
    wholeGenes.sort()
    # Only Arabidopsis Subset
    ArabGenesA = list(onlyArabDF['Entrez Gene Interactor A'].unique())
    ArabGenesB = list(onlyArabDF['Entrez Gene Interactor B'].unique())
    ArabGenesA.sort()
    ArabGenesB.sort()
    ArabGenes = list(set([*ArabGenesA, *ArabGenesB]))
    ArabGenes.sort()
    # Read in only the Overlapping Genes
    # Get a list of all genes in ATTED
    atted = pd.read_csv('../../project_data/Ath-mB.v17-08.G20819-S16033.rma_combat.mrgeo.d/814630', sep = '\t', header = None)
    atted = atted.sort_values(by = 0)
    a1 = np.array(atted[0])
    attedGenes = list(a1)
    # Reading in of Overlapping Genes
    attedpath = 'C:\\Users\\ysman\\OneDrive\\Desktop\\project_data\\Ath-mB.v17-08.G20819-S16033.rma_combat.mrgeo.d\\'
    overlapGenes = []
    for i in range (len(wholeGenes)):
        if os.path.exists(attedpath+'{}'.format(wholeGenes[i])):
            overlapGenes.append(wholeGenes[i])
    DF = {0:attedGenes}
    for x in overlapGenes:
        tempAtted = pd.read_csv('../../project_data/Ath-mB.v17-08.G20819-S16033.rma_combat.mrgeo.d/{}'.format(x), sep = '\t', header= None)
        tempAtted = tempAtted.sort_values(by = 0)
        templist = list(tempAtted[1])
        DF.update({x:templist})
    attedData = pd.DataFrame(DF,dtype='float64')
    attedData = attedData.astype({0:'int'})
    attedData = attedData.set_index(0)
    pickle.dump(attedData, open(attedDataName, 'wb'))
    # Add ATTED MR Values to BioGRID
    genesGRID = simplebGRID.loc[:]
    genesGRID['MR']=np.zeros(len(genesGRID))
    # for gA,gB in zip(toyGRID.iloc[:,0],toyGRID.iloc[:,1]): is a way to iterate through 2 cols at once
    for i in range(genesGRID.shape[0]):
        gA = genesGRID.loc[i,'Entrez Gene Interactor A']
        gB = genesGRID.loc[i,'Entrez Gene Interactor B']
        if (gA in attedData.columns) and (gB in attedData.columns): # checks if both genes are part of the overlapping set
            mr = attedData.loc[gA,gB]
            genesGRID.loc[i,'MR'] = mr
        else:
            genesGRID.loc[i,'MR'] = np.nan
    pickle.dump(genesGRID, open(bioGRIDfileName, 'wb'))
    boolVal = False
else:
    # Load data using pickle
    attedData = pickle.load(open(attedDataName, 'rb'))
    genesGRID = pickle.load(open(bioGRIDfileName, 'rb'))
# Initializes Networkx Graph
genesG = nx.Graph()
# Adds edges for each interaction
for i in range(genesGRID.shape[0]):
    genesG.add_edge(genesGRID['Entrez Gene Interactor A'].loc[i], genesGRID['Entrez Gene Interactor B'].loc[i], weight = genesGRID['MR'].loc[i])

Wall time: 11.3 s


In [49]:
def nonConvertible(orig,trans): # Both are lists
    nc = []
    for i in range(len(orig)):
        if orig[i] not in trans:
            nc.append(orig[i])
        else:
            continue
    return nc

In [50]:
# Read In Data 
ABC_trans = pd.read_csv('../data/convABC_Genes.txt', sep = '\t')
ABC_trans = ABC_trans.rename(columns = {'From':'TAIR_ID','To':'ENTREZ_ID'})
ABC_orig = pd.read_excel('../data/ABC_Genes.xls', sheet_name = 'Sheet2')
abcAT = sorted(set(list(ABC_trans['TAIR_ID'])))
abcEntrez = sorted(set(list(ABC_trans['ENTREZ_ID'])))
abcAT_orig = sorted(set(list(ABC_orig['TAIR_ID'])))
abc_NC  = nonConvertible(abcAT_orig,abcAT)
#print('The non-convertable ABC TAIR-IDs are: {}.'.format(', '.join(abc_NC)))
print(f'The non-convertable ABC TAIR IDs are: {abc_NC}')

LTP_trans = pd.read_csv('../data/convLTP_Genes.txt', sep = '\t')
LTP_trans = LTP_trans.rename(columns = {'From':'TAIR_ID','To':'ENTREZ_ID'})
LTP_orig = pd.read_excel('../data/LTP_Genes.xlsx', sheet_name = 'Sheet1')
ltpAT = sorted(set(list(LTP_trans['TAIR_ID'])))
ltpEntrez = sorted(set(list(LTP_trans['ENTREZ_ID'])))
ltpAT_orig = sorted(set(list(LTP_orig['TAIR_ID'])))
ltp_NC  = nonConvertible(ltpAT_orig,ltpAT)
#print('The non-convertable LTP TAIR-IDs are: {}.'.format(', '.join(ltp_NC)))
print(f'The non-convertable LTP TAIR IDs are: {ltp_NC}')

The non-convertable ABC TAIR IDs are: ['ATMG00110', 'ATMG00900']
The non-convertable LTP TAIR IDs are: []


In [109]:
#https://docs.scipy.org/doc/numpy/reference/maskedarray.generic.html

def existing(lst, reference): #checks if items of lst in reference, returns only existing values
    bools = []
    for x in lst:
        if x in reference:
            bools.append(0)
        else:
            bools.append(1)
    maskedLst = ma.masked_array(lst, mask = bools)
    return maskedLst.compressed()

In [110]:
def ShortestDistances(graphData,Graph,ABC,LTP): #using function caused Kernel to crash due to memory
    graphGenesA = list(graphData['Entrez Gene Interactor A'].unique())
    graphGenesB = list(graphData['Entrez Gene Interactor B'].unique())
    graphGenes = list(set([*graphGenesA, *graphGenesB]))
    abc = existing(ABC, graphGenes)
    ltp = existing(LTP,graphGenes)
    DF={0:ltp}
    for x in abc:
        valList = []
        for y in ltp:
            if (x in graphGenes) and (y in graphGenes): # checks if both genes are part of the overlapping set
                val = nx.astar_path_length(Graph, x,y)
            else:
                val = np.nan
            valList.append(val)
        DF.update({x:valList})
    Data = pd.DataFrame(DF,dtype='float64')
    Data = Data.astype({0:'int'})
    Data = Data.set_index(0)
    return Data

In [111]:
%%time
abc_ltp_shortDist = ShortestDistances(genesGRID,genesG,abcEntrez,ltpEntrez)

KeyboardInterrupt: 

In [112]:
%%time

graphGenesA = list(genesGRID['Entrez Gene Interactor A'].unique())
graphGenesB = list(genesGRID['Entrez Gene Interactor B'].unique())
graphGenes = list(set([*graphGenesA, *graphGenesB]))
abc = existing(abcEntrez,graphGenes)
ltp = existing(ltpEntrez,graphGenes)
DF={0:ltp}
for x in abc[:1]:
    valList = []
    for y in ltp:
        val = nx.astar_path_length(genesG, x,y)
        valList.append(val)
    DF.update({x:valList})
Data = pd.DataFrame(DF,dtype='float64')
Data = Data.astype({0:'int'})
Data = Data.set_index(0)

KeyboardInterrupt: 

In [114]:
abc

array([817232, 817351, 818031, 818265, 818312, 818520, 818533, 818768,
       819314, 819392, 820236, 820497, 820512, 820568, 820881, 821576,
       821661, 821712, 822149, 822465, 822467, 822468, 822519, 823932,
       824396, 824519, 824619, 824675, 824679, 825444, 825814, 827186,
       827187, 827661, 828702, 828850, 828981, 829483, 830144, 830269,
       830541, 831202, 831709, 832061, 833896, 834434, 835363, 835939,
       836195, 836200, 836292, 836296, 836607, 838087, 838363, 838568,
       839277, 839282, 839353, 839378, 839687, 839920, 840064, 841571,
       841575, 841761, 841876, 842281, 842633, 842763, 842852, 843013,
       843122, 843398, 843474])

In [107]:
ltp,bools = existing(ltpEntrez,graphGenes)
print(ltp)

[ 818435  818436  820024  821829  821930  824323  825317  827138  827182
  831705  836050  836051  837046 5008021]


In [115]:
nx.astar_path_length(genesG,817232,818435)

KeyboardInterrupt: 