In [None]:
#NOTES:
#To get a list of PDB that contains histones from 'text.csv':
#cut -f1 text.tsv | uniq | awk '{print tolower($0)}' | sort

#text.tsv should be sorted by pdb and then uniprot name
#it MUST have 'NA' in uniprot and name blanks

In [None]:
#!/usr/bin/env python 3
import re

#PATH = "/net/pan1/interactomes/pipeline/Interactome/Workflow/Interfaces/"
PATH = "../data/Interfaces/"
CHAIN_FILE = "text.tsv"
PDB_LIST = "pdbList.txt"

In [23]:
#PARAMETERS:
#fils is a string with path to the file to be checked

#RESULTS:
#Returns 1 or 0 depending on file existence 


def file_check(file):
    
    try:
        open(file, "r")
        return 1
    
    except IOError:
        print("Error: " + file + " does not appear to exist.")
        return 0

In [24]:
#PARAMETERS: 
#pdbList is a text file with a header and one column PDB
#files is a list
#parameter is a string, either 'mapping' or 'interface' depending on desired results

#RESULTS:
#A list of absolute paths to either mapping files or interface files as it is stored on local NCBI machines


def get_files(pdbList, files, parameter):
    
    with open(pdbList, 'r') as pfh:
        pfh.readline()
        
        if(parameter == 'mapping'):
            
            for line in pfh:
                line = line.strip()
                folder = line[1] + line[2] 
                files.append(PATH + folder + '/' + line + '_chain_protein_mapping.tab')
                
        elif(parameter == 'interface'): 
            
            for line in pfh:
                line = line.strip()
                folder = line[1] + line[2]
                files.append(PATH + folder + '/' + line + '_atomic_contacts_5.0A.tab')

In [25]:
#PARAMETERS:
#cFile is tab-separated file with a header and 4 columns: pdb, chain, uniprot, name
#dictionary is nested with the innermost dict being dictionary['pdb'] = {}

#RESULTS:
#The format of the end-product dictionary is: {pdb : {AlexChain: myChain#UNIPROT#name}}
#Example: {1alq : {'G': 'E#P02302#Histone H3.3C'}}


def get_chain_dictionaries(cFile, dictionary):  
    with open(cFile, 'r') as cfh:
        cfh.readline() #skips header
        
        for cLine in cfh:
            fields = cLine.strip().split('\t')
            pdb = fields[0]
            folder = pdb[1] + pdb[2] #folder name in ". . . Interfaces/"        
            
            tempDict = {}
            
            mappingFile = PATH + folder + '/' + pdb + '_chain_protein_mapping.tab'
            
            try: #adds a pdb entry to the dict only if mapping file exists
                
                with open(mappingFile, 'r') as mfh:
                    mfh.readline() #skips header
                    
                    for mLine in mfh:
                        chainPair = mLine.split('\t', 2)
                        alexChain = chainPair[0]
                        myChain = chainPair[1]

                        tempDict[alexChain] = myChain
                
                dictionary[pdb] = {}
                pdbTemp = pdb
                histoneCount = 0 #is used to count number of histones in a structure
                
                while(pdbTemp == pdb): #parses lines that have the same pdb (must be sorted)
                    chain = fields[1]
                    uniprot = fields[2]
                    name = fields[3]

                    if(not(uniprot == 'NA' and name == 'NA')): #prevents from wasting time on nucleotide chains
                        histoneMatch = re.findall(r'histone h2a.z-specific chaperone chz1|histone h|histone-like|histone macro|histone.*peptide|like histone|h\d.*peptide|h3K4me0|$h\d^|archael histone|peptide.*histone', name, re.I)
                        histoneCount += len(histoneMatch) #adds the number of histones in chain
                        
                        try: #adds a chain entry to the dict only if there exists a corresponding chain in the mapping file
                            alexChain = list(tempDict.keys())[list(tempDict.values()).index(chain)]
                            dictionary[pdb][alexChain] = str(tempDict[alexChain]) + '#' + uniprot + '#' + name

                        except ValueError:
                            pass
                            #print("Error: " + ValueError + ", in " + pdb)
                            
                    cLine = cfh.readline()
                    
                    fields = cLine.strip().split('\t')
                    pdbTemp = fields[0]
                    
            except IOError:    
                #print("Error: " + mappingFile + " does not appear to exist.")

                pdbTemp = pdb
                
                while(pdbTemp == pdb): #skips lines with pdb that doesn't have a corresponding mapping file
                    cLine = cfh.readline()
                    fields = cLine.strip().split('\t', 1)
                    pdbTemp = fields[0]

In [26]:
def main():

    chainDictionary = {}
    chainDictionary['pdb'] = {}
    
    get_chain_dictionaries(CHAIN_FILE, chainDictionary)
    
    
    interfaceFiles = []
    get_files(PDB_LIST, interfaceFiles, 'interface')

In [27]:
if __name__ == "__main__":
    main()

{'pdb': {}}
