In [1]:
import csv
import json
import os
import os.path
import sys
from urllib.parse import quote_plus
from urllib.request import urlopen

import requests

In [2]:
def get_prot_name(filename):
    """Return the unique protein names found in a BLASTX text report.

    Every line starting with ``>`` is treated as a hit description of the
    form ``<accession> <protein name> [<organism>]``. The leading accession
    token and the trailing `` [organism]`` part are removed and the remaining
    protein name is collected.

    Parameters
    ----------
    filename : str
        Path of the BLASTX output text file.

    Returns
    -------
    list of str
        Protein names in order of first appearance, without duplicates.
        An empty list is returned (after printing a message) when the file
        does not start with ``BLASTX``, so callers can treat it the same way
        as a report without hits.
    """
    with open(filename, 'r') as blast_file:
        first_line = blast_file.readline()

        # Sanity check that this is a BLASTX output text file
        if not first_line.startswith("BLASTX"):
            print("Not a BLASTX file: %r" % filename)
            return []

        prot_names = []
        for line in blast_file:
            if not line.startswith(">"):
                continue

            description = line[1:].strip('\n ')

            # Cut the trailing " [organism]" part, but only when it is
            # present: slicing with find()'s -1 would drop the last character.
            organism_start = description.find(' [')
            if organism_start != -1:
                description = description[:organism_start]

            # Everything after the first space is the protein name; the first
            # token is the accession.
            _accession, _sep, prot_name = description.partition(' ')

            # A description consisting of an accession only carries no name.
            if prot_name:
                prot_names.append(prot_name)

    # dict.fromkeys removes duplicates while keeping first-seen order
    return list(dict.fromkeys(prot_names))


def get_full_annotation(trinityId, protName, degGenes):
    """Annotate one transcript with UniProt gene, entry and GO information.

    The protein name is searched on UniProt. The first hit whose line contains
    ``_ARATH`` is preferred; otherwise the first data row of the result table
    is used. The flat-text record of that entry is then parsed for the gene
    name (``GN`` lines) and the GO cross-references (``DR   GO`` lines).

    Parameters
    ----------
    trinityId : str
        Transcript identifier used as key of the returned row.
    protName : str
        Protein name taken from the BLASTX hit description.
    degGenes : dict
        Annotation table. It is only modified when ``protName`` contains one
        of the "unknown" keywords.

    Returns
    -------
    dict or None
        * ``{trinityId: [geneId, protName, protId, CC, MF, BP]}`` for an
          annotated protein, the three GO fields being ``', '``-joined term
          names.
        * ``None`` when the UniProt search yields no usable hit.
        * For a hypothetical/uncharacterized/unknown/unnamed protein the row
          ``{trinityId: ['Unknown', protName]}`` is stored in ``degGenes`` and
          ``degGenes`` itself is returned (kept for existing callers).

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when a request fails.
    """
    unknowList = ['hypothetical', 'uncharacterized', 'unknown', 'unnamed']
    if any(term in protName.lower() for term in unknowList):
        tabRow = {trinityId: ['Unknown', protName]}  # degFC
        print(tabRow)
        degGenes.update(tabRow)
        return degGenes

    # Percent-encode the name so characters such as '/', '&' or '#' cannot
    # corrupt the query string (spaces still become '+').
    protUni = quote_plus(protName)

    # First get the Uniprot data
    # NOTE(review): this is the legacy UniProt URL scheme - confirm it is
    # still served before relying on this function.
    link = 'https://www.uniprot.org/uniprot/?query=%22' + protUni + '%22,&columns=id,entry%20name&format=tab&sort=score'
    with urlopen(link) as response:
        hits = [raw.decode("utf-8") for raw in response.readlines()]

    if not hits:
        return None

    # Prefer the first entry whose line mentions _ARATH
    protId = ''
    for hit in hits:
        fields = hit.split()
        if '_ARATH' in hit and len(fields) > 1:
            protId = fields[1]
            break

    if protId == '':
        # Fall back to the first data row; hits[0] is the column header.
        # A header-only answer means the search found nothing.
        fallback = hits[1].split() if len(hits) > 1 else []
        if len(fallback) < 2:
            return None
        protId = fallback[1]

    urllink = 'https://www.uniprot.org/uniprot/' + protId + '.txt'
    with urlopen(urllink) as response:
        response.readline()  # the first line of the record is not parsed
        urltxt = response.readlines()

    geneId = ''
    goCC = []
    goMF = []
    goBP = []
    gnPrefixes = ('GN   Name=', 'GN   ORFNames=', 'GN   OrderedLocusNames=')
    goBuckets = {'C:': goCC, 'F:': goMF, 'P:': goBP}

    for entry in urltxt:
        # Drop the newline and then the final character of the line
        line = entry.decode("utf-8").strip('\n')[:-1]

        # A later GN line overrides an earlier one, as before
        for prefix in gnPrefixes:
            if line.startswith(prefix):
                geneId = line[len(prefix):].split(';')[0].strip()
                break

        # Remove a trailing "{...}" annotation from the gene name
        if '{' in geneId:
            geneId = geneId[:geneId.find('{')].strip()

        if line.startswith('DR   GO'):
            # Fields between the GO id and the last field; the first of them
            # is "<aspect letter>:<term name>"
            goName = line[9:].split('; ')[1:-1]
            if goName:
                bucket = goBuckets.get(goName[0][:2])
                if bucket is not None:
                    # Slice off only the two-character aspect prefix so the
                    # term name itself is never altered
                    bucket.append(goName[0][2:])

    strGoCC = ', '.join(goCC)
    strGoMF = ', '.join(goMF)
    strGoBP = ', '.join(goBP)

    if geneId == '':
        geneId = 'N/A'

    tabRow = {trinityId: [geneId, protName, protId, strGoCC, strGoMF, strGoBP]}  # degFC
    print(tabRow)

    return tabRow


def get_ancestors(go_ids):
    """Look up the aspect and name of GO terms through the QuickGO service.

    Despite its name, the function does not walk the ontology: for each id it
    requests the term record and keeps its ``aspect`` and ``name`` fields.

    Parameters
    ----------
    go_ids : iterable of str
        GO identifiers; each one is appended to the QuickGO terms URL.

    Returns
    -------
    dict
        ``{go_id: [aspect, name]}`` for every id that returned a record.
        Ids for which the service returns no result are reported with a
        message and left out.
    """
    go_infos = {}

    for go_id in go_ids:

        the_url = 'https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/' + go_id
        response = requests.get(the_url)
        data = json.loads(response.text)

        results = data.get('results') or []
        if not results:
            print("No QuickGO record for %r" % go_id)
            continue

        # Only one term is requested per call, so the first result is the term
        record = results[0]
        go_infos[go_id] = [record['aspect'], record['name']]

    return go_infos


In [25]:
# Probe the UniProt lookup for a single transcript/protein pair.
# The lookup itself is delegated to get_full_annotation instead of keeping a
# pasted copy of its body here (the copy used `return` at module level and the
# undefined names trinityId / protName, so the cell could not run).
trinity = 'TRINITY_DN349_c1_g1'
prot = 'glutathione S-transferase'

# Show the query string and search URL that belong to this protein name
protUni = str(prot.replace(' ', '+'))
print(protUni)

link = 'https://www.uniprot.org/uniprot/?query=%22' + protUni + '%22,&columns=id,entry%20name&format=tab&sort=score'
print(link)

# A throwaway dict is passed as the accumulator so this probe never touches
# the real annotation table.
annotation = get_full_annotation(trinity, prot, {})
print(annotation)

glutathione+S-transferase
https://www.uniprot.org/uniprot/?query=%22glutathione+S-transferase%22,&columns=id,entry%20name&format=tab&sort=score


KeyboardInterrupt: 

In [23]:
# Annotate every BLASTX report in `path` and collect the rows in degGenes.
path = 'Blast/blastx_out/'
files = sorted(os.listdir(path))  # sorted so that re-runs visit files in the same order

unknowList = ['hypothetical', 'uncharacterized', 'unknown', 'unnamed']

# degGenes maps a Trinity id to its annotation row. Create it only when it
# does not exist yet: an interrupted run can then be resumed by re-running
# this cell, and the cell also works on a fresh kernel.
try:
    degGenes
except NameError:
    degGenes = {}

for file in files:
    filename = os.path.join(path, file)
    trinity = file.replace('_blastx', '')

    # Already annotated in a previous (possibly interrupted) run
    if trinity in degGenes:
        continue
    print(trinity)

    protList = get_prot_name(filename)

    if not protList:
        # No hit description at all; same 'Unknown' label that
        # get_full_annotation uses for uncharacterized proteins
        degGenes[trinity] = ['Unknown', 'Unknown protein']
        continue

    # Try informative protein names first; fall back to the full hit list
    # when every hit is hypothetical/uncharacterized/unknown/unnamed
    protKnownList = [prot for prot in protList
                     if not any(term in prot.lower() for term in unknowList)]

    annotation = None
    for prot in (protKnownList or protList):
        print(prot)
        annotation = get_full_annotation(trinity, prot, degGenes)
        if annotation is not None:
            break

    if annotation is not None:
        degGenes.update(annotation)

TRINITY_DN349_c1_g1
glutathione S-transferase


KeyboardInterrupt: 

In [15]:
# Display the annotation table collected so far: a dict keyed by Trinity id
# whose values are [gene id, protein name, UniProt entry, GO cellular
# component, GO molecular function, GO biological process], or the two-item
# list ['Unknown', protein name] for uncharacterized hits.
degGenes

{'TRINITY_DN12535_c0_g2': ['At2g25060',
  'early nodulin-like protein 1',
  'ENL1_ARATH',
  'anchored component of membrane, anchored component of plasma membrane, plasma membrane, plasmodesma, secretory vesicle, vacuole',
  'electron transfer activity',
  ''],
 'TRINITY_DN11878_c0_g1': ['CKAN_01401800',
  'UDP-glycosyltransferase 90A1-like',
  'A0A443P332_9MAGN',
  '',
  'transferase activity, transferring hexosyl groups',
  ''],
 'TRINITY_DN68706_c0_g1': ['STAS_14199',
  'acyl-CoA N-acyltransferases super family protein',
  'A0A5A7PYZ1_STRAF',
  '',
  'transferase activity, transferring acyl groups',
  ''],
 'TRINITY_DN13077_c0_g5': ['COBL4',
  'COBRA-like protein 4',
  'COBL4_ARATH',
  'anchored component of membrane, anchored component of plasma membrane, plasma membrane, vacuolar membrane',
  '',
  'cellulose microfibril organization, plant-type cell wall biogenesis, plant-type cell wall cellulose biosynthetic process, plant-type cell wall organization, plant-type secondary cell w

In [None]:
# Write the annotation table to a tab-separated file, one transcript per row.
fileannot = 'DEGannotations_0702.tab'

# newline='' lets the csv module control line endings itself; the `with`
# block closes the file, so no explicit close() is needed.
with open(fileannot, 'w', newline='') as annotFile:
    writer = csv.writer(annotFile, dialect="excel-tab")
    writer.writerow(['Trinity Id', 'Gene Id', 'Protein Name', 'Prot Id', 'Cellular Component', 'Molecular Function', 'Biological Process'])

    for gene, fields in degGenes.items():
        # Spread the annotation list over the columns announced in the header
        # instead of writing the repr of the whole list into a single cell.
        # Rows of uncharacterized proteins only have two fields and therefore
        # fill only the first three columns.
        writer.writerow([gene, *fields])

In [None]:
# List the BLASTX result files whose transcript has no entry in degGenes yet.
# (The original loop had no body and could not be executed.)
[file for file in files if file.replace('_blastx', '') not in degGenes]


In [None]:
# Gather the third annotation field (the UniProt entry) of every row that has
# more than two fields, and show the entries as one comma-separated string.
uniprotIdList = [fields[2] for fields in degGenes.values() if len(fields) > 2]
", ".join(uniprotIdList).strip()

In [7]:
# Count how often each molecular-function label occurs across the rows that
# have more than two fields.
# NOTE(review): the labels come from splitting the ', '-joined field on ',',
# so a GO term name that itself contains a comma is counted as several labels
# (see 'transferase activity' / 'transferring hexosyl groups' in the output).
mol_function = {}

for fields in degGenes.values():
    if len(fields) <= 2:
        continue

    for function in fields[4].split(','):
        # Strip before the lookup: the key that is tested must be the key
        # that is stored, otherwise labels following a comma are reset to 1
        # on every occurrence instead of being incremented.
        function = function.strip()
        if function:
            mol_function[function] = mol_function.get(function, 0) + 1

# Show the counts, most frequent first
{k: v for k, v in sorted(mol_function.items(), key=lambda item: item[1], reverse=True)}

{'transferase activity': 2,
 'electron transfer activity': 1,
 'transferring hexosyl groups': 1,
 'transferring acyl groups': 1,
 'mRNA binding': 1}

In [None]:
# Print the first annotation field (the gene id) of every row that has more
# than two fields; two-field rows are skipped.
for fields in degGenes.values():
    if len(fields) > 2:
        print(fields[0])

In [10]:
# Load a previously written annotation table: the first line is kept as the
# raw header text, every following tab-separated row becomes
# d[first column] = [remaining columns].
fileannot = 'DEGannotations_0502.tab'

d = {}
with open(fileannot, 'r') as annotFile:
    header = annotFile.readline()
    reader = csv.reader(annotFile, delimiter='\t')
    d.update((row[0], row[1:]) for row in reader)

d

{'At2g25060': ['early nodulin-like protein 1',
  'ENL1_ARATH',
  'anchored component of membrane, anchored component of plasma membrane, plasma membrane, plasmodesma, secretory vesicle, vacuole',
  'electron transfer activity',
  ''],
 'CKAN_01401800': ['UDP-glycosyltransferase 90A1-like',
  'A0A443P332_9MAGN',
  '',
  'transferase activity, transferring hexosyl groups',
  ''],
 'STAS_14199': ['acyl-CoA N-acyltransferases super family protein',
  'A0A5A7PYZ1_STRAF',
  '',
  'transferase activity, transferring acyl groups',
  ''],
 'COBL4': ['COBRA-like protein 4',
  'COBL4_ARATH',
  'anchored component of membrane, anchored component of plasma membrane, plasma membrane, vacuolar membrane',
  '',
  'cellulose microfibril organization, plant-type cell wall biogenesis, plant-type cell wall cellulose biosynthetic process, plant-type cell wall organization, plant-type secondary cell wall biogenesis'],
 'LOC103697761': ['ras GTPase-activating protein-binding protein 1-like',
  'A0A2H3X3V9_PH