# SNARE-Protein Project

In [3]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS

## Do Any Known SNARE-Family Proteins Belong to Mycobacterium tuberculosis?

### 1. Get SNARE family protein IDs from InterPro

In [14]:
def downloadProteinID ():
    """Download the protein-ID list matched to InterPro entry IPR010989.

    Sends a single GET request to the InterPro export URL (restricted with
    ``taxonomy=2``; the output filename suggests this selects bacteria) and
    saves the raw response body to ``SNARE_proteins_bacteria.txt`` in the
    current working directory, overwriting any existing file.

    Returns:
        str: the name of the file that was written.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            that an error page is never stored as if it were a list of IDs.
    """
    url = "http://www.ebi.ac.uk/interpro/entry/IPR010989/proteins-matched?taxonomy=2&export=ids"
    out_name = "SNARE_proteins_bacteria.txt"
    response = requests.get(url, allow_redirects=True)
    # Fail loudly on 4xx/5xx instead of silently saving the error body.
    response.raise_for_status()
    # The context manager guarantees the handle is flushed and closed.
    with open(out_name, "wb") as out_file:
        out_file.write(response.content)
    return out_name

# Download the ID list from InterPro. Note that proteinIDs holds the NAME of
# the saved file, not the IDs themselves.
proteinIDs = downloadProteinID ()

### 2. Get organism names from UniProt

In [15]:
def searchUniport_stable (filename):
    """Tabulate the organisms of a list of protein IDs via the UniProt query API.

    Reads one protein ID per line from ``filename``, issues one HTTP request
    per ID asking for the ``organism`` column in tab format, and counts how
    often each organism name occurs. The counts are written to
    ``organism.txt`` in the current working directory, one
    ``"<count> <organism>"`` line per organism, most frequent first.

    This does the same job as ``searchUniport_fast`` but goes through the
    query API rather than scraping the entry page; it is the slower of the
    two (the original author reports up to ~10 minutes).

    Args:
        filename (str): path of a text file with one protein ID per line.
            Surrounding whitespace is stripped and blank lines are ignored.

    Returns:
        None. The result is the ``organism.txt`` file.

    Notes:
        IDs for which the request fails or no organism line comes back are
        skipped; the number of skipped IDs is printed at the end.
        NOTE(review): this is the legacy ``www.uniprot.org/uniprot/?query=``
        URL scheme -- confirm it is still served, or migrate to the current
        UniProt REST API.
    """
    orgDic = {}
    skipped = 0
    with open(filename, "r") as id_file:
        for line in id_file:
            # Lines read from a file keep their trailing newline; it must not
            # end up inside the URL.
            protein = line.strip()
            if not protein:
                continue
            url = "https://www.uniprot.org/uniprot/?query=id:" + protein + "&columns=organism&format=tab"
            response = requests.get(url)
            rows = response.text.splitlines() if response.ok else []
            # The organism is taken from the second line of the reply (the
            # first is presumably the column header).
            if len(rows) < 2 or not rows[1].strip():
                skipped += 1
                continue
            organism = rows[1].strip()
            orgDic[organism] = orgDic.get(organism, 0) + 1
    with open("organism.txt", "w") as outFile:
        for org_freqs in sorted(orgDic, key=orgDic.get, reverse=True):
            outFile.write(str(orgDic[org_freqs]) + " " + org_freqs + "\n")
    if skipped:
        print("searchUniport_stable: skipped " + str(skipped) + " protein ID(s) with no organism in the response")

In [16]:
def searchUniport_fast (filename):
    """Tabulate the organisms of a list of protein IDs by scraping UniProt pages.

    Reads one protein ID per line from ``filename``, fetches the UniProt
    entry page of each ID, and takes the organism name from the third
    ``" - "``-separated field of the page ``<title>``. The number of
    occurrences of each organism is written to ``organism.txt`` in the
    current working directory, one ``"<count> <organism>"`` line per
    organism, most frequent first.

    This does the same job as ``searchUniport_stable`` but parses the HTML
    page with BeautifulSoup instead of using the query API. It is faster,
    but it depends on the layout of the page title and is therefore less
    reliable.

    Args:
        filename (str): path of a text file with one protein ID per line.
            Surrounding whitespace is stripped and blank lines are ignored.

    Returns:
        None. The result is the ``organism.txt`` file.

    Notes:
        IDs whose page has no usable title are skipped instead of raising;
        the number of skipped IDs is printed at the end.
    """
    orgDic = {}
    skipped = 0
    with open(filename, "r") as id_file:
        for line in id_file:
            # Lines read from a file keep their trailing newline; it must not
            # end up inside the URL.
            protein = line.strip()
            if not protein:
                continue
            text = requests.get('http://www.uniprot.org/uniprot/' + protein).text
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            soup = BS(text, "html.parser")
            head = soup.head
            title_tag = head.title if head is not None else None
            parts = title_tag.text.split(" - ") if title_tag is not None else []
            if len(parts) < 3 or not parts[2].strip():
                skipped += 1
                continue
            organism = parts[2].strip()
            orgDic[organism] = orgDic.get(organism, 0) + 1
    with open("organism.txt", "w") as outFile:
        for org_freqs in sorted(orgDic, key=orgDic.get, reverse=True):
            outFile.write(str(orgDic[org_freqs]) + " " + org_freqs + "\n")
    if skipped:
        print("searchUniport_fast: skipped " + str(skipped) + " protein ID(s) with no organism in the page title")

Get organism names from UniProt. This could take a while, depending on which function is chosen.

In [17]:
# Look up the organism of every ID in the downloaded file and write the
# frequency table to organism.txt (one web request per ID).
searchUniport_fast(proteinIDs)

### 3. Search the organism names to see whether they include Mycobacterium tuberculosis

In [18]:
def queryOrg (queryName, orgFile):
    """Report whether an organism name occurs in an organism list file.

    The match is a case-insensitive substring test applied to each line of
    the file, so ``queryName`` also matches longer names that contain it
    (for example a species name followed by a strain designation).

    Args:
        queryName (str): organism name to look for.
        orgFile (str): path of the text file to search, e.g. the
            ``organism.txt`` file written by the searchUniport functions.

    Returns:
        bool: True if at least one line contains ``queryName``, else False.
    """
    # Lower-case the query once rather than once per line.
    qLower = queryName.lower()
    # The context manager closes the file even when a match ends the scan early.
    with open(orgFile, "r") as f:
        return any(qLower in org.lower() for org in f)

In [20]:
# Check whether M. tuberculosis occurs in the organism list (organism.txt).
query, organism = "Mycobacterium tuberculosis", "organism.txt"
result = queryOrg(queryName=query, orgFile=organism)
print("Mycobacterium tuberculosis is in the name list: " + str(result))

Mycobacterium tuberculosis is in the name list: False


### 4. Conclusion

## Blast SNARE-like proteins against Mycobacterium tuberculosis

Import the list of (most of the) predicted inclusion membrane proteins ("Incs") of C. trachomatis from the paper *Expression and Localization of Predicted Inclusion Membrane Proteins in Chlamydia trachomatis*.

In [10]:
# Read the space-separated table of predicted Inc locus tags. index_col=False
# stops pandas from using the first column as the row index.
Incs_proteins = pd.read_csv("C.trachomatis_predict_Incs.txt", sep = " ", index_col = False)
# Preview the first five rows.
Incs_proteins.head()

Unnamed: 0,D/UW-3/CX,L2/434/Bu,A/HAR-13
0,CT005,CTL0260,CTA0006
1,CT006,CTL0261,CTA0007
2,CT036,CTL0291,CTA0038
3,CT058,CTL0314,CTA0062
4,CT079,CTL0335,CTA0084


Change the locus tags of the D/UW-3/CX strain to match the NCBI locus tag format, and store the changed tags in the file DUW-3CX_predict_Incs.txt, using the following Unix command:
```shell
cut -d " " -f 1 C.trachomatis_predict_Incs.txt | sed s/CT/CT_/g > DUW-3CX_predict_Incs.txt
```

In [22]:
# Read the reformatted D/UW-3/CX locus tags produced by the shell command
# above. index_col=False stops pandas from using the first column as the index.
DUW3CX_proteins = pd.read_csv("DUW-3CX_predict_Incs.txt", sep = " ", index_col = False)
# Preview the first five rows.
DUW3CX_proteins.head()

Unnamed: 0,D/UW-3/CX
0,CT_005
1,CT_006
2,CT_036
3,CT_058
4,CT_079
