In [9]:
# Generate regular expressions
def gen_regexs(sequence, permute=False):
    """Translate a layman's motif description into regular expression(s).

    ``sequence`` alternates nucleotide strings and maximum gap lengths:

        [nucleotide sequence, max gap length, nucleotide sequence, max gap length, ...]

        e.g. ['TCTG', 4, 'CAGA']

    Every string is emitted literally (followed by '{1}') and every integer
    ``n`` becomes a run of 0 to ``n`` arbitrary nucleotides:

        e.g. 'TCTG{1}[ACGT]{0,4}CAGA{1}'

    Elements that are neither strings nor integers are ignored.

    Parameters
    ----------
    sequence : list
        Alternating nucleotide strings and integer gap lengths.
    permute : bool, optional
        If False (default), return only the exact regular expression.
        If True, return one regular expression per nucleotide letter in
        ``sequence``; in each of them that single letter is replaced by the
        wildcard '[ACGT]', so together they accept every single-substitution
        variant of the motif. Insertions and deletions are not generated.

    Returns
    -------
    list of str
        The generated regular expressions. With ``permute=True`` the list is
        empty when ``sequence`` contains no nucleotide letters.
    """
    nucs = frozenset('ACGT')
    wildcard = "[ACGT]"
    try:
        text_types = basestring  # Python 2: accept both str and unicode
    except NameError:
        text_types = str

    def is_gap(item):
        # bool is a subclass of int; the gap length must be a real integer.
        return isinstance(item, int) and not isinstance(item, bool)

    def build(parts):
        # Concatenate the regex fragment of every recognised element.
        fragments = []
        for item in parts:
            if isinstance(item, text_types):
                fragments.append(item + '{1}')
            elif is_gap(item):
                fragments.append(wildcard + '{0,' + str(item) + '}')
        return ''.join(fragments)

    if not permute:
        return [build(sequence)]

    parts = list(sequence)
    reglist = []
    for idx, item in enumerate(parts):
        if not isinstance(item, text_types):
            continue
        for pos, letter in enumerate(item):
            if letter not in nucs:
                continue
            # Substitute inside the segment *before* build() appends '{1}',
            # so a wildcard in last position yields '[ACGT]{1}' rather than
            # an invalid doubled quantifier.
            variant = item[:pos] + wildcard + item[pos + 1:]
            reglist.append(build(parts[:idx] + [variant] + parts[idx + 1:]))
    return reglist
                

In [50]:
# Test gen_regexs: build the regex list for a sample motif and check that the
# default (non-permuted) call yields exactly the expected expression.
sample = ['TCTG', 4, 'CAGA']
regs = gen_regexs(sample)
assert regs == ['TCTG{1}[ACGT]{0,4}CAGA{1}'], regs

In [55]:
import docx
import re

In [67]:
# Read the sequence text out of the Word document (all paragraphs joined).
doc = docx.Document('tgfb1.docx')
data = ''.join([p.text for p in doc.paragraphs])

# Build `split`: a list of (unmatched text, matched text) pairs covering the
# whole of `data` in order.
if regs:
    # Search for every regex at once. Each one is wrapped in a non-capturing
    # group so the alternation cannot bleed into its neighbours and so
    # re.split / re.findall keep returning whole matches only.
    pattern = '|'.join('(?:%s)' % reg for reg in regs)
    between = re.split(pattern, data)
    matched = re.findall(pattern, data)
else:
    between, matched = [data], []

# re.split returns one more piece than there are matches; pad the matches
# with '' so the text after the last match is not dropped by zip().
split = list(zip(between, matched + ['']))

In [68]:
# Write a new document in which every matched sequence is emphasised:
# unmatched text in 10 pt Courier New, matches in bold 14 pt Courier New.
scanned = docx.Document()
p = scanned.add_paragraph('')
for bad, good in split:
    # (text, point size, is this a tagged match?) for the two runs of a pair
    for text, points, tagged in ((bad, 10, False), (good, 14, True)):
        run = p.add_run(text)
        if tagged:
            run.bold = True
        run.font.name = "Courier New"
        run.font.size = docx.shared.Pt(points)
scanned.save('test.docx')

In [16]:
import pyensembl

In [17]:
# Create an Ensembl release handle with pyensembl's default arguments.
# NOTE(review): this rebinds `data`, which the document cell uses for the
# joined paragraph text; re-running that cell afterwards changes what `data`
# refers to. The release is not pinned here -- the recorded output of the
# last cell shows release=79 / GRCh38; confirm whether that should be fixed.
data = pyensembl.EnsemblRelease()

In [20]:
# Look up the gene record(s) named 'TGFB1'. The result is indexed with [0]
# by the following cell, so it is expected to be a non-empty sequence.
g = data.genes_by_name('TGFB1')

In [29]:
# Dump the attributes of the first matching gene record. Written as a call so
# the cell runs on both Python 2 and Python 3 (identical output on Python 2).
print(g[0].__dict__)

{'end': 41353911, 'name': u'TGFB1', 'db': <pyensembl.database.Database object at 0x10f2ba8d0>, 'start': 41301587, 'biotype': u'protein_coding', 'ensembl': EnsemblRelease(release=79, species=homo_sapiens, genome=GRCh38), 'contig': u'19', 'id': 'ENSG00000105329', 'strand': u'-'}
