# Motif finding

### Search against a set of sequences

In [None]:
import urllib.request

# This function allows us to compare two words of the same length to see if they
# are made up of the same letters in the same order

# The name of the function is "checkMatch".  It takes in a part of the original sequence
# (subsequence), the motif of interest (motif), and the length of the motif
def checkMatch(subsequence, motif, motifLength):
    """Report whether a subsequence fits the motif, symbol by symbol.

    The first motifLength symbols of the motif are visited left to right
    and compared with the symbol at the same position in the subsequence.
    An 'X' in the motif is a wildcard: it accepts whatever symbol the
    subsequence has there.

    Returns True when no position disagrees, and False as soon as one does.
    """
    # A position is fine if the motif holds a wildcard there or the two
    # symbols are equal; all() stops at the first position that is not fine
    return all(
        motif[index] == 'X' or motif[index] == subsequence[index]
        for index in range(motifLength)
    )

# This function allows us to search for a motif within a sequence by
# repeatedly extracting subsequences of the sequence and comparing
# each subsequence to the motif
# The name of the function is "analyzeSequence".  It takes a sequence
# (sequence) and the motif of interest (motif)
# NEW: Instead of printing that it found a match, it now returns True or False
def analyzeSequence(sequence, motif):
    """Report whether the motif occurs anywhere in the sequence.

    The sequence is walked one symbol at a time, left to right.  At each
    starting position a subsequence the same length as the motif is pulled
    out and compared to the motif with checkMatch.

    sequence: the string of symbols to search through
    motif:    the pattern to look for

    Returns True as soon as one matching subsequence is found, and False
    if the whole sequence is searched without a match (which is always the
    case when the motif is longer than the sequence).
    """
    # Figure out how long the sequence is, in number of symbols
    sequenceLength = len(sequence)
    # Figure out how long the motif is, in number of symbols.  This must be
    # the motif that was passed in, not the global motifOfInterest, so the
    # function gives the right answer for whatever motif the caller supplies
    motifLength = len(motif)

    # For each position in the sequence (except the far right end, where the
    # subsequences would be too short to hold the motif)
    for sequencePosition in range(sequenceLength - motifLength + 1):
        # Pull out a subsequence that is the same size as the motif
        subsequence = sequence[sequencePosition:sequencePosition + motifLength]
        # One match is enough to answer the question, so stop right away
        if checkMatch(subsequence, motif, motifLength):
            return True

    # Every starting position was tried and none of them matched
    return False

# This function downloads a FASTA-formatted sequence file off the WWW
# and breaks the data in that file apart into a collection of sequences,
# using the description lines (the lines that start with > in FASTA files)
# to tell where one sequence ends and the next one begins.
# The name of the function is fetchData.  It takes one optional input, the
# address of the file to download (url).  If it is left out, the same
# practice dataset as before is downloaded.
def fetchData(url='http://csweb.cs.wfu.edu/~turketwh/CSC385/Fall2018/practicalDataset.fasta'):
    """Download a FASTA file and return its sequences.

    url: address of the FASTA file to fetch (defaults to the practice dataset)

    Returns a dictionary that maps 0, 1, 2, ... (the order in which the
    sequences appear in the file) to each sequence, with the separate lines
    of a sequence merged into one long string.  The description lines are
    only used to separate the sequences; they are not returned.  A download
    with no content gives an empty dictionary.

    Any error raised while opening or reading the url, or while decoding
    the data as UTF-8, is passed on to the caller.
    """
    # This opens a connection to a website and fetches the specified file.
    # The "with" makes sure the connection is closed again, even on an error
    with urllib.request.urlopen(url) as response:
        text = response.read().decode('utf-8')

    # This creates an empty dictionary (hashtable) to hold the sequences
    sequences = {}
    # The lines that make up the sequence currently being read; they are
    # glued together in one step once the whole sequence has been seen
    pieces = []
    # This keeps track of how many complete sequences we have stored
    sequenceCount = 0
    # This remembers whether any description line has been seen yet
    seenDescription = False

    # This says for each line in the file
    for line in text.splitlines():
        # Blank lines carry no data, so skip over them
        if not line.strip():
            continue
        # If the line is a description line
        if line.startswith('>'):
            # If this is not the first description line,
            # I've just seen the data for a sequence, so save it and restart
            if seenDescription:
                sequences[sequenceCount] = "".join(pieces)
                sequenceCount = sequenceCount + 1
                pieces = []
            seenDescription = True
        else:
            # Keep the sequence line I just read.  Lines that show up before
            # the first description line end up in the first sequence
            pieces.append(line)

    # At the bottom of the file, store the last sequence (if there was one)
    if seenDescription or pieces:
        sequences[sequenceCount] = "".join(pieces)

    return sequences

############ CHANGE HERE ############
# The motif to look for (an X in the motif accepts any symbol)
motifOfInterest = "XXXX"
# Download the set of sequences that will be searched
sequenceList = fetchData()

# Tally how many of the sequences contain the motif at least once
countOfHits = 0
for key, sequenceOfInterest in sequenceList.items():
    # analyzeSequence answers True or False for the whole sequence, so each
    # sequence adds at most 1 to the tally
    if analyzeSequence(sequenceOfInterest, motifOfInterest):
        countOfHits += 1

# Report how many sequences were searched and how many contained the motif
print("In ", len(sequenceList), "sequences searched found motif",motifOfInterest,countOfHits,"times")