### Helper functions: 
These are the necessary helper functions to run the three metadata processing files. This file does not actually execute any commands but defines the necessary functions and classes to be executed in the pipeline. 

In [3]:
# Takes a filepath to a TSV file of metadata pulled from NCBI with the datasets/dataformat command-line tools. 
# Reads it with pandas, renames the columns to make them easier to work with, and sets the index to the accession number. 
def processNcbiMetadata(filepath):
    """Load an NCBI metadata table and index it by accession.

    The file at `filepath` is read with `pd.read_table`, its column names
    are cleaned up by `renameColumns`, and the resulting 'Accession' column
    is moved into the index (it is dropped from the columns).

    Returns the re-indexed DataFrame.
    """
    renamed = renameColumns(pd.read_table(filepath))
    return renamed.set_index('Accession', drop = True)

In [4]:
# Prune the metadata to keep only one assembly per species, selecting for the earliest publication date. 
def metadataEarliest(metadata):
    """Keep one assembly per organism: the one with the earliest first publication.

    Rows are sorted by 'First_Publication_Date' (ascending) and only the
    first row for each 'Organism_Name' is kept.

    The result is indexed by accession. `metadata` may carry the accession
    either as an 'Accession' column or as an index already named
    'Accession' (the shape produced by processNcbiMetadata); previously the
    second form raised KeyError. If neither is present, KeyError is still
    raised by `set_index`.

    Returns a new DataFrame; `metadata` itself is not modified.
    """
    earliest = (metadata.sort_values('First_Publication_Date')
                        .drop_duplicates('Organism_Name', keep = 'first'))
    # Only skip set_index when the frame is *already* indexed by accession.
    already_indexed = ('Accession' not in earliest.columns
                       and earliest.index.name == 'Accession')
    if not already_indexed:
        earliest = earliest.set_index('Accession', drop = True)
    return earliest

In [5]:
# Prune the metadata to keep only one assembly per species, selecting for the most recent publication date. 
def metadataNewest(metadata, sort_by = 'Release_Date'):
    """Keep one assembly per organism: the one with the most recent date.

    Rows are sorted by `sort_by` in *descending* order and only the first
    row for each 'Organism_Name' is kept, so the newest assembly survives.
    (The previous implementation sorted ascending and therefore kept the
    oldest one.) Missing dates sort last, so a row without a date is only
    kept when the organism has no dated row at all.

    The result is indexed by accession. `metadata` may carry the accession
    either as an 'Accession' column or as an index already named
    'Accession'; if neither is present, KeyError is raised by `set_index`.

    Returns a new DataFrame; `metadata` itself is not modified.
    """
    newest = (metadata.sort_values(sort_by, ascending = False)
                      .drop_duplicates('Organism_Name', keep = 'first'))
    # Only skip set_index when the frame is *already* indexed by accession.
    already_indexed = ('Accession' not in newest.columns
                       and newest.index.name == 'Accession')
    if not already_indexed:
        newest = newest.set_index('Accession', drop = True)
    return newest

In [6]:
# Rename the columns to make it easier to work with. 
# Remove the substrings "Assembly " and "Stats " from the column for more concise descriptors. 
# Replace space with underscore such that columns can be referenced using Python's dot syntax. 
def renameColumns(dataframe):
    """Return a copy of `dataframe` with shortened, underscore-separated column names.

    For every column name the substrings 'Assembly ' and 'Stats ' are
    removed (in that order) and any remaining spaces become underscores,
    so the columns can be reached with attribute access.
    """
    def tidy(label):
        # Strip the verbose prefixes first, then make the name attribute-friendly.
        for fragment in ('Assembly ', 'Stats '):
            label = label.replace(fragment, '')
        return label.replace(' ', '_')

    return dataframe.rename(columns = {label: tidy(label) for label in dataframe.columns})

In [7]:
# Create a new column based on the "Sequencing_Tech" column to simplify the information into sequencing type (long or short). 
def readType(metadata):
    """Add a 'Sequencing_Type' column derived from 'Sequencing_Tech'.

    'Sequencing_Tech' is converted to strings in place. Each row is then
    labelled 'Short read' if its lower-cased technology text contains any
    short-read keyword, and 'Long read' if it contains any long-read
    keyword. The long-read label is applied last, so it wins when both
    kinds are mentioned. Rows matching neither keep
    'No information provided'.

    `metadata` is modified in place and also returned.
    """
    short_keywords = ('illumina', 'hi-c', 'hic', 'iontorrent', 'sanger', 'hiseq', '10x', '454')
    long_keywords = ('pacbio', 'nanopore')

    metadata['Sequencing_Tech'] = metadata['Sequencing_Tech'].astype(str)
    metadata['Sequencing_Type'] = 'No information provided'

    def mentions(keywords):
        # One boolean per row: does the technology text contain any keyword?
        return [any(keyword in tech for keyword in keywords)
                for tech in metadata['Sequencing_Tech'].str.lower()]

    metadata.loc[mentions(short_keywords), 'Sequencing_Type'] = 'Short read'
    metadata.loc[mentions(long_keywords), 'Sequencing_Type'] = 'Long read'
    return metadata

In [8]:
# Set the datatypes for columns of a pandas dataframe. 
def prepareMetadata(metadata):
    """Convert every column whose name contains 'Date' to pandas datetimes.

    The conversion uses `pd.to_datetime` and is written back into
    `metadata`, which is modified in place and also returned.
    """
    # Snapshot the column names before assigning back into the frame.
    for name in filter(lambda label: 'Date' in label, list(metadata.columns)):
        metadata[name] = pd.to_datetime(metadata[name])
    return metadata

In [None]:
# Get the clade of an NCBI record based on the clade name.
def get_clade(record, clade):
    """Return the scientific name of the first lineage entry that mentions `clade`.

    `record['LineageEx']` is scanned in order; an entry matches when
    `clade` equals any of its values. The matching entry's
    'ScientificName' is returned, or 'N/A' when no entry matches.
    """
    matching_names = (entry['ScientificName']
                      for entry in record['LineageEx']
                      if clade in entry.values())
    return next(matching_names, 'N/A')

In [9]:
# Taking in a Pandas DataFrame of NCBI metadata, crawl the NCBI website to find any extra information that may not be as easy to find using command line tools. 

class Metadata: 
    """Scrape NCBI web pages for extra metadata about a table of assemblies.

    Attributes set in __init__:
        table      -- the DataFrame passed in; several methods add columns to it in place.
        accessions -- the index of `table`; each label is used as an assembly
                      accession when building NCBI URLs.
        data       -- {accession: list of text fragments from the assembly page},
                      filled by threadCreep / addAccession.
        tax_data   -- {accession: list of taxonomy-page fragments}, filled by
                      taxCreep / getPhyla.

    firstPub additionally sets `first_pub`.
    """

    def __init__(self, my_data): 
        self.table = my_data
        self.accessions = self.table.index
        self.data = {}
        self.tax_data = {}

    def _runThreaded(self, worker, accessions, total):
        """Run worker(accession) for every accession on a thread pool.

        Prints a 'Complete: done/total' progress line as tasks finish and
        returns the futures in the same order as `accessions`. All tasks
        have finished by the time this returns. Exceptions raised by a
        worker stay stored on its future; they are not re-raised here.
        """
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(worker, accession) for accession in accessions]
            for done, _ in enumerate(concurrent.futures.as_completed(futures), start = 1):
                print(f'\rComplete: {done}/{total}', end = '\r')
                sys.stdout.flush()
        return futures

    ## Find the ncbi page for each accession in the given table and pull all the information into self.data
    def threadCreep(self): 
        """Fetch the assembly page of every accession into self.data (best effort).

        Worker exceptions are not inspected, so an accession whose download
        failed is simply absent from self.data afterwards.
        """
        self._runThreaded(self.addAccession, self.accessions, len(self.table))

    def taxCreep(self): 
        """Fill the 'Phylum' column of self.table from each accession's taxonomy page.

        The column is initialised to 'N/A'. getPhyla reads self.data, so
        threadCreep must have been run first. Worker exceptions are not
        inspected; rows whose lookup failed keep 'N/A'.
        """
        self.table['Phylum'] = 'N/A'
        self._runThreaded(self.getPhyla, self.accessions, len(self.table))

    def firstPub(self, date_column = 'Release_Date'): 
        """Add 'First_Publication_Date' and 'Original_Contig_N50' columns to self.table.

        For every accession whose version is not 1, the page of version 1
        of the same assembly is fetched and its 'Date' and 'Contig N50'
        entries are recorded. All other rows keep the value of
        `date_column` / 'Contig_N50'. The fetched pages (non-empty ones
        only) are stored in self.first_pub.

        Unlike threadCreep, a worker exception is re-raised here when its
        result is collected.
        """
        # Accessions whose version is something other than 1 may have an earlier publication.
        multi_pub_candidates = [accession for accession in self.accessions
                                if self._splitVersion(accession)[1] != '1']
        futures = self._runThreaded(self.getFirstPub, multi_pub_candidates,
                                    len(multi_pub_candidates))
        first_pub = {accession : future.result()
                     for accession, future in zip(multi_pub_candidates, futures)}

        # Cut out any entries where the first pub accession just results in an empty list
        first_pub = {accession : accession_list
                     for accession, accession_list in first_pub.items() if accession_list}

        self._recordFirstPub(first_pub, date_column)
        self.first_pub = first_pub

    def _recordFirstPub(self, first_pub, date_column):
        """Write first-publication values from `first_pub` into self.table.

        `first_pub` maps accession -> fragment list of the version-1 page.
        Values are assigned by index label. (The previous code used
        Series.replace with an accession-keyed dict, which matches cell
        *values* rather than index labels and so never changed anything.)
        Rows whose page lacks a usable value keep the current
        `date_column` / 'Contig_N50' value.
        """
        first_dates = self.table[date_column].copy()
        original_n50 = self.table['Contig_N50'].copy()
        for accession, accession_list in first_pub.items():
            date = self.lookFor('Date', accession_list)
            if date != 'N/A':
                first_dates.loc[accession] = date.replace('/', '-')
            n50 = self.lookFor('Contig N50', accession_list).replace(',', '')
            if n50.isdigit():
                # Store a number, not text, so the column keeps a numeric dtype.
                original_n50.loc[accession] = int(n50)
        self.table['First_Publication_Date'] = first_dates
        self.table['Original_Contig_N50'] = original_n50

    def getPhyla(self, accession): 
        """Look up the phylum of `accession` and store it in self.table['Phylum'].

        The URL is extracted from self.data[accession][1]
        (NOTE(review): presumably the organism hyperlink on the assembly
        page - confirm). All 'TITLE="..."' tokens of the taxonomy page are
        kept in self.tax_data[accession].
        """
        url = self.getTaxURL(self.data[accession][1]) 
        page = requests.get(url).content.decode() 
        tax_list = self.webFilter(page) 
        tax_list = [word 
                    for line in tax_list 
                    for word in line.split()]
        tax_list = [line for line in tax_list if 'TITLE="' in line]
        self.tax_data[accession] = tax_list
        phylum_line = [line for line in tax_list if 'TITLE="phylum"' in line] 
        if len(phylum_line)==1: 
            phylum_line = phylum_line[0]
            start_id = phylum_line.find('>') + 1
            phylum = phylum_line[start_id:] 
        else: 
            # NOTE(review): lower-case "n/a" here differs from the 'N/A'
            # default set by taxCreep; left unchanged in case callers
            # rely on telling the two apart.
            phylum = "n/a" 
        self.table.loc[accession, 'Phylum'] = phylum

    @staticmethod
    def _splitVersion(accession):
        """Split an accession into (stem, version).

        'GCA_000001405.29' -> ('GCA_000001405.', '29'). When there is no
        numeric '.<version>' suffix, the last character is treated as the
        version, which is what the code did for every accession before.
        """
        stem, dot, version = accession.rpartition('.')
        if dot and version.isdigit():
            return stem + dot, version
        return accession[:-1], accession[-1:]

    def getFirstPub(self, accession): 
        """Return the page fragments for version 1 of the given accession's assembly."""
        stem, _ = self._splitVersion(accession)
        return self.getAccList(stem + '1')

    def lookFor(self, request, searchList): 
        """Return the element that follows `request` in `searchList`, with any leading link removed.

        Returns 'N/A' when `request` is absent or is the last element
        (the latter used to raise IndexError).
        """
        try:
            index = searchList.index(request) + 1
        except ValueError:
            return 'N/A'
        if index >= len(searchList):
            return 'N/A'
        return self.removeLink(searchList[index])

    ### Remove hyperlink from HTML format 
    def removeLink(self, string): 
        """Drop everything up to and including the first '>' if `string` contains '<a'."""
        if '<a' not in string:
            return string
        return string[string.find('>') + 1:]

    def getTaxURL(self, string): 
        """Return the NCBI URL built from the first double-quoted substring of `string`."""
        urlStart = string.find('"') + 1
        string = string[urlStart:]
        urlEnd = string.find('"')
        string = string[:urlEnd]
        url = "https://www.ncbi.nlm.nih.gov" + string
        return url

    def addAccession(self, accession): 
        """Fetch the assembly page for `accession` and store its fragments in self.data."""
        self.data[accession] = self.getAccList(accession)

    def getAccList(self, accession): 
        """Download the NCBI assembly page for `accession` and return its text fragments."""
        url = 'https://www.ncbi.nlm.nih.gov/assembly/' + accession
        page = requests.get(url).content.decode()
        return self.ncbiFilter(page)

    def ncbiFilter(self, content): 
        """Cut an assembly page down to its metadata section and split it into fragments.

        The section starts at 'Organism name' and ends 5 characters before
        'id="messagearea_bottom">'. Returns [] when the start marker is
        missing; when only the end marker is missing, the rest of the page
        is used (previously the last 6 characters were chopped off).
        """
        startID = content.find('Organism name') 
        if startID == -1:
            return []
        content = content[startID:]
        endID = content.find('id="messagearea_bottom">')
        if endID != -1:
            content = content[:endID - 5]
        return self.webFilter(content, [': '])

    ### Filter out HTML formatting to make the information more human friendly
    def webFilter(self, content, extras = None): 
        """Replace known HTML tags (plus `extras`) with newlines and return the non-empty lines.

        Every fragment is returned, including text after the last
        delimiter, and there is no cap on the number of lines (the old
        manual loop dropped the trailing fragment and stopped after 10000
        iterations).
        """
        htmlStuff = ['</div>', '</span>', '</a>', '</dt>', '<dd>', '</dd>', '<dt>', '</h1>', '</li>', '<li>', 
                     '<td>', '</td>', '<tl>', '<tr>', '</tr>', '<td class="align_r">','<span>', '<tbody>', '<em>', 
                     '<table>', '</tbody>', '</table>', '</dl>', 'ALT=', '</em>'] + list(extras or [])
        for littleString in htmlStuff: 
            content = content.replace(littleString, '\n')
        return [blob for blob in content.split('\n') if blob]