In [1]:
import pandas as pd
import numpy as np
import re

from pathlib import Path


In [2]:
# Root directory for every input and output file used in this notebook.
# resolve() turns the relative path into an absolute one (following
# symlinks), which is why the outputs below show full paths.
data_path = Path('../data').resolve()

# Extract lineages from pangolin lineages file

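Running the pangolin tool takes a while, so let's just use the lineage assignments they have already released, and only run pangolin ourselves on the sequences that the release is missing. (Or maybe they left those sequences out for a reason, in which case we should just ignore them.) Either way, for now we have to match the released assignments up with our sequences here.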

In [6]:
# List the processed FASTA files under <data_path>/fasta_processed, sorted
fasta_files = sorted(data_path.joinpath('fasta_processed').glob('*.fasta'))
fasta_files

[PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/fasta_processed/gisaid_0101-0131.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/fasta_processed/gisaid_0201-0229.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/fasta_processed/gisaid_0301-0331.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/fasta_processed/gisaid_0401-0407.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/fasta_processed/gisaid_0408-0414.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/fasta_processed/gisaid_0415-0421.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/fasta_processed/gisaid_0422-0430.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/fasta_processed/gisaid_0501-0507.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/fasta_processed/gisaid_0508-0514.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/fasta_processed/gisaid_0515-0519.fasta')]

In [7]:
# Read the pangolin lineage metadata table into a dataframe
lineage_df = pd.read_csv(data_path.joinpath('pangolin_lineages_20200525.csv'))
lineage_df

Unnamed: 0,GISAID ID,name,country,travel history,sample date,epiweek,lineage,representative
0,EPI_ISL_420793,USA/NY_2929/2020,USA,,2020-03-02,10.0,B.1,1
1,EPI_ISL_417142,USA/WA-S89/2020,USA,,2020-02-29,9.0,A.1,1
2,EPI_ISL_420792,USA/NH_0008/2020,USA,,2020-03-02,10.0,B.1,1
3,EPI_ISL_420795,USA/RI_0556/2020,USA,,2020-03-01,10.0,B.1.5,1
4,EPI_ISL_417140,USA/WA-S87/2020,USA,,2020-03-01,10.0,A.1,1
...,...,...,...,...,...,...,...,...
27758,,Scotland/CVR04/2020,UK,,2020-03-04,10.0,B,0
27759,,Scotland/CVR05/2020,UK,,2020-03-04,10.0,B.1,0
27760,,Scotland/CVR03/2020,UK,,2020-03-01,10.0,B.1,0
27761,,Scotland/CVR02/2020,UK,,2020-03-02,10.0,B,0


The lineage dataframe has incomplete GISAID IDs, so we'll have to rely on the name instead

In [8]:
# Add 'hCoV-19/' to the start of each name so the names match the FASTA
# entry names. Names that already carry the prefix are left untouched, which
# keeps this cell idempotent: re-running it no longer produces
# 'hCoV-19/hCoV-19/...'. Missing names stay NaN.
name_prefix = 'hCoV-19/'
already_prefixed = lineage_df['name'].str.startswith(name_prefix, na=False)
lineage_df['name'] = lineage_df['name'].where(
    already_prefixed,
    name_prefix + lineage_df['name']
)
lineage_df

Unnamed: 0,GISAID ID,name,country,travel history,sample date,epiweek,lineage,representative
0,EPI_ISL_420793,hCoV-19/USA/NY_2929/2020,USA,,2020-03-02,10.0,B.1,1
1,EPI_ISL_417142,hCoV-19/USA/WA-S89/2020,USA,,2020-02-29,9.0,A.1,1
2,EPI_ISL_420792,hCoV-19/USA/NH_0008/2020,USA,,2020-03-02,10.0,B.1,1
3,EPI_ISL_420795,hCoV-19/USA/RI_0556/2020,USA,,2020-03-01,10.0,B.1.5,1
4,EPI_ISL_417140,hCoV-19/USA/WA-S87/2020,USA,,2020-03-01,10.0,A.1,1
...,...,...,...,...,...,...,...,...
27758,,hCoV-19/Scotland/CVR04/2020,UK,,2020-03-04,10.0,B,0
27759,,hCoV-19/Scotland/CVR05/2020,UK,,2020-03-04,10.0,B.1,0
27760,,hCoV-19/Scotland/CVR03/2020,UK,,2020-03-01,10.0,B.1,0
27761,,hCoV-19/Scotland/CVR02/2020,UK,,2020-03-02,10.0,B,0


In [9]:
# Function for parsing a fasta file: for each entry, extract the taxon name,
# GISAID ID and sample date from the entry name, plus the entry's sequence

def _parse_gisaid_header(header):
    '''
    Split a "name|ID|sample_date" FASTA header into its first three fields.

    Raises ValueError when the header has fewer than three '|'-separated
    fields, so a malformed entry is reported together with its header.
    '''
    chunks = header.split('|')
    if len(chunks) < 3:
        raise ValueError(
            'Expected a FASTA header of the form "name|ID|sample_date", '
            'got: {!r}'.format(header)
        )
    return chunks[0], chunks[1], chunks[2]


def extract_ids_and_names(fasta_file):
    '''
    Parse a GISAID FASTA file into one record per entry.

    GISAID sequences are outputted with headers in this format:
        >name|ID|sample_date
    where the name starts with "hCoV-19". The lineage dataframe leaves that
    prefix out, but it is kept here. Names can contain spaces, so the whole
    header line is used (minus the leading '>' and trailing whitespace)
    rather than only the part up to the first whitespace.

    Parameters
    ----------
    fasta_file: pathlib.Path
        FASTA file to read.

    Returns
    -------
    list of tuple
        One (name, gisaid_id, sample_date, sequence) tuple per entry, in
        file order. All whitespace is stripped out of the sequence, so a
        sequence spanning several lines comes back as a single string.

    Raises
    ------
    ValueError
        If an entry header has fewer than three '|'-separated fields.
    '''

    # Store output as a list of tuples,
    # (name, id, sample_date, sequence)
    rows = []

    cur_entry = ''
    # Sequence lines are collected in a list and joined once per entry;
    # repeated string concatenation is quadratic for ~30kb genomes.
    seq_chunks = []

    # The context manager closes the file even if parsing raises
    with fasta_file.open('r') as fp:
        for line in fp:
            if line.startswith('>'):
                # A new header ends the entry collected so far.
                # Entries with an empty header are skipped.
                if cur_entry:
                    rows.append(
                        _parse_gisaid_header(cur_entry) + (''.join(seq_chunks),)
                    )

                # [1:] excludes the leading '>', rstrip() drops the newline
                cur_entry = line[1:].rstrip()
                seq_chunks = []

            # Sequence lines before the first header (or under an empty
            # header) are ignored
            elif cur_entry:
                seq_chunks.append(re.sub(r'\s+', '', line))

    # EOF also ends the current entry
    if cur_entry:
        rows.append(_parse_gisaid_header(cur_entry) + (''.join(seq_chunks),))

    return rows
    
# Test function
# extract_ids_and_names(fasta_files[0])

In [10]:
# Lookup tables from the pangolin metadata. They do not depend on the fasta
# file being processed, so they are built once instead of on every iteration.
# NOTE(review): Series.map needs a uniquely-indexed mapper, so this assumes
# taxon names and GISAID IDs are unique within lineage_df -- confirm.
has_gisaid_id = ~pd.isnull(lineage_df['GISAID ID'])
lineage_by_name = pd.Series(
    lineage_df['lineage'].values,
    index=lineage_df['name'].values
)
lineage_by_id = pd.Series(
    lineage_df['lineage'][has_gisaid_id].values,
    index=lineage_df['GISAID ID'][has_gisaid_id].values
)

# Make sure the output folders exist, so this also runs on a fresh data dir
lineage_meta_dir = data_path / 'lineage_meta'
to_assign_dir = data_path / 'seqs_to_assign'
lineage_meta_dir.mkdir(parents=True, exist_ok=True)
to_assign_dir.mkdir(parents=True, exist_ok=True)

for ff in fasta_files:
    print(ff.name)

    # Get names, IDs, sample dates and sequences into a dataframe
    ff_df = pd.DataFrame.from_records(
        extract_ids_and_names(ff),
        columns=['name', 'gisaid_id', 'sample_date', 'sequence']
    )

    # Join lineage assignments from lineage_df
    ff_df['lineage'] = (
        # Join on taxon names
        ff_df['name'].map(lineage_by_name)
        # Try to fill in missing values by joining on GISAID ID
        .combine_first(ff_df['gisaid_id'].map(lineage_by_id))
    )

    # There are gonna be NaNs for the lineage, since the pangolin metadata
    # doesn't have all of them. We'll fill them in later
    ff_df.to_csv(
        lineage_meta_dir / (ff.stem + '_lineage.csv'),
        index=False,
        columns=['name', 'gisaid_id', 'sample_date', 'lineage']
    )

    # Save taxons without assignments, so we can process them later in pangolin
    unassigned = ff_df.loc[
        pd.isnull(ff_df['lineage']),
        ['name', 'gisaid_id', 'sample_date', 'sequence']
    ]
    to_assign_path = to_assign_dir / (ff.stem + '_to_assign.fasta')

    # The context manager closes the output file even if a write fails
    with to_assign_path.open('w') as fp_out:
        for name, gisaid_id, sample_date, sequence in unassigned.itertuples(index=False, name=None):
            fp_out.write('>' + name + '|' + gisaid_id + '|' + sample_date + '\n')
            fp_out.write(sequence + '\n')

# Display the dataframe built for the last fasta file
ff_df

gisaid_0101-0131.fasta
gisaid_0201-0229.fasta
gisaid_0301-0331.fasta
gisaid_0401-0407.fasta
gisaid_0408-0414.fasta
gisaid_0415-0421.fasta
gisaid_0422-0430.fasta
gisaid_0501-0507.fasta
gisaid_0508-0514.fasta
gisaid_0515-0519.fasta


Unnamed: 0,name,gisaid_id,sample_date,sequence,lineage
0,hCoV-19/USA/LA-SR0295/2020,EPI_ISL_445150,2020-04-08,NNNNNNNATTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...,
1,hCoV-19/USA/LA-SR0296/2020,EPI_ISL_445151,2020-04-07,NNTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...,
2,hCoV-19/USA/LA-SR0297/2020,EPI_ISL_445152,2020-04-08,ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...,
3,hCoV-19/USA/LA-SR0298/2020,EPI_ISL_445153,2020-04-08,NNNNAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...,
4,hCoV-19/USA/LA-SR0299/2020,EPI_ISL_445154,2020-04-07,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTTTCGA...,
...,...,...,...,...,...
4251,hCoV-19/USA/WA-S694/2020,EPI_ISL_449864,2020-04-08,NNNNNNNNTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...,
4252,hCoV-19/USA/WA-S695/2020,EPI_ISL_449865,2020-04-08,NNTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...,
4253,hCoV-19/USA/WA-S696/2020,EPI_ISL_449866,2020-04-09,NNNNNNNNNNNNNACCTTCCCAGGTAACAAACCAACCAACTTTCGA...,
4254,hCoV-19/USA/WA-S697/2020,EPI_ISL_449867,2020-04-08,NNTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...,


# Rename SAM entries

I messed up and did not extract the full entry name from the raw fastas downloaded from GISAID, since I assumed that there would be no spaces in the names (there are, for "South Africa" and "South Korea", and definitely more that I don't know about). So we have to go into the sam files and change query names.

I'll also have to change the names in the processed fasta files and adjust the code that generates those files, just to be consistent with future processing

In [11]:
import pysam

In [68]:
# Unfiltered SAM files under <data_path>/sam/unfiltered, sorted
raw_sam_files = sorted(data_path.joinpath('sam', 'unfiltered').glob('*.sam'))
raw_sam_files

[PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/sam/unfiltered/gisaid_0101-0131.sam'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/sam/unfiltered/gisaid_0201-0229.sam'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/sam/unfiltered/gisaid_0301-0331.sam'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/sam/unfiltered/gisaid_0401-0407.sam'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/sam/unfiltered/gisaid_0408-0414.sam'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/sam/unfiltered/gisaid_0415-0421.sam'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/sam/unfiltered/gisaid_0422-0430.sam'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/sam/unfiltered/gisaid_0501-0507.sam'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/sam/unfiltered/gisaid_0508-0514.sam'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/sam/unfiltered/gisaid_0515-0519.sam')]

In [69]:
# Raw FASTA files under <data_path>/fasta_raw, sorted
raw_fasta_files = sorted(data_path.joinpath('fasta_raw').glob('*.fasta'))
raw_fasta_files

[PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/fasta_raw/gisaid_0101-0131.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/fasta_raw/gisaid_0201-0229.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/fasta_raw/gisaid_0301-0331.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/fasta_raw/gisaid_0401-0407.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/fasta_raw/gisaid_0408-0414.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/fasta_raw/gisaid_0415-0421.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/fasta_raw/gisaid_0422-0430.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/fasta_raw/gisaid_0501-0507.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/fasta_raw/gisaid_0508-0514.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/fasta_raw/gisaid_0515-0519.fasta')]

In [76]:
# The SAM files are paired with the raw fasta files purely by position in the
# two sorted lists, so first check that both lists describe the same batches.
if [p.stem for p in raw_fasta_files] != [p.stem for p in raw_sam_files]:
    raise ValueError(
        'raw fasta files and unfiltered SAM files do not match up one-to-one'
    )

# Make sure the output folder exists
renamed_sam_dir = data_path / 'sam' / 'unfiltered_renamed'
renamed_sam_dir.mkdir(parents=True, exist_ok=True)

for ff, samfile_path in zip(raw_fasta_files, raw_sam_files):
    print(ff.name)

    # Get the full entry names from the raw fasta file
    # ([1:] drops the leading '>', rstrip() drops the newline)
    with ff.open('r') as fp:
        entry_names = [line[1:].rstrip() for line in fp if line[0] == '>']

    new_samfile_path = renamed_sam_dir / samfile_path.name

    # Change query names in the sam file, read-by-read.
    # The k-th read gets the k-th fasta entry name.
    # NOTE(review): this assumes exactly one SAM record per fasta entry, in
    # fasta order. Only the counts can be checked here, not the order.
    n_reads = 0
    with pysam.AlignmentFile(str(samfile_path), 'r') as samfile, \
            pysam.AlignmentFile(str(new_samfile_path), 'w', template=samfile) as new_samfile:
        for read in samfile.fetch(until_eof=True):
            if n_reads >= len(entry_names):
                raise ValueError(
                    '{} has more reads than {} has entries ({})'.format(
                        samfile_path.name, ff.name, len(entry_names)
                    )
                )
            read.query_name = entry_names[n_reads]
            new_samfile.write(read)
            n_reads += 1

    # Fewer reads than entries also breaks the positional pairing, so the
    # renamed file that was just written cannot be trusted
    if n_reads != len(entry_names):
        raise ValueError(
            '{} has {} reads but {} has {} entries; renamed output {} is unreliable'.format(
                samfile_path.name, n_reads, ff.name, len(entry_names), new_samfile_path
            )
        )


gisaid_0101-0131.fasta
gisaid_0201-0229.fasta
gisaid_0301-0331.fasta
gisaid_0401-0407.fasta
gisaid_0408-0414.fasta
gisaid_0415-0421.fasta
gisaid_0422-0430.fasta
gisaid_0501-0507.fasta
gisaid_0508-0514.fasta
gisaid_0515-0519.fasta


# Run sequence preprocessing again

...

# Merge missing sequences in

In [3]:
# Per-batch lineage tables under <data_path>/lineage_meta, sorted
lineage_files = sorted(data_path.joinpath('lineage_meta').glob('*.csv'))
lineage_files

[PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/lineage_meta/gisaid_0101-0131_lineage.csv'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/lineage_meta/gisaid_0201-0229_lineage.csv'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/lineage_meta/gisaid_0301-0331_lineage.csv'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/lineage_meta/gisaid_0401-0407_lineage.csv'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/lineage_meta/gisaid_0408-0414_lineage.csv'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/lineage_meta/gisaid_0415-0421_lineage.csv'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/lineage_meta/gisaid_0422-0430_lineage.csv'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/lineage_meta/gisaid_0501-0507_lineage.csv'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/lineage_meta/gisaid_0508-0514_lineage.csv'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/lineage_meta/gisaid_0515-0519_lineage.csv')]

In [4]:
# CSV files under <data_path>/seqs_to_assign, sorted
missing_files = sorted(data_path.joinpath('seqs_to_assign').glob('*.csv'))
missing_files

[PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/seqs_to_assign/lineage_report_0101-0131_to_assign.csv'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/seqs_to_assign/lineage_report_0201-0229_to_assign.csv'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/seqs_to_assign/lineage_report_0301-0331_to_assign.csv'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/seqs_to_assign/lineage_report_0401-0407_to_assign.csv'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/seqs_to_assign/lineage_report_0408-0414_to_assign.csv'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/seqs_to_assign/lineage_report_0415-0421_to_assign.csv'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/seqs_to_assign/lineage_report_0422-0430_to_assign.csv'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/seqs_to_assign/lineage_report_0501-0507_to_assign.csv'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/seqs_to_assign/lineage_report_0508-0514_to_assign.csv'),
 PosixPath

In [12]:
# FASTA files under <data_path>/seqs_to_assign, sorted
missing_fasta_files = sorted(data_path.joinpath('seqs_to_assign').glob('*.fasta'))
missing_fasta_files

[PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/seqs_to_assign/gisaid_0101-0131_to_assign.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/seqs_to_assign/gisaid_0201-0229_to_assign.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/seqs_to_assign/gisaid_0301-0331_to_assign.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/seqs_to_assign/gisaid_0401-0407_to_assign.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/seqs_to_assign/gisaid_0408-0414_to_assign.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/seqs_to_assign/gisaid_0415-0421_to_assign.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/seqs_to_assign/gisaid_0422-0430_to_assign.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/seqs_to_assign/gisaid_0501-0507_to_assign.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/seqs_to_assign/gisaid_0508-0514_to_assign.fasta'),
 PosixPath('/Volumes/GoogleDrive/My Drive/covid_data/seqs_to_ass

In [22]:
# The three file lists are paired up by position, so check that they line up
# before anything on disk gets overwritten.
if not (len(lineage_files) == len(missing_files) == len(missing_fasta_files)):
    raise ValueError(
        'Expected the same number of lineage, report and to-assign fasta files, '
        'got {}, {} and {}'.format(
            len(lineage_files), len(missing_files), len(missing_fasta_files)
        )
    )

for lineage_file, report_file, missing_fasta in zip(lineage_files, missing_files, missing_fasta_files):
    print(lineage_file.name)

    # The file names embed the batch date range (e.g. '0101-0131'). If all
    # three names carry one, they have to agree.
    batch_tags = [
        re.search(r'\d{4}-\d{4}', path.name)
        for path in (lineage_file, report_file, missing_fasta)
    ]
    if all(batch_tags) and len({tag.group(0) for tag in batch_tags}) != 1:
        raise ValueError(
            'Mismatched batch files: {}, {}, {}'.format(
                lineage_file.name, report_file.name, missing_fasta.name
            )
        )

    # GISAID IDs of the sequences that were sent for assignment
    rows = extract_ids_and_names(missing_fasta)
    gisaid_ids = [row[1] for row in rows]

    # Get matching files, load into memory
    lf_df = pd.read_csv(lineage_file)
    missing_df = pd.read_csv(report_file)

    # Set the GISAID ID in missing_df to the ids from the fasta file.
    # NOTE(review): this relies on the report rows being in the same order as
    # the fasta entries. pandas raises if the lengths differ, but the order
    # itself is not verified here -- confirm.
    missing_df['gisaid_id'] = gisaid_ids
    missing_df = missing_df.set_index('gisaid_id')

    # Patch missing values; lineages that are already assigned are kept
    lf_df['lineage'] = lf_df['lineage'].combine_first(
        lf_df['gisaid_id'].map(missing_df['lineage'])
    )

    # Save to disk (overwrites the lineage file in place)
    lf_df.to_csv(lineage_file, index=False)
    

gisaid_0101-0131_lineage.csv
gisaid_0201-0229_lineage.csv
gisaid_0301-0331_lineage.csv
gisaid_0401-0407_lineage.csv
gisaid_0408-0414_lineage.csv
gisaid_0415-0421_lineage.csv
gisaid_0422-0430_lineage.csv
gisaid_0501-0507_lineage.csv
gisaid_0508-0514_lineage.csv
gisaid_0515-0519_lineage.csv
