In [4]:
import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import collections
from datetime import datetime
# Cap rendered DataFrame output at 20 rows.
pd.options.display.max_rows = 20

### Read in Metadata

In [5]:
# Load the tab-separated metadata, keep rows whose host is 'Human', whose age
# is not "?" and whose sex is "Male" or "Female", and narrow the table to the
# five columns used downstream.
df = (
    pd.read_csv("D:/Workspace/COVID-19-data-linked/metadata_5_24_2020.tsv", sep="\t")
    .loc[
        lambda meta: (meta["host"] == 'Human')
        & (meta["age"] != "?")
        & meta["sex"].isin(["Male", "Female"]),
        ['gisaid_epi_isl', "country", "age", "sex", "date_submitted"],
    ]
)
# Persist the filtered table (index column omitted).
df.to_csv("D:/Workspace/COVID-19-data-linked/metadata_5_24_2020_filtered.tsv", sep="\t", index=False)
# Preview
df

Unnamed: 0,gisaid_epi_isl,country,age,sex,date_submitted
0,EPI_ISL_418241,Algeria,28,Female,2020-03-29
1,EPI_ISL_418242,Algeria,87,Male,2020-03-29
2,EPI_ISL_420037,Algeria,41,Male,2020-04-04
3,EPI_ISL_413485,China,58,Male,2020-03-05
4,EPI_ISL_420600,Argentina,51,Male,2020-04-06
5,EPI_ISL_420599,Argentina,47,Male,2020-04-06
6,EPI_ISL_420598,Argentina,73,Male,2020-04-06
7,EPI_ISL_430793,Argentina,40,Male,2020-04-27
8,EPI_ISL_430794,Argentina,40,Male,2020-04-27
9,EPI_ISL_430795,Argentina,57,Female,2020-04-27


### Convert meta_data to dictionary

In [6]:
# Build a lookup keyed by the first value of each row; the remaining values of
# the row are appended to that key's list.
meta_data = collections.defaultdict(list)
for row_label, row in df.iterrows():
    gisaid, *rest = row
    meta_data[gisaid].extend(rest)

# Example of dataset: print the first 11 (key, values) pairs.
for position, entry in enumerate(meta_data.items()):
    print(entry)
    if position >= 10:
        break

('EPI_ISL_418241', ['Algeria', '28', 'Female', '2020-03-29'])
('EPI_ISL_418242', ['Algeria', '87', 'Male', '2020-03-29'])
('EPI_ISL_420037', ['Algeria', '41', 'Male', '2020-04-04'])
('EPI_ISL_413485', ['China', '58', 'Male', '2020-03-05'])
('EPI_ISL_420600', ['Argentina', '51', 'Male', '2020-04-06'])
('EPI_ISL_420599', ['Argentina', '47', 'Male', '2020-04-06'])
('EPI_ISL_420598', ['Argentina', '73', 'Male', '2020-04-06'])
('EPI_ISL_430793', ['Argentina', '40', 'Male', '2020-04-27'])
('EPI_ISL_430794', ['Argentina', '40', 'Male', '2020-04-27'])
('EPI_ISL_430795', ['Argentina', '57', 'Female', '2020-04-27'])
('EPI_ISL_430796', ['Argentina', '30', 'Male', '2020-04-27'])


### Filter fasta file based on those available in meta_data

In [7]:
# Collect the FASTA records whose id has at least two "|"-separated fields and
# whose second field is a key of meta_data.
# The path uses forward slashes: in a normal (non-raw) string literal the
# backslash form produces invalid escape sequences, which Python warns about
# and plans to reject. The other cells already use forward slashes.
records = []
for record in SeqIO.parse("D:/Workspace/COVID-19-data-linked/gisaid_hcov-19_2020_05_24_18.fasta", "fasta"):
    header = record.id.split("|")
    if len(header) > 1 and header[1] in meta_data:
        records.append(record)

### Save new fasta file

In [8]:
# Write the retained records as FASTA; the output file name embeds how many
# records there are. The call is the last expression, so its return value is
# what the cell displays.
SeqIO.write(
    records,
    f"D:/Workspace/COVID-19-data-linked/gisaid_cov2020_sequences_filtered_{len(records)}.fasta",
    "fasta",
)

10845

# Make trees with correct id

In [9]:
# Re-read the aligned sequences and replace each record's id with the second
# "|"-separated field of its original id; description and name are blanked.
# NOTE(review): a record id without "|" raises IndexError here — presumably the
# alignment input only contains ids that passed the earlier filter; confirm.
# The path uses forward slashes: in a normal (non-raw) string literal the
# backslash form produces invalid escape sequences, which Python warns about
# and plans to reject.
records = []
for record in SeqIO.parse("D:/Workspace/COVID-19-data-linked/gisaid_cov2020_sequences_filtered_8312.mafft", "fasta"):
    record.id = record.id.split("|")[1]
    record.description = ""
    record.name = ""
    records.append(record)

In [10]:
# Write the re-identified records back out in FASTA format.
# Plain string with forward slashes: the original was an f-string with no
# placeholders, and its backslashes formed invalid escape sequences.
SeqIO.write(records, "D:/Workspace/COVID-19-data-linked/gisaid_cov2020_sequences_filtered_8312_2.mafft", "fasta")

8312