In [1]:
# Import modules
# import tika
from tika import parser # PDF file parser
import pandas as pd
import glob # Unix style pathname pattern expansion
from Bio import SeqIO
from geopy.geocoders import Nominatim # Module to get coordinates

In [2]:
# Function to replace spaces to underscores in strings
def remove_spaces(string):
    return '_'.join(string.split(' '))

# Function to read the GISAID metadata PDF file and filter out unnecessary lines/fields
def read_pdf(path):
    pdf = parser.from_file(path)
    raw_content = pdf['content'].lstrip('\n')
    content_lines = raw_content.split('\n')
    filtered_content = list(filter(lambda x: x != '' and x != 'Virus detail' and x != 'Sample information' 
                          and x != 'Institute information' and x != 'Submitter information' 
                                   and (not x.lower().startswith('updated')) 
                                   and (not x.lower().startswith('note')) 
                                   and (not x.lower().startswith('please note')), content_lines))
    return filtered_content[1:]

In [3]:
# Process text
def fix_content(content):
    
    authors = ''
    modified_content = []
    
    for i in range(len(content)):
        if content[i] == 'Sample ID given by the sample' and content[i+1] == 'provider:':
            content[i] = content[i] + ' ' + content[i+1]
            content.pop(i+1)
        if content[i] == 'Sample ID given by the' and content[i+1] == 'submitting laboratory:':
            content[i] = content[i] + ' ' + content[i+1]
            content.pop(i+1)

        if content[i] == 'Additional location' and content[i+1] == 'information:':
            content[i] = content[i] + ' ' + content[i+1]
            content.pop(i+1)
            
        
        if content[i].startswith('Originating lab'):
            if not content[i+1].startswith('Address'):
                content[i] = content[i] + ' ' + content[i + 1]
                content.pop(i+1)
            if not content[i+1].startswith('Address'):
                content[i] = content[i] + ' ' + content[i + 1]
                content.pop(i+1)
        
        if content[i].startswith('Submitting lab'):
            if not content[i+1].startswith('Address'):
                content[i] = content[i] + ' ' + content[i + 1]
                content.pop(i+1)
            if not content[i+1].startswith('Address'):
                content[i] = content[i] + ' ' + content[i + 1]
                content.pop(i+1)
                
        
        if content[i].startswith('Sample ID given by the sample provider'):
            if not content[i+1].startswith('Submitting lab'):
                content[i] = content[i] + ' ' + content[i + 1]
                content.pop(i+1)
                
        
        if content[i].startswith('Additional location'):
            if not content[i + 1].startswith('information') and not content[i+1].startswith('Gender'):
                content[i] = content[i] + ' ' + content[i + 1]
                content.pop(i+1)
                
        
        if content[i].startswith('Sample ID given by the submitting laboratory'):
            if not content[i+1].startswith('Authors'):
                content[i] = content[i] + ' ' + content[i + 1]
                content.pop(i+1)
                
                
        if content[i].startswith('Comment'):
            if not content[i + 1].startswith('Originating'):
                content[i] = content[i] + ' ' + content[i + 1]
                content.pop(i+1)
            if not content[i + 1].startswith('Originating'):
                content[i] = content[i] + ' ' + content[i + 1]
                content.pop(i+1)
            
            
                                                   
            

        if content[i].startswith('Authors'):
            authors = content[i]
            if not content[i + 1].startswith('Submitter'):
                authors = authors + ' ' + content[i + 1]
                content.pop(i + 1)
            if not content[i + 1].startswith('Submitter'):
                authors = authors + ' ' + content[i + 1]
                content.pop(i + 1)
            if not content[i + 1].startswith('Submitter'):
                authors = authors + ' ' + content[i + 1] 
                content.pop(i + 1)
            content[i] = authors
             

        if (content[i].startswith('Address:')) and (not content[i+1].startswith('Sample')):
            if not content[i + 1].startswith('Important note'):
                content[i] = content[i] + ' ' + content[i + 1]
                content.pop(i + 1)
            if (not content[i + 1].startswith('Important note') and (content[i - 1].startswith('Submission'))):
                content[i] = content[i] + ' ' + content[i + 1]
                content.pop(i + 1)
            if (not content[i + 1].startswith('Important note') and (content[i - 1].startswith('Submission'))):
                content[i] = content[i] + ' ' + content[i + 1]
                content.pop(i + 1)
            if (not content[i + 1].startswith('Sample') and (content[i-1].startswith('Originating'))):
                content[i] = content[i] + ' ' + content[i + 1]
                content.pop(i + 1)
            if (not content[i + 1].startswith('Sample') and (content[i-1].startswith('Originating'))):
                content[i] = content[i] + ' ' + content[i + 1]
                content.pop(i + 1)
            if (not content[i + 1].startswith('Sample') and (content[i-1].startswith('Submitting'))):
                content[i] = content[i] + ' ' + content[i + 1]
                content.pop(i + 1)
            if (not content[i + 1].startswith('Sample') and (content[i-1].startswith('Submitting'))):
                content[i] = content[i] + ' ' + content[i + 1]
                content.pop(i + 1)
            if (not content[i + 1].startswith('Sample') and (content[i-1].startswith('Submitting'))):
                content[i] = content[i] + ' ' + content[i + 1]
                content.pop(i + 1)
        
                        
        if content[i].startswith('Important note'):
            break
        modified_content.append(content[i].split(':'))
    
    return modified_content

In [4]:
# Function to create a pandas dataframe 
def create_dataframe(content):
    data_dict = {}
    for i in range(len(content)):

        content[i][0] = remove_spaces(content[i][0])
        content[i][1] = remove_spaces(content[i][1]).lstrip('_')

        if content[i][0] == 'Address':
            if content[i-1][0] == 'Originating_lab':
                content[i][0] = 'Originating_lab_address'
            if content[i-1][0] == 'Submitting_lab':
                content[i][0] = 'Submitting_lab_address'
            if content[i-1][0] == 'Submission_Date':
                content[i][0] = 'Submitter_address'

        if content[i][1]:
            data_dict[content[i][0].lower()] = content[i][1]
        else:
            data_dict[content[i][0].lower()] = 'NA'
    
    df = pd.DataFrame([data_dict], columns=data_dict.keys())
    return df

In [5]:
# Create a pandas dataframe object for each GISAID entry
for filename in glob.glob('*.pdf'):
    print(filename)
    pdf = read_pdf(filename)
    content = fix_content(pdf)
    df = create_dataframe(content)

EPI_ISL_402119.pdf
EPI_ISL_402120.pdf
EPI_ISL_402121.pdf
EPI_ISL_402123.pdf
EPI_ISL_402124.pdf
EPI_ISL_402125.pdf
EPI_ISL_402126.pdf
EPI_ISL_402127.pdf
EPI_ISL_402128.pdf
EPI_ISL_402129.pdf
EPI_ISL_402130.pdf
EPI_ISL_402131.pdf
EPI_ISL_402132.pdf
EPI_ISL_403928.pdf
EPI_ISL_403929.pdf
EPI_ISL_403930.pdf
EPI_ISL_403931.pdf
EPI_ISL_403932.pdf
EPI_ISL_403933.pdf
EPI_ISL_403934.pdf
EPI_ISL_403935.pdf
EPI_ISL_403936.pdf
EPI_ISL_403937.pdf
EPI_ISL_403962.pdf
EPI_ISL_403963.pdf
EPI_ISL_404227.pdf
EPI_ISL_404228.pdf
EPI_ISL_404253.pdf
EPI_ISL_404895.pdf
EPI_ISL_405839.pdf
EPI_ISL_406030.pdf
EPI_ISL_406031.pdf
EPI_ISL_406034.pdf
EPI_ISL_406036.pdf
EPI_ISL_406223.pdf
EPI_ISL_406531.pdf
EPI_ISL_406533.pdf
EPI_ISL_406534.pdf
EPI_ISL_406535.pdf
EPI_ISL_406536.pdf
EPI_ISL_406538.pdf
EPI_ISL_406592.pdf
EPI_ISL_406593.pdf
EPI_ISL_406594.pdf
EPI_ISL_406595.pdf
EPI_ISL_406596.pdf
EPI_ISL_406597.pdf
EPI_ISL_406716.pdf
EPI_ISL_406717.pdf
EPI_ISL_406798.pdf
EPI_ISL_406799.pdf
EPI_ISL_406800.pdf
EPI_ISL_4068

In [6]:
# Create a list of dataframes
df_list = []
for filename in glob.glob('*.pdf'):
    pdf = read_pdf(filename)
    content = fix_content(pdf)
    df = create_dataframe(content)
    df_list.append(df)

In [7]:
# Concatenate dataframes into 1 dataframe
data = pd.concat(df_list, sort=False)
# Add a sequence column
data['sequence'] = 'NA'

In [8]:
# Add sequence data to the dataframe
for fasta in glob.glob('*.fasta'):
    fh = open(fasta)
    seq = SeqIO.read(fh, 'fasta')
    fh.close()
    accession_id = seq.description.split('|')[1]
    data.loc[data.accession_id == accession_id, 'sequence'] = str(seq.seq)

In [9]:
# Get unique locations
locations = (data['location'].unique())
# Create a coordinate dictionary and initiate geolocator object
coordinates = {}
geolocator = Nominatim(user_agent='covid-locations')

# Get coordinates for each location
# Retrieve data in chucnk to avoid errors if there are too many requests
# First 50 locations
for location in locations[:50]:
    precise_location = ' '.join(location.split('_/_')[-1].split('_'))
    loc_coordinates = geolocator.geocode(precise_location)
    print(location)
    coordinates[location] = [loc_coordinates.latitude, loc_coordinates.longitude]

Asia_/_China_/_Hubei_/_Wuhan
Asia_/_China
Asia_/_Japan_/_Kanagawa
Asia_/_China_/_Yunnan_/_Pu'er
Asia_/_China_/_Guandong_/_Shenzhen
Asia_/_China_/_Guangdong_/_Shenzhen
Asia_/_China_/_Guangdong_/_Zhuhai
Asia_/_Thailand_/_Nonthaburi
Asia_/_China_/_Zhejiang
North_America_/_USA_/_Illinois_/_Chicago
North_America_/_USA_/_Washington_/_Snohomish_County
Asia_/_Taiwan_/_Kaohsiung
North_America_/_USA_/_California_/_Los_Angeles
North_America_/_USA_/_California_/_Orange_County
North_America_/_USA_/_Arizona_/_Phoenix
Asia_/_China_/_Guangdong_/_Guangzhou
Asia_/_China_/_Guangdong_/_Foshan
Asia_/_China_/_Guangdong
Europe_/_France_/_Ile-de-France_/_Paris
Asia_/_China_/_Wuhan
Oceania_/_Australia_/_Victoria_/_Clayton
Europe_/_Germany_/_Bavaria_/_Munich
Europe_/_Italy_/_Rome
Asia_/_China_/_Zhejiang_/_Hangzhou
Asia_/_Singapore
Europe_/_England
Europe_/_Finland_/_Lapland
Asia_/_Japan_/_Aichi
Asia_/_South_Korea_/_Gyeonggi-do
North_America_/_USA_/_Washington
Oceania_/_Australia_/_New_South_Wales_/_Sydney
Ocean

In [11]:
# 50 - 99
for location in locations[50:100]:
    precise_location = ' '.join(location.split('_/_')[-1].split('_'))
    loc_coordinates = geolocator.geocode(precise_location)
    print(location)
    coordinates[location] = [loc_coordinates.latitude, loc_coordinates.longitude]

Asia_/_Hong_Kong
North_America_/_USA_/_Massachusetts
North_America_/_USA_/_Illinois
Asia_/_Nepal_/_Kathmandu
Asia_/_Philippines
Europe_/_France_/_Rhone-Alpes_/_Contamines
Asia_/_Malaysia
Asia_/_Japan_/_Nara
Asia_/_Japan_/_Osaka
Asia_/_China_/_Guangxi
Asia_/_China_/_Fujian
Asia_/_Cambodia_/_Sihanoukville
Asia_/_Taiwan_/_Taoyuan
Asia_/_South_Korea
Asia_/_China_/_Jiangsu
Europe_/_Sweden
North_America_/_USA_/_Texas
Asia_/_China_/_HuaShang
Asia_/_China_/_Anhui_/_Hefei
Asia_/_China_/_Shenzhen
Asia_/_China_/_Shandong_/_Linyi_county
Asia_/_China_/_Shandong_/_Linyi
Asia_/_China_/_Hubei_/_Jingzhou
North_America_/_USA_/_California_/_Solano
Asia_/_South_Korea_/Seoul
Asia_/_South_Korea/_Seoul
Asia_/_South_Korea_/_Seoul
Asia_/_South_Korea_/_Chungcheongnam-do
Europe_/_Germany_/_Baden-Wuerttemberg
South_America_/_Brazil_/_Sao_Paulo_/_Sao_Paulo
North_America_/_Canada_/_British_Columbia
Asia_/_China_/_Guangzhou
Asia_/_Japan
Europe_/_Finland_/_Helsinki
North_America_/_Mexico_/_Mexico_City
Europe_/_Italy_

In [12]:
# 100 - N
for location in locations[100:]:
    precise_location = ' '.join(location.split('_/_')[-1].split('_'))
    loc_coordinates = geolocator.geocode(precise_location)
    print(location)
    coordinates[location] = [loc_coordinates.latitude, loc_coordinates.longitude]

Europe_/_United_Kingdom_/_Wales
North_America_/_USA_/_California_/_Sonoma_County
North_America_/_USA_/_California_/_Solano_County
Europe_/_Netherlands_/_Andel
Europe_/_Netherlands_/_Berlicum
Europe_/_Netherlands_/_Blaricum
Europe_/_Netherlands_/_Coevorden
Europe_/_Netherlands_/_Dalen
Europe_/_Netherlands_/_Delft
Europe_/_Netherlands_/_Diemen
Europe_/_Netherlands_/_Eindhoven
Europe_/_Netherlands_/_Haarlem
Europe_/_Netherlands_/_Hardinxveld_Giessendam
Europe_/_Netherlands_/_Helmond
Europe_/_Netherlands_/_Houten
Europe_/_Netherlands_/_Loon_op_zand
Europe_/_Netherlands_/_Naarden
Europe_/_Netherlands_/_Nieuwendijk
Europe_/_Netherlands_/_Nootdorp
Europe_/_Netherlands_/_Oisterwijk
Europe_/_Netherlands_/_Oss
Europe_/_Netherlands_/_Rotterdam
Europe_/_Netherlands_/_Tilburg
Europe_/_Netherlands_/_Utrecht
Europe_/_Netherlands_/_Zeewolde
Europe_/_Luxemburg
Oceania_/_Australia_/_NSW_/_Sydney
North_America_/_USA
Europe_/_Portugal


In [13]:
# Add coordinate data to the dataframe
for location in coordinates:
    data.loc[data.location == location, 'latitude'] = coordinates[location][0]
    data.loc[data.location == location, 'longitude'] = coordinates[location][1]

In [14]:
data

Unnamed: 0,virus_name,accession_id,type,passage_details/history,collection_date,location,host,additional_location_information,gender,patient_age,...,submitting_lab_address,sample_id_given_by_the_submitting_laboratory,authors,submitter,submission_date,submitter_address,comment,sequence,latitude,longitude
0,BetaCoV/Wuhan/IVDC-HB-01/2019,EPI_ISL_402119,betacoronavirus,"Virus_Isolate,_Passage_1",2019-12-30,Asia_/_China_/_Hubei_/_Wuhan,Human,,Female,49,...,"155_Changbai_Road,_Changping_District,_Beijing...",,Wenjie_TanXiang_ZhaoWenling_WangXuejun_MaYongz...,Wenjie_Tan,2020-01-10,National_Institute_for_Viral_Disease_Control_a...,,ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...,30.595105,114.299935
0,BetaCoV/Wuhan/IVDC-HB-04/2020,EPI_ISL_402120,betacoronavirus,Original,2020-01-01,Asia_/_China_/_Hubei_/_Wuhan,Human,,Male,61,...,"155_Changbai_Road,_Changping_District,_Beijing...",,Wenjie_TanXiang_ZhaoWenling_WangXuejun_MaYongz...,Wenjie_Tan,2020-01-11,National_Institute_for_Viral_Disease_Control_a...,,ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...,30.595105,114.299935
0,BetaCoV/Wuhan/IVDC-HB-05/2019,EPI_ISL_402121,betacoronavirus,Original,2019-12-30,Asia_/_China_/_Hubei_/_Wuhan,Human,,Male,32,...,"155_Changbai_Road,_Changping_District,_Beijing...",,Wenjie_TanXuejun_MaXiang_ZhaoWenling_WangYongz...,Wenjie_Tan,2020-01-10,National_Institute_for_Viral_Disease_Control_a...,,ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...,30.595105,114.299935
0,BetaCoV/Wuhan/IPBCAMS-WH-01/2019,EPI_ISL_402123,betacoronavirus,Original,2019-12-24,Asia_/_China_/_Hubei_/_Wuhan,Human,,Male,65,...,"No._9_Dong_Dan_San_Tiao,_Dong_Cheng_District,_...",BetaCoV/Wuhan/IPBCAMS-WH-01/2019,"Lili_Ren,_Jianwei_Wang,_Qi_Jin,_Zichun_Xiang,_...","Lili_Ren,_Jianwei_Wang,_Qi_Jin,_Zichun_Xiang,_...",2020-01-11,"No._9_Dong_Dan_San_Tiao,_Dong_Cheng_District,_...",,ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...,30.595105,114.299935
0,BetaCoV/Wuhan/WIV04/2019,EPI_ISL_402124,betacoronavirus,Original,2019-12-30,Asia_/_China_/_Hubei_/_Wuhan,Human,,Female,49,...,"44_Xiao_Hong_Shan,_Wuhan,_Hubei_430071",WIV04,"Peng_Zhou,_Xing-Lou_Yang,_Ding-Yu_Zhang,_Lei_Z...","Wuhan_Institute_of_Virology,_Chineses_Academy_...",2020-01-11,"44_Xiao_Hong_Shan,_Wuhan,_Hubei_430071",,ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...,30.595105,114.299935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,hCoV-19/China/WF0023/2020,EPI_ISL_413752,betacoronavirus,Original,2020-02,Asia_/_China,Human,,Unknown,Unknown,...,"Department_of_Microbiology,_Weifang_Center_for...",,"Qing_Nie,_Wei_Chen,_Dehui_Liu,_Yingying_Chen",Xingguang_Li,2020-03-09,Hubei_Engineering_Research_Center_of_Viral_Vec...,Multiple_gaps_relative_to_reference_sequence,AAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAA...,35.000074,104.999927
0,hCoV-19/China/WF0024/2020,EPI_ISL_413753,betacoronavirus,Original,2020-02,Asia_/_China,Human,,Unknown,Unknown,...,"Department_of_Microbiology,_Weifang_Center_for...",,"Qing_Nie,_Wei_Chen,_Dehui_Liu,_Yingying_Chen",Xingguang_Li,2020-03-09,Hubei_Engineering_Research_Center_of_Viral_Vec...,Gap_relative_to_reference_sequence,TACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGAT...,35.000074,104.999927
0,hCoV-19/China/WF0026/2020,EPI_ISL_413761,betacoronavirus,Original,2020-02,Asia_/_China,Human,,Unknown,Unknown,...,"Department_of_Microbiology,_Weifang_Center_for...",,"Qing_Nie,_Wei_Chen,_Dehui_Liu,_Yingying_Chen",Xingguang_Li,2020-03-09,Hubei_Engineering_Research_Center_of_Viral_Vec...,,ACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATC...,35.000074,104.999927
0,hCoV-19/China/WF0028/2020,EPI_ISL_413791,betacoronavirus,Original,2020-02,Asia_/_China,Human,,Unknown,Unknown,...,"Department_of_Microbiology,_Weifang_Center_for...",,"Qing_Nie,_Wei_Chen,_Dehui_Liu,_Yingying_Chen",Xingguang_Li,2020-03-09,Hubei_Engineering_Research_Center_of_Viral_Vec...,,ATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGA...,35.000074,104.999927


In [15]:
# Write to file
data.to_excel('gisaid_covid_19.xlsx', index=False)