# Selection of yellow fever reference sequence sublist

Define a list of the reference sequences which we will use for inference and analysis:
- we have a list of reference sequences in our reference sequence files
- we have an envelope percentage distance matrix between some YF variants

We will select the subset of reference sequences that are also listed in the distance matrix.

> Note: this is only needed when using the distance matrix for analysis.

# 1. Imports and setup environment

In [None]:
# Install required custom packages if not installed yet.
import importlib.util
if not importlib.util.find_spec('ecutilities'):
    print('installing package: `ecutilities`')
    ! pip install -qqU ecutilities
else:
    print('`ecutilities` already installed')
if not importlib.util.find_spec('metagentools'):
    print('installing package: `metagentools')
    ! pip install -qqU metagentools
else:
    print('`metagentools` already installed')

`ecutilities` already installed
`metagentools` already installed


In [None]:
# Import all required packages
import numpy as np
import pandas as pd
import re

from ecutilities.ipython import nb_setup
from pathlib import Path

# Setup the notebook for development
# (the recorded output of this cell reports "Set autoreload mode").
nb_setup()

# The project imports come after `nb_setup()` — presumably so that autoreload is
# already active when they are loaded; TODO confirm.
# NOTE(review): `np`, `Path` and `FastqFileReader` are not used in the cells visible here.
from metagentools.cnn_virus.data import FastaFileReader, FastqFileReader, AlnFileReader
from metagentools.core import ProjectFileSystem

Set autoreload mode


# 2. Setup paths to files

In [None]:
# Project file system helper; `pfs.data` is the base directory for the data paths built in the cells below.
pfs = ProjectFileSystem()
# Print the detected environment and the project root / data / notebooks directories.
pfs.info()

Running linux on local computer
Device's home directory: /home/vtec
Project file structure:
 - Root ........ /home/vtec/projects/bio/metagentools 
 - Data Dir .... /home/vtec/projects/bio/metagentools/data 
 - Notebooks ... /home/vtec/projects/bio/metagentools/nbs


Load the YF variants envelope percentage **distance matrix**

In [None]:
# Location of the envelope distance matrix; the first CSV column holds the row labels.
p2distances = pfs.data / 'ncbi/other/yf/YF_2023_envelop_percentage_distance_matrix.csv'

# Every cell is a percentage string such as '19%': strip the sign and rescale to a
# fraction, one column at a time.
dist_mx = (
    pd.read_csv(p2distances, index_col=0)
    .apply(lambda pct: pct.str.rstrip('%').astype(float) / 100)
)
dist_mx

Unnamed: 0,YFV_Cameroon_2023,YFV_CAR_2019,U21056_senegal_1927,JX898870_senegal_1996,U23574_Senegal_1965,JX898868_Senegal_1995,JX898873_ArD149214_Senegal_2000,JX898874_ArD149194_Senegal_2000,JX898875_Senegal_2000,JX898876_Senegal_2001,...,MH666058_Brazil_2016,MF370547_Brazil_2017,MK583152_Brazil_SaoPaulo_2017,KY885001_Brazil_2017,MW308134_Brazil_2018,MK760660_Netherlands_2018,MK333805_Brazil_IlhaGrande_2018,MW308135_Brazil_2019,MZ604867_Brazil_2019,MZ712143_Brazil_2021
,,,,,,,,,,,,,,,,,,,,,
YFV_Cameroon_2023,0.00,0.04,0.19,0.19,0.19,0.19,0.19,0.19,0.19,0.19,...,0.19,0.18,0.19,0.19,0.19,0.19,0.19,0.19,0.19,0.19
YFV_CAR_2019,0.04,0.00,0.19,0.18,0.18,0.18,0.18,0.18,0.18,0.18,...,0.19,0.19,0.19,0.19,0.19,0.19,0.19,0.19,0.19,0.19
U21056_senegal_1927,0.19,0.19,0.00,0.04,0.04,0.04,0.05,0.05,0.05,0.04,...,0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15
JX898870_senegal_1996,0.19,0.18,0.04,0.00,0.03,0.00,0.03,0.03,0.03,0.03,...,0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15
U23574_Senegal_1965,0.19,0.18,0.04,0.03,0.00,0.03,0.04,0.04,0.04,0.03,...,0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MK760660_Netherlands_2018,0.19,0.19,0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
MK333805_Brazil_IlhaGrande_2018,0.19,0.19,0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
MW308135_Brazil_2019,0.19,0.19,0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


Map distance matrix index and accession

In [None]:
# Column labels of the distance matrix are expected to look like
# `<accession>_<country>_<year>`. The pattern does not depend on the column, so it is
# compiled once, outside the loop.
regex = re.compile(r'^(?P<accession>\w{1,2}\d*)_(?P<country>.*)_(?P<year>\d\d\d\d)$')

distance_accessions = {}   # accession -> parsed metadata of the matching column label
missing = []               # column labels from which no accession could be extracted
for col in dist_mx.columns:
    match = regex.search(col)
    if match:
        accession = match.group('accession')
        distance_accessions[accession] = {
            'accession': accession,
            'country': match.group('country'),
            'year': match.group('year'),
            'definition line': col,
        }
    else:
        # Keep the label itself so the unmatched columns can be inspected afterwards.
        missing.append(col)
        print(col, 'could not find the accession')

# Sanity check: show the parsed entry for one known accession.
distance_accessions['U21056']

YFV_Cameroon_2023 could not find the accession
YFV_CAR_2019 could not find the accession
Yellow_fever_YF118_CAR_2018 could not find the accession


{'accession': 'U21056',
 'country': 'senegal',
 'year': '1927',
 'definition line': 'U21056_senegal_1927'}

Compare reference sequences in the matrix and those in our reference sequence files

In [None]:
# Paths to the reference-sequence FASTA file and to the `.aln` file of the read set.
p2fasta = pfs.data / 'ncbi/refsequences/yf/yf_2023_yellow_fever.fa'
p2aln = pfs.data / 'ncbi/simreads/yf/single_69seq_150bp/single_69seq_150bp.aln'

fa = FastaFileReader(p2fasta)
aln = AlnFileReader(p2aln)

# Reference-sequence metadata parsed from the header of the `.aln` file; the following
# cells use it as a mapping from refseqid to a metadata dict.
refseq_metadata = aln.parse_header_reference_sequences()

In [None]:
# Split the reference sequences according to whether their accession appears among the
# accessions parsed from the distance matrix.
included, not_included = [], []   # `included` holds accessions, `not_included` holds refseqids
for refseqid, meta in refseq_metadata.items():
    accession = meta['refseq_accession']
    if accession in distance_accessions:
        included.append(accession)
    else:
        not_included.append(refseqid)

# Report against the actual number of reference sequences rather than a hard-coded total.
print(f"{len(included)} out of {len(refseq_metadata)} accessions mapped with sequence in distance matrix")
print(f"Following {len(not_included)} sequences are not:")
print('\n'.join([f"  - {refseq_metadata[rsid]['refseq_accession']} {refseq_metadata[rsid]['organism']}" for rsid in not_included]))

46 out of 69 accessions mapped with sequence in distance matrix
Following 23 sequences are not:
  - DQ235229 Ethiopia_1961
  - MF405338 Ghana_Hsapiens_1927
  - JX898871 ArD114896_Senegal_1995
  - JX898872 Senegal_Aedes-aegypti_1995
  - DQ118157 Spain_Vaccine_2004
  - JX898879 ArD181676_Senegal_2005
  - JX898881 Senegal_Aedes_luteocephalus_2005
  - JX898880 ArD181564_Senegal_2005
  - MK457701 Nigeria_Hsapiens_2018
  - JF912183 Brazil_Hsapiens_1984
  - KM388817 Venezuela_Guarico_Allouetta_seniculus_2004
  - KM388816 Venezuela_Monagas_Asiniculus_2010
  - MK583166 Brazil_SaoPaulo_Hsapiens_2018
  - MK760665 Netherlands_Hsapiens-from-brazil_2018
  - MF370535 Brazil_Allouatta_sp_2016
  - MF370533 Brazil_Hsapiens_2017
  - MF370530 Brazil_Haemagogus-janthinomys_2017
  - MW960207 Yellow_fever_YF118_CAR_2018
  - KY495641 China_Hsapiens_2016
  - KX268355 China_Hsapiens_2016
  - KY587416 China_Hsapiens_2016
  - MF004383 432429_S4_MF004383
  - MW960207 yp


Create a file with the list of those reference sequences that are also in the distance matrix.

In [None]:
# Metadata of the reference sequences whose accession was matched in the distance matrix.
# A set gives constant-time membership checks instead of scanning the list for every sequence.
included_accessions = set(included)
included_meta = [meta for meta in refseq_metadata.values() if meta['refseq_accession'] in included_accessions]

# Same "refseqid,accession,organism" strings as before, kept under the same name for any
# later cell that relies on this list.
list_included_refeseqs = [f"{meta['refseqid']},{meta['refseq_accession']},{meta['organism']}" for meta in included_meta]

p2inclrefseqs = pfs.data / 'ncbi/other/yf/YF_2023-accessions-in-ds-and-distance-matrix.csv'

# Write through pandas so that a field containing a comma is quoted and the file ends
# with a newline; an empty selection still produces the header row.
pd.DataFrame(
    [(meta['refseqid'], meta['refseq_accession'], meta['organism']) for meta in included_meta],
    columns=['refseqid', 'refseq_accession', 'organism'],
).to_csv(p2inclrefseqs, index=False)

Load the list of sequences into a pandas DataFrame

In [None]:
# Read back the file written in the previous cell; its first column (`refseqid`) becomes the index.
df_refseqs = pd.read_csv(p2inclrefseqs, index_col=0)
# Preview the first three rows only.
df_refseqs.head(3)

Unnamed: 0_level_0,refseq_accession,organism
refseqid,Unnamed: 1_level_1,Unnamed: 2_level_1
11089:ncbi:1,AY968064,Angola_1971
11089:ncbi:2,U54798,Ivory_Coast_1982
11089:ncbi:4,AY572535,Gambia_2001


In [None]:
# For every selected reference sequence, print its accession, its refseqid and the
# distance-matrix column label it was matched to.
for refseqid, accession in df_refseqs['refseq_accession'].items():
    # Fall back to an empty dict so that an unknown accession prints 'not found'
    # instead of raising a TypeError from indexing a string default.
    definition_line = distance_accessions.get(accession, {}).get('definition line', 'not found')
    print(f"{accession:10s} {refseqid:15s} {definition_line}")

AY968064   11089:ncbi:1    AY968064_angola_1971
U54798     11089:ncbi:2    U54798_Ivory_Coast_1982
AY572535   11089:ncbi:4    AY572535_Gambia_2001
U21056     11089:ncbi:6    U21056_senegal_1927
AY968065   11089:ncbi:7    AY968065_Uganda_1948
GQ379163   11089:ncbi:10   GQ379163_Peru_2007
MF289572   11089:ncbi:12   MF289572_Singapore_2017
KU978764   11089:ncbi:13   KU978764_Sudan_1941
JX898878   11089:ncbi:14   JX898878_ArD181250_Senegal_2005
JX898877   11089:ncbi:18   JX898877_ArD181464_Senegal_2005
JX898876   11089:ncbi:19   JX898876_Senegal_2001
KU978765   11089:ncbi:20   KU978765_GuineaBissau_1965
JX898870   11089:ncbi:21   JX898870_senegal_1996
JX898868   11089:ncbi:22   JX898868_Senegal_1995
JX898875   11089:ncbi:23   JX898875_Senegal_2000
JX898874   11089:ncbi:24   JX898874_ArD149194_Senegal_2000
JX898873   11089:ncbi:25   JX898873_ArD149214_Senegal_2000
MK292067   11089:ncbi:26   MK292067_Netherlands_2018
MN958078   11089:ncbi:28   MN958078_Nigeria_2018
JX898869   11089:ncbi:29  

# End of Section