In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob
from Bio import SeqIO
from Bio import AlignIO
import os
from IPython.display import clear_output

# Confirming 5 metadata csv files

In [2]:
%ls AZ_202*_sequences_*.csv

AZ_2020_sequences_20240624_2432496.csv  AZ_2023_sequences_20240624_5090939.csv
AZ_2021_sequences_20240624_2015415.csv  AZ_2024_sequences_20240624_6787182.csv
AZ_2022_sequences_20240624_7946017.csv


In [3]:
%ls TX_202*_sequences_*.csv

TX_2020_sequences_20240624_3702594.csv  TX_2023_sequences_20240624_5486482.csv
TX_2021_sequences_20240624_7191386.csv  TX_2024_sequences_20240624_5024554.csv
TX_2022_sequences_20240624_1306680.csv


In [4]:
%ls US_202*_sequences_*.csv

US_2020_sequences_20240624_2460545.csv  US_2023_sequences_2023.csv
US_2021_sequences_20240624_8382214.csv  US_2024_sequences_2024.csv
US_2022_sequences_20240624_6470998.csv


## Extracting accession numbers for States isolates

In [8]:
!awk -F, 'FNR>1{print $1}' AZ_202*_sequences_*.csv > AZ_SARS-CoV-2_list.txt

In [9]:
!awk -F, 'FNR>1{print $1}' TX_202*_sequences_*.csv > TX_SARS-CoV-2_list.txt

In [10]:
!awk -F, 'FNR>1{print $1}' US_202*_sequences_*.csv > US_SARS-CoV-2_list.txt

In [11]:
!ls

[34mAZ[m[m
AZ_2020_sequences_20240624_2432496.csv
AZ_2021_sequences_20240624_2015415.csv
AZ_2022_sequences_20240624_7946017.csv
AZ_2023_sequences_20240624_5090939.csv
AZ_2024_sequences_20240624_6787182.csv
AZ_SARS-CoV-2_list.txt
Align_ordered.npy
Align_ordered_AZ.npy
Align_ordered_TX.npy
Align_ordered_US.npy
Alignment_HammingDist_Li Zhou.ipynb
COVID-19Casesanddeaths.csv
COVID19_Daily_DeathsDist_US.pdf
COVID19_DeathDist.pdf
COVID19_InciDist.pdf
COVID19_IncidenceDistance_US.pdf
Covid19_Deaths.pdf
Covid19_Dist.pdf
Covid19_GenDist.pdf
Covid19_Incidence.pdf
[34mCovidGen[m[m
Data_parsing1-Copy1.ipynb
Data_parsing1-Copy2.ipynb
Data_parsing1.ipynb
Data_parsing2.ipynb
Dates.csv
Dates_FL.csv
Distance_Data_US.eps
Distance_Data_US.png
Extra0.R
Extra1.R
Extra2.R
[34mFL[m[m
FL_2020_sequences_20240621_2996593.csv
FL_2021_sequences_20240621_9841249.csv
FL_2022_sequences_20240621_1148050.csv
FL_2023_sequences_20240621_1775388.csv
FL_2024_sequences_20240621

In [13]:
!cat AZ_SARS-CoV-2_list.txt

MZ909064.1
MZ907698.1
MZ518182.1
MZ906164.1
MZ505795.1
MZ462341.1
MZ917792.1
MZ907189.1
MZ915074.1
MZ433940.1
MZ462305.1
MZ906798.1
OK167013.1
MZ905724.1
MW634102.1
MZ907053.1
MZ543423.1
MW679003.1
MZ433532.1
ON456732.1
MZ472616.1
MZ905850.1
MZ473151.1
MZ910676.1
MZ473158.1
MZ433887.1
MZ906800.1
MZ543370.1
MZ918221.1
MZ472623.1
MZ906495.1
MZ905718.1
MZ504026.1
MZ505941.1
MZ433929.1
MZ905827.1
MZ906509.1
MZ909029.1
MW634121.1
MZ472622.1
MZ907296.1
MZ504273.1
MZ462309.1
MZ472666.1
MZ907093.1
OK465858.1
MZ544889.1
MZ908920.1
MZ905670.1
MZ544900.1
MZ911630.1
MZ473199.1
ON477723.1
MZ906099.1
OP249312.1
MZ518148.1
MZ433430.1
MZ503980.1
MZ433623.1
MZ543359.1
MZ473147.1
MZ433865.1
MZ503969.1
OP249402.1
MZ905721.1
MZ473131.1
ON456834.1
MZ433910.1
MZ433934.1
MW190450.1
ON456846.1
MZ905646.1
MZ918348.1
MZ906740.1
MZ518174.1
MZ905764.1
MW485392.1
MZ910031.1
MZ518191.1
OP248950.1
MZ462334.1
MZ905667.1
MZ433424.1
ON47

In [14]:
!cat TX_SARS-CoV-2_list.txt

MW836859.1
MW809196.1
MW809219.1
MW816562.1
MW738802.1
MW010235.1
MW763203.1
OM181289.1
MW736130.1
MW738786.1
OM181145.1
MW752921.1
MW897402.1
OM181205.1
OM181314.1
MW263336.1
OM181166.1
ON891360.1
MW799629.1
MW836892.1
MW836906.1
MW799636.1
MW815790.1
OM000284.1
MW763234.1
MW809228.1
MW419999.1
MW897391.1
OM181171.1
MW846049.1
MW550367.1
MW763235.1
MW826286.1
MW736093.1
MW846005.1
MW865476.1
MW846076.1
ON893961.1
MW773120.1
MW064449.1
OM181336.1
OM181352.1
MW752907.1
MW752956.1
MW809195.1
MW799767.1
OM181248.1
ON893956.1
MW521792.1
MW424842.1
MW738842.1
OM181342.1
MW826315.1
MW736110.1
MW846014.1
OM181218.1
MW863226.1
MW809235.1
MW420035.1
MW566557.1
OM181129.1
MW730963.1
MW826334.1
MW865461.1
OM181238.1
MW885939.1
MW863185.1
MW809258.1
MW752985.1
MW738844.1
MW846011.1
ON891370.1
OM181236.1
MW799597.1
MW738820.1
MW809273.1
MW093478.1
PP258414.1
ON891396.1
OM181372.1
MW846043.1
OM181358.1
MW738813.1
MZ25

In [15]:
!cat US_SARS-CoV-2_list.txt

PP603348.1
OL844281.1
OR311346.1
OQ315940.1
MW288261.1
ON134668.1
OL901545.1
ON358925.1
MW205984.1
OR938132.1
MW420445.1
MT873409.1
MW205982.1
MZ377862.1
MW738078.1
OR318709.1
MW190825.1
MT632631.1
MZ915424.1
MW763207.1
MZ471565.1
MZ456036.1
MW474203.1
MW495880.1
MW863252.1
MW483166.1
OP725881.1
OL756988.1
MZ391021.1
OR905268.1
OQ316212.1
OR316808.1
OR678723.1
MT412241.1
OL411659.1
PP141754.1
ON292984.1
OQ928466.1
MZ410018.1
MW460575.1
MZ028707.1
MT833986.1
ON292940.1
MZ377844.1
MW389605.1
OK623288.1
PP616559.1
MT450904.1
OM840233.1
OR309426.1
OR313636.1
MW974143.1
OK546947.1
MT520372.1
OK547734.1
OL698851.1
MW838394.2
OK546971.1
OL551607.1
MT791937.1
OQ316085.1
OK650174.1
OL467981.1
OR312169.1
OK657704.1
OL756400.1
OR300867.1
PP645492.1
MT506541.1
ON359357.1
OR310177.1
OR938114.1
ON194239.1
MW279429.1
OK653423.1
MW191275.1
OR306436.1
OR307936.1
PP603305.1
MW930477.1
MW035987.1
OR308613.1
MW341798.1
OR31

PP129245.1
PP129246.1
PP129247.1
PP129248.1
PP129250.1
PP129251.1
PP129252.1
PP129253.1
PP129255.1
PP129256.1
PP129258.1
PP129259.1
PP129260.1
PP129261.1
PP129262.1
PP129263.1
PP129264.1
PP129265.1
PP129266.1
PP129267.1
PP129268.1
PP129269.1
PP129270.1
PP129272.1
PP129273.1
PP129275.1
PP129276.1
PP129277.1
PP129278.1
PP129279.1
PP129282.1
PP129283.1
PP129284.1
PP129285.1
PP129286.1
PP129287.1
PP129289.1
PP129290.1
PP134812.1
PP106568.1
PP106570.1
PP111603.1
PP111628.1
PP846924.1
PP796949.1
PP796950.1
PP796951.1
PP796953.1
PP796957.1
PP796959.1
PP796965.1
PP796969.1
PP796970.1
PP796981.1
PP799530.1
PP740745.1
PP663348.1
PP490822.1
PP490953.1
PP490989.1
PP490997.1
PP491057.1
PP485300.1
PP485335.1
PP485405.1
PP485436.1
PP485514.1
PP466734.1
PP317770.1
PP315481.1
PP315567.1
PP315609.1
PP315617.1
PP315620.1
PP315633.1
PP316398.1
PP280609.1
PP280618.1
PP266523.1
PP266578.1
PP266579.1
PP236689.1
PP236691.1
PP23

## Download and install NCBI-datasets

In [22]:
conda install conda-forge::ncbi-datasets-cli

Retrieving notices: ...working... done
Channels:
 - conda-forge
 - bioconda
 - defaults
Platform: osx-64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/miniconda3/envs/ds

  added / updated specs:
    - conda-forge::ncbi-datasets-cli


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ncbi-datasets-cli-16.22.0  |       h694c41f_0        31.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:        31.1 MB

The following packages will be UPDATED:

  ncbi-datasets-cli                      16.21.0-h694c41f_0 --> 16.22.0-h694c41f_0 



Downloading and Extracting Packages:
                                                                                
Preparing transaction: done
  environment location: /Users/jordycevallos/.conda/enviro

In [None]:
# datasets download virus genome accession \
#execute in terminal
#pip install conda-forge::ncbi-datasets-cli
#datasets download virus genome accession --inputfile IL_SARS-CoV-2_list.txt --include cds
#datasets --version

In [None]:
#%cd ~/IMSI/Proj1/ncbi_dataset/data

/home/s/saumya2/IMSI/Proj1/ncbi_dataset/data


## Extract and verify coding region from the genome database

In [16]:
!awk '/surface glycoprotein/{flag=1} flag==1 && /^>/ && !/surface glycoprotein/{flag=0} flag==1{print}' < cds_TX.fna > cds_S_TX.fna

In [17]:
!awk '/>/{if(NR>1){print m,nam,ln}; m++; nam=$1;ln=0} !/>/{ln+=length($0)} END {print m,nam,ln}' < cds_S_TX.fna

1 >OR016361.1:21552-25361 3810
2 >OR075662.1:21504-25313 3810
3 >OR075797.1:21504-25313 3810
4 >OR186793.1:21504-25310 3807
5 >OR186845.1:21504-25313 3810
6 >OR226164.1:21504-25313 3810
7 >OR268393.1:21504-25313 3810
8 >OR364341.1:21504-25313 3810
9 >OR378815.1:21484-25293 3810
10 >OR439987.1:21504-25313 3810
11 >OR440013.1:21504-25313 3810
12 >OR440024.1:21504-25313 3810
13 >OR440044.1:21504-25313 3810
14 >OR465474.1:21504-25313 3810
15 >OR465778.1:21504-25313 3810
16 >OR465878.1:21504-25313 3810
17 >OR495047.1:21504-25313 3810
18 >OR495135.1:21504-25313 3810
19 >OR495255.1:21504-25313 3810
20 >OR517510.1:21495-25301 3807
21 >OR523084.1:21515-25324 3810
22 >OR558489.1:21504-25313 3810
23 >OR558498.1:21495-25304 3810
24 >OR558536.1:21504-25313 3810
25 >OR558551.1:21504-25313 3810
26 >OR558571.1:21504-25313 3810
27 >OR558577.1:21504-25313 3810
28 >OR558598.1:21504-25313 3810
29 >OR558634.1:21504-25313 3810
30 >OR558677.1:21504-25313 3810
31 >OR558685.1:21504-25313 3810
32 >OR558693.1:21

364 >PP436142.1:21504-25313 3810
365 >PP436143.1:21504-25313 3810
366 >PP436146.1:21504-25313 3810
367 >PP436148.1:21504-25313 3810
368 >PP436164.1:21504-25313 3810
369 >PP436173.1:21504-25313 3810
370 >PP436176.1:21504-25313 3810
371 >PP436198.1:21504-25313 3810
372 >PP436207.1:21504-25313 3810
373 >PP436221.1:21504-25313 3810
374 >PP436236.1:21504-25313 3810
375 >PP436237.1:21504-25313 3810
376 >PP436240.1:21504-25313 3810
377 >PP436245.1:21504-25313 3810
378 >PP436276.1:21504-25313 3810
379 >PP447023.1:21515-25324 3810
380 >PP447032.1:21482-25291 3810
381 >PP447033.1:21515-25324 3810
382 >PP447034.1:21515-25324 3810
383 >PP456441.1:21419-25228 3810
384 >PP456582.1:21452-25261 3810
385 >PP456595.1:21452-25258 3807
386 >PP470314.1:21504-25313 3810
387 >PP470389.1:21504-25313 3810
388 >PP470425.1:21504-25313 3810
389 >PP470427.1:21504-25313 3810
390 >PP470471.1:21504-25313 3810
391 >PP470525.1:21501-25310 3810
392 >PP470528.1:21504-25313 3810
393 >PP470540.1:21495-25304 3810
394 >PP494

676 >OQ399406.1:21498-25304 3807
677 >OQ399454.1:21504-25310 3807
678 >OQ399464.1:21504-25310 3807
679 >OQ399484.1:21504-25310 3807
680 >OQ399489.1:21504-25310 3807
681 >OQ399508.1:21504-25316 3813
682 >OQ399566.1:21504-25310 3807
683 >OQ399585.1:21504-25310 3807
684 >OQ399629.1:21504-25310 3807
685 >OQ399638.1:21504-25310 3807
686 >OQ399665.1:21504-25310 3807
687 >OQ399716.1:21504-25310 3807
688 >OQ399781.1:21498-25301 3804
689 >OQ399789.1:21501-25307 3807
690 >OQ399790.1:21504-25310 3807
691 >OQ399838.1:21504-25310 3807
692 >OQ400182.1:21504-25310 3807
693 >OQ400184.1:21504-25316 3813
694 >OQ400391.1:21504-25310 3807
695 >OQ400406.1:21504-25307 3804
696 >OQ400658.1:21504-25307 3804
697 >OQ400672.1:21504-25310 3807
698 >OQ400698.1:21504-25307 3804
699 >OQ400739.1:21501-25307 3807
700 >OQ400743.1:21504-25310 3807
701 >OQ400754.1:21504-25310 3807
702 >OQ403178.1:21452-25258 3807
703 >OQ403179.1:21515-25324 3810
704 >OQ403197.1:21515-25321 3807
705 >OQ403198.1:21515-25321 3807
706 >OQ419

1118 >OQ266259.1:21504-25310 3807
1119 >OQ266260.1:21504-25310 3807
1120 >OQ266263.1:21504-25310 3807
1121 >OQ266264.1:21504-25307 3804
1122 >OQ266277.1:21495-25301 3807
1123 >OQ274640.1:21504-25310 3807
1124 >OQ274646.1:21495-25301 3807
1125 >OQ274660.1:21504-25310 3807
1126 >OQ274667.1:21504-25310 3807
1127 >OQ274676.1:21504-25310 3807
1128 >OQ274679.1:21495-25301 3807
1129 >OQ274702.1:21504-25310 3807
1130 >OQ274703.1:21504-25310 3807
1131 >OQ274716.1:21495-25301 3807
1132 >OQ274763.1:21504-25310 3807
1133 >OQ274780.1:21504-25307 3804
1134 >OQ274791.1:21504-25310 3807
1135 >OQ274808.1:21504-25310 3807
1136 >OQ274813.1:21504-25310 3807
1137 >OQ284641.1:21554-25360 3807
1138 >OQ284642.1:21552-25358 3807
1139 >OQ284696.1:21554-25366 3813
1140 >OQ284801.1:21552-25364 3813
1141 >OQ284805.1:21545-25351 3807
1142 >OQ284850.1:21554-25360 3807
1143 >OQ284857.1:21552-25355 3804
1144 >OQ284890.1:21552-25355 3804
1145 >OQ284902.1:21554-25360 3807
1146 >OQ284908.1:21552-25358 3807
1147 >OQ284910

1562 >ON893919.1:21461-25282 3822
1563 >ON893921.1:21491-25312 3822
1564 >ON893922.1:21524-25345 3822
1565 >ON893925.1:21491-25312 3822
1566 >ON893926.1:21524-25345 3822
1567 >ON893930.1:21524-25345 3822
1568 >ON893934.1:21524-25345 3822
1569 >ON893936.1:21524-25345 3822
1570 >ON893937.1:21461-25282 3822
1571 >ON893942.1:21524-25345 3822
1572 >ON893943.1:21491-25312 3822
1573 >ON893947.1:21524-25345 3822
1574 >ON893948.1:21524-25345 3822
1575 >ON893953.1:21524-25345 3822
1576 >ON893955.1:21524-25345 3822
1577 >ON893956.1:21524-25345 3822
1578 >ON893959.1:21524-25345 3822
1579 >ON893960.1:21524-25345 3822
1580 >ON893961.1:21524-25345 3822
1581 >ON893963.1:21524-25345 3822
1582 >ON893965.1:21524-25345 3822
1583 >ON898619.1:21552-25364 3813
1584 >ON898626.1:21543-25349 3807
1585 >ON898637.1:21554-25366 3813
1586 >ON898655.1:21545-25351 3807
1587 >ON898743.1:21552-25358 3807
1588 >ON898746.1:21543-25349 3807
1589 >ON898803.1:21552-25364 3813
1590 >ON898935.1:21552-25364 3813
1591 >ON899089

2015 >OM443389.1:21530-25342 3813
2016 >OM443429.1:21514-25326 3813
2017 >OM443450.1:21514-25326 3813
2018 >OM443608.1:21539-25351 3813
2019 >OM446060.1:21514-25317 3804
2020 >OM446158.1:21540-25352 3813
2021 >OM446200.1:21514-25326 3813
2022 >OM446242.1:21514-25326 3813
2023 >OM446392.1:21547-25359 3813
2024 >OM446430.1:21514-25326 3813
2025 >OM446446.1:21547-25359 3813
2026 >OM446500.1:21514-25326 3813
2027 >OM446589.1:21514-25326 3813
2028 >OM446803.1:21547-25359 3813
2029 >OM446844.1:21514-25326 3813
2030 >OM446883.1:21514-25326 3813
2031 >OM446955.1:21514-25326 3813
2032 >OM447181.1:21514-25326 3813
2033 >OM447222.1:21547-25359 3813
2034 >OM447245.1:21514-25326 3813
2035 >OM447322.1:21514-25326 3813
2036 >OM447402.1:21514-25326 3813
2037 >OM447446.1:21514-25326 3813
2038 >OM447820.1:21514-25326 3813
2039 >OM448583.1:21523-25335 3813
2040 >OM449129.1:21520-25332 3813
2041 >OM450756.1:21526-25338 3813
2042 >OM450777.1:21526-25338 3813
2043 >OM455572.1:21497-25309 3813
2044 >OM455635

2471 >OL841830.1:21509-25324 3816
2472 >OL846028.1:21509-25324 3816
2473 >OL846037.1:21509-25324 3816
2474 >OL846046.1:21509-25324 3816
2475 >OL860811.1:21538-25353 3816
2476 >OL865810.1:21509-25324 3816
2477 >OL865993.1:21509-25324 3816
2478 >OL866047.1:21509-25324 3816
2479 >OL866058.1:21509-25324 3816
2480 >OL882162.1:21524-25339 3816
2481 >OL885627.1:21538-25353 3816
2482 >OL887113.1:21531-25346 3816
2483 >OL887706.1:21538-25353 3816
2484 >OL888422.1:21538-25353 3816
2485 >OL889159.1:21538-25353 3816
2486 >OL896363.1:21561-25376 3816
2487 >OL896502.1:21561-25376 3816
2488 >OL896599.1:21563-25378 3816
2489 >OL896739.1:21563-25378 3816
2490 >OL898844.1:21526-25338 3813
2491 >OL906153.1:21538-25353 3816
2492 >OL906155.1:21526-25341 3816
2493 >OL906156.1:21526-25341 3816
2494 >OL906158.1:21526-25341 3816
2495 >OL906160.1:21526-25341 3816
2496 >OL906163.1:21526-25341 3816
2497 >OL915379.1:21428-25243 3816
2498 >OL915436.1:21476-25291 3816
2499 >OL921608.1:21538-25353 3816
2500 >OL922539

2891 >OM258562.1:21509-25324 3816
2892 >OM260365.1:21526-25338 3813
2893 >OM260377.1:21526-25347 3822
2894 >OM260523.1:21526-25338 3813
2895 >OM260576.1:21526-25338 3813
2896 >OM260588.1:21526-25338 3813
2897 >OM260913.1:21526-25338 3813
2898 >OM260935.1:21526-25338 3813
2899 >OM260948.1:21526-25338 3813
2900 >OM261047.1:21526-25338 3813
2901 >OM261300.1:21526-25338 3813
2902 >OM261310.1:21526-25338 3813
2903 >OM261454.1:21526-25338 3813
2904 >OM261617.1:21526-25338 3813
2905 >OM261638.1:21526-25338 3813
2906 >OM261640.1:21526-25338 3813
2907 >OM261889.1:21526-25338 3813
2908 >OM270042.1:21521-25336 3816
2909 >OM270634.1:21521-25336 3816
2910 >OM270642.1:21450-25262 3813
2911 >OM272071.1:21526-25338 3813
2912 >OM272104.1:21526-25338 3813
2913 >OM272248.1:21526-25338 3813
2914 >OM272504.1:21526-25338 3813
2915 >OM272563.1:21526-25338 3813
2916 >OM272648.1:21526-25338 3813
2917 >OM282864.1:21509-25324 3816
2918 >OM282867.1:21509-25324 3816
2919 >OM282887.1:21523-25338 3816
2920 >OM282895

3327 >MZ714911.1:21500-25312 3813
3328 >MZ727758.1:21494-25315 3822
3329 >MZ740809.1:21533-25348 3816
3330 >MZ744601.1:21533-25348 3816
3331 >MZ747305.1:21530-25351 3822
3332 >MZ750317.1:21524-25336 3813
3333 >MZ785823.1:21515-25327 3813
3334 >MZ848801.1:21419-25231 3813
3335 >MZ881659.1:21512-25324 3813
3336 >OK007358.1:21523-25344 3822
3337 >OK022969.1:21533-25348 3816
3338 >OK125078.1:21515-25327 3813
3339 >OK125244.1:21482-25303 3822
3340 >OK128830.1:21500-25312 3813
3341 >OK129136.1:21500-25312 3813
3342 >OK191262.1:21524-25336 3813
3343 >OK191625.1:21533-25354 3822
3344 >OK192832.1:21524-25336 3813
3345 >OK230601.1:21515-25327 3813
3346 >OK231166.1:21452-25273 3822
3347 >OK235125.1:21460-25281 3822
3348 >OK248136.1:21500-25312 3813
3349 >OK248308.1:21500-25312 3813
3350 >OK248811.1:21500-25312 3813
3351 >OK249168.1:21500-25312 3813
3352 >OK249441.1:21500-25312 3813
3353 >OK249585.1:21500-25321 3822
3354 >OK249689.1:21500-25312 3813
3355 >OK252916.1:21497-25309 3813
3356 >OK291653

3765 >MW846045.1:21509-25330 3822
3766 >MW846049.1:21509-25330 3822
3767 >MW846050.1:21517-25338 3822
3768 >MW846052.1:21509-25330 3822
3769 >MW846053.1:21509-25330 3822
3770 >MW846054.1:21509-25330 3822
3771 >MW846055.1:21509-25330 3822
3772 >MW846056.1:21509-25330 3822
3773 >MW846057.1:21509-25330 3822
3774 >MW846058.1:21509-25330 3822
3775 >MW846059.1:21509-25330 3822
3776 >MW846061.1:21509-25330 3822
3777 >MW846062.1:21509-25330 3822
3778 >MW846064.1:21509-25330 3822
3779 >MW846065.1:21509-25330 3822
3780 >MW846066.1:21509-25330 3822
3781 >MW846067.1:21509-25330 3822
3782 >MW846068.1:21509-25330 3822
3783 >MW846069.1:21509-25330 3822
3784 >MW846071.1:21521-25342 3822
3785 >MW846074.1:21509-25330 3822
3786 >MW846075.1:21509-25330 3822
3787 >MW846076.1:21509-25330 3822
3788 >MW846077.1:21509-25330 3822
3789 >MW849028.1:21509-25330 3822
3790 >MW849207.1:21509-25330 3822
3791 >MW849217.1:21509-25330 3822
3792 >MW851085.1:21527-25348 3822
3793 >MW851817.1:21537-25358 3822
3794 >MW855728

4207 >MW747950.1:21509-25330 3822
4208 >MW747990.1:21509-25330 3822
4209 >MW747991.1:21509-25330 3822
4210 >MW747993.1:21509-25330 3822
4211 >MW747994.1:21509-25330 3822
4212 >MW749350.1:21509-25330 3822
4213 >MW749582.1:21509-25330 3822
4214 >MW749634.1:21509-25330 3822
4215 >MW749636.1:21509-25330 3822
4216 >MW752907.1:21509-25330 3822
4217 >MW752908.1:21509-25330 3822
4218 >MW752909.1:21509-25330 3822
4219 >MW752910.1:21509-25330 3822
4220 >MW752911.1:21509-25330 3822
4221 >MW752913.1:21509-25330 3822
4222 >MW752914.1:21509-25330 3822
4223 >MW752916.1:21509-25330 3822
4224 >MW752917.1:21509-25330 3822
4225 >MW752918.1:21509-25330 3822
4226 >MW752919.1:21509-25330 3822
4227 >MW752920.1:21509-25330 3822
4228 >MW752921.1:21509-25330 3822
4229 >MW752922.1:21509-25330 3822
4230 >MW752923.1:21509-25330 3822
4231 >MW752926.1:21509-25330 3822
4232 >MW752928.1:21509-25330 3822
4233 >MW752934.1:21509-25330 3822
4234 >MW752937.1:21509-25330 3822
4235 >MW752948.1:215

In [18]:
!awk '/surface glycoprotein/{flag=1} flag==1 && /^>/ && !/surface glycoprotein/{flag=0} flag==1{print}' < cds_US.fna > cds_S_US.fna


In [19]:
!awk '/>/{if(NR>1){print m,nam,ln}; m++; nam=$1;ln=0} !/>/{ln+=length($0)} END {print m,nam,ln}' < cds_S_US.fna

1 >OR306471.1:21527-25348 3822
2 >OR306526.1:21527-25348 3822
3 >OR306536.1:21525-25346 3822
4 >OR306589.1:21519-25340 3822
5 >OR306758.1:21527-25348 3822
6 >OR306792.1:21525-25346 3822
7 >OR306795.1:21525-25346 3822
8 >OR306806.1:21525-25346 3822
9 >OR306844.1:21525-25346 3822
10 >OR306899.1:21533-25354 3822
11 >OR306911.1:21525-25346 3822
12 >OR306940.1:21525-25346 3822
13 >OR307013.1:21530-25351 3822
14 >OR307014.1:21526-25347 3822
15 >OR307223.1:21533-25354 3822
16 >OR307244.1:21527-25348 3822
17 >OR307279.1:21533-25354 3822
18 >OR307310.1:21532-25353 3822
19 >OR307410.1:21525-25346 3822
20 >OR307416.1:21525-25346 3822
21 >OR307418.1:21547-25368 3822
22 >OR307441.1:21525-25346 3822
23 >OR307449.1:21525-25346 3822
24 >OR307453.1:21525-25346 3822
25 >OR307526.1:21527-25348 3822
26 >OR307537.1:21525-25346 3822
27 >OR307667.1:21533-25354 3822
28 >OR307777.1:21526-25347 3822
29 >OR307936.1:21555-25376 3822
30 >OR307953.1:21525-25346 3822
31 >OR307959.1:21526-25347 3822
32 >OR307997.1:21

421 >PP602158.1:21509-25330 3822
422 >PP602160.1:21509-25330 3822
423 >PP602237.1:21509-25330 3822
424 >PP603295.1:21525-25346 3822
425 >PP603302.1:21525-25346 3822
426 >PP603305.1:21525-25346 3822
427 >PP603317.1:21525-25346 3822
428 >PP603348.1:21525-25346 3822
429 >PP604037.1:21516-25334 3819
430 >PP604466.1:21528-25349 3822
431 >PP604537.1:21527-25348 3822
432 >PP604574.1:21516-25337 3822
433 >PP605036.1:21516-25334 3819
434 >PP605767.1:21525-25346 3822
435 >PP605771.1:21516-25334 3819
436 >PP606438.1:21525-25346 3822
437 >PP606790.1:21525-25346 3822
438 >PP606858.1:21516-25337 3822
439 >PP606883.1:21515-25336 3822
440 >PP607024.1:21515-25336 3822
441 >PP607048.1:21515-25336 3822
442 >PP607166.1:21516-25337 3822
443 >PP608548.1:21516-25337 3822
444 >PP609521.1:21516-25337 3822
445 >PP609857.1:21516-25337 3822
446 >PP609967.1:21516-25337 3822
447 >PP610249.1:21516-25334 3819
448 >PP612255.1:21516-25334 3819
449 >PP613789.1:21516-25337 3822
450 >PP614249.1:21525-25340 3816
451 >PP614

855 >OR056884.1:21503-25318 3816
856 >OR086136.1:21497-25309 3813
857 >OR177626.1:21526-25347 3822
858 >OR180384.1:21525-25340 3816
859 >OR180443.1:21513-25325 3813
860 >OR212108.1:21529-25335 3807
861 >OR212150.1:21529-25335 3807
862 >OR213601.1:21529-25335 3807
863 >OR299413.1:21527-25348 3822
864 >OR299619.1:21527-25348 3822
865 >OR299634.1:21527-25348 3822
866 >OR299650.1:21526-25347 3822
867 >OR299757.1:21526-25347 3822
868 >OR299788.1:21561-25382 3822
869 >OR299802.1:21558-25379 3822
870 >OR299809.1:21547-25368 3822
871 >OR299852.1:21544-25365 3822
872 >OR299929.1:21552-25373 3822
873 >OR299948.1:21559-25380 3822
874 >OR299976.1:21525-25346 3822
875 >OR300004.1:21556-25377 3822
876 >OR300033.1:21547-25368 3822
877 >OR300117.1:21525-25346 3822
878 >OR300291.1:21527-25348 3822
879 >OR300328.1:21530-25351 3822
880 >OR300406.1:21529-25350 3822
881 >OR300435.1:21563-25363 3801
882 >OR300440.1:21527-25348 3822
883 >OR300444.1:21526-25347 3822
884 >OR300456.1:21527-25348 3822
885 >OR300

1309 >OP567183.1:21531-25337 3807
1310 >OP567853.1:21504-25310 3807
1311 >OP568145.1:21504-25310 3807
1312 >OP568188.1:21504-25310 3807
1313 >OP568240.1:21504-25310 3807
1314 >OP568380.1:21504-25307 3804
1315 >OP568826.1:21504-25310 3807
1316 >OP569825.1:21495-25301 3807
1317 >OP570025.1:21504-25310 3807
1318 >OP570462.1:21554-25360 3807
1319 >OP570545.1:21552-25358 3807
1320 >OP570557.1:21552-25358 3807
1321 >OP570672.1:21552-25364 3813
1322 >OP571194.1:21552-25358 3807
1323 >OP571302.1:21504-25310 3807
1324 >OP571559.1:21504-25310 3807
1325 >OP571681.1:21504-25310 3807
1326 >OP576343.1:21495-25301 3807
1327 >OP577027.1:21504-25310 3807
1328 >OP578313.1:21524-25345 3822
1329 >OP579698.1:21504-25310 3807
1330 >OP582738.1:21515-25321 3807
1331 >OP590789.1:21553-25359 3807
1332 >OP591096.1:21552-25358 3807
1333 >OP591278.1:21552-25358 3807
1334 >OP591315.1:21552-25358 3807
1335 >OP592040.1:21529-25335 3807
1336 >OP607699.1:21552-25358 3807
1337 >OP608842.1:21495-25301 3807
1338 >OP608994

1755 >ON917182.1:21506-25312 3807
1756 >ON917259.1:21443-25249 3807
1757 >ON917996.1:21515-25321 3807
1758 >ON918020.1:21482-25294 3813
1759 >ON918472.1:21444-25250 3807
1760 >ON923573.1:21529-25341 3813
1761 >ON925061.1:21560-25375 3816
1762 >ON928021.1:21551-25363 3813
1763 >ON928046.1:21551-25363 3813
1764 >ON928149.1:21551-25363 3813
1765 >ON938667.1:21504-25310 3807
1766 >ON939292.1:21554-25366 3813
1767 >ON940897.1:21419-25225 3807
1768 >ON941147.1:21452-25264 3813
1769 >ON942259.2:21529-25341 3813
1770 >ON944230.1:21490-25302 3813
1771 >ON947218.1:21505-25317 3813
1772 >ON947384.1:21532-25344 3813
1773 >ON947405.1:21531-25343 3813
1774 >ON949136.1:21482-25288 3807
1775 >ON949155.1:21473-25279 3807
1776 >ON949568.1:21452-25258 3807
1777 >ON949737.1:21515-25327 3813
1778 >ON951878.1:21379-25191 3813
1779 >ON952220.1:21495-25301 3807
1780 >ON954079.1:21503-25318 3816
1781 >ON954176.1:21499-25314 3816
1782 >ON957024.1:21553-25359 3807
1783 >ON957081.1:21545-25351 3807
1784 >ON957208

2204 >ON336300.1:21504-25316 3813
2205 >ON336360.1:21504-25316 3813
2206 >ON336465.1:21504-25316 3813
2207 >ON336557.1:21504-25316 3813
2208 >ON337663.1:21485-25297 3813
2209 >ON338281.1:21494-25309 3816
2210 >ON339029.1:21549-25361 3813
2211 >ON339032.1:21538-25353 3816
2212 >ON339061.1:21538-25353 3816
2213 >ON339393.1:21495-25307 3813
2214 >ON339551.1:21563-25384 3822
2215 >ON339557.1:21560-25381 3822
2216 >ON339604.1:21553-25374 3822
2217 >ON339608.1:21561-25382 3822
2218 >ON339611.1:21563-25384 3822
2219 >ON339770.1:21538-25359 3822
2220 >ON339823.1:21559-25380 3822
2221 >ON339842.1:21538-25359 3822
2222 >ON339887.1:21561-25382 3822
2223 >ON339912.1:21559-25380 3822
2224 >ON339941.1:21556-25377 3822
2225 >ON339949.1:21558-25379 3822
2226 >ON339951.1:21562-25383 3822
2227 >ON340021.1:21559-25380 3822
2228 >ON340082.1:21561-25382 3822
2229 >ON340134.1:21561-25382 3822
2230 >ON340164.1:21560-25381 3822
2231 >ON340171.1:21538-25359 3822
2232 >ON340202.1:21538-25359 3822
2233 >ON340236

2640 >OM636173.1:21513-25325 3813
2641 >OM636195.1:21509-25324 3816
2642 >OM637011.1:21509-25321 3813
2643 >OM638978.1:21501-25313 3813
2644 >OM642904.1:21555-25370 3816
2645 >OM644007.1:21526-25338 3813
2646 >OM644876.1:21526-25338 3813
2647 >OM645599.1:21509-25321 3813
2648 >OM646130.1:21509-25321 3813
2649 >OM646258.1:21509-25321 3813
2650 >OM646362.1:21509-25321 3813
2651 >OM648539.1:21497-25300 3804
2652 >OM651723.1:21514-25326 3813
2653 >OM652247.1:21514-25326 3813
2654 >OM658846.1:21547-25359 3813
2655 >OM660357.1:21509-25321 3813
2656 >OM662062.1:21501-25313 3813
2657 >OM662098.1:21501-25313 3813
2658 >OM662222.1:21501-25313 3813
2659 >OM663027.1:21539-25351 3813
2660 >OM663517.1:21497-25309 3813
2661 >OM664372.1:21551-25363 3813
2662 >OM665735.1:21501-25313 3813
2663 >OM666999.1:21501-25313 3813
2664 >OM669072.1:21514-25326 3813
2665 >OM669146.1:21523-25335 3813
2666 >OM674294.1:21541-25356 3816
2667 >OM676088.1:21538-25350 3813
2668 >OM676630.1:21509-25324 3816
2669 >OM680038

3074 >OM082631.1:21514-25329 3816
3075 >OM085319.1:21509-25324 3816
3076 >OM085441.1:21509-25324 3816
3077 >OM085646.1:21497-25309 3813
3078 >OM085785.1:21514-25317 3804
3079 >OM086405.1:21539-25354 3816
3080 >OM086645.1:21514-25326 3813
3081 >OM087087.1:21514-25317 3804
3082 >OM087348.1:21514-25326 3813
3083 >OM087666.1:21514-25326 3813
3084 >OM087712.1:21514-25326 3813
3085 >OM088003.1:21514-25326 3813
3086 >OM088116.1:21514-25326 3813
3087 >OM090244.1:21538-25353 3816
3088 >OM090810.1:21526-25341 3816
3089 >OM090900.1:21553-25368 3816
3090 >OM091068.1:21547-25359 3813
3091 >OM091708.1:21514-25317 3804
3092 >OM092190.1:21514-25326 3813
3093 >OM092519.1:21547-25359 3813
3094 >OM093538.1:21514-25326 3813
3095 >OM093610.1:21514-25326 3813
3096 >OM093789.1:21547-25359 3813
3097 >OM093810.1:21514-25326 3813
3098 >OM094350.1:21526-25341 3816
3099 >OM094352.1:21530-25342 3813
3100 >OM094895.1:21514-25326 3813
3101 >OM094972.1:21563-25384 3822
3102 >OM095004.1:21563-25384 3822
3103 >OM095018

3513 >OL752367.1:21561-25376 3816
3514 >OL753827.1:21521-25336 3816
3515 >OL754309.1:21521-25336 3816
3516 >OL754420.1:21521-25336 3816
3517 >OL756400.1:21525-25346 3822
3518 >OL756554.1:21524-25345 3822
3519 >OL756592.1:21523-25344 3822
3520 >OL756631.1:21518-25339 3822
3521 >OL756745.1:21525-25346 3822
3522 >OL756752.1:21525-25346 3822
3523 >OL756860.1:21524-25345 3822
3524 >OL756953.1:21547-25368 3822
3525 >OL756987.1:21514-25335 3822
3526 >OL756988.1:21516-25337 3822
3527 >OL757003.1:21518-25339 3822
3528 >OL757018.1:21523-25344 3822
3529 >OL757025.1:21516-25337 3822
3530 >OL757031.1:21524-25345 3822
3531 >OL757722.1:21533-25348 3816
3532 >OL758225.1:21517-25332 3816
3533 >OL759050.1:21517-25332 3816
3534 >OL759078.1:21517-25332 3816
3535 >OL759272.1:21538-25353 3816
3536 >OL759345.1:21538-25353 3816
3537 >OL759395.1:21535-25350 3816
3538 >OL759874.1:21538-25353 3816
3539 >OL760502.1:21538-25353 3816
3540 >OL760626.1:21532-25347 3816
3541 >OL760870.1:21538-25353 3816
3542 >OL761049

3956 >OM011495.1:21521-25336 3816
3957 >OM012228.1:21526-25341 3816
3958 >OM012589.1:21526-25341 3816
3959 >OM012803.1:21514-25317 3804
3960 >OM012906.1:21538-25350 3813
3961 >OM013509.1:21514-25326 3813
3962 >OM013962.1:21559-25374 3816
3963 >OM014270.1:21509-25324 3816
3964 >OM014438.1:21509-25324 3816
3965 >OM014473.1:21554-25369 3816
3966 >OM014730.1:21509-25324 3816
3967 >OM015172.1:21509-25324 3816
3968 >OM015913.1:21538-25353 3816
3969 >OM016187.1:21526-25338 3813
3970 >OM016266.1:21538-25353 3816
3971 >OM016548.1:21538-25353 3816
3972 >OM016586.1:21526-25338 3813
3973 >MZ473180.1:21533-25354 3822
3974 >MZ473188.1:21533-25354 3822
3975 >MZ473205.1:21533-25354 3822
3976 >MZ473379.1:21509-25330 3822
3977 >MZ473404.1:21509-25330 3822
3978 >MZ473407.1:21509-25330 3822
3979 >MZ473466.1:21509-25330 3822
3980 >MZ473556.1:21509-25330 3822
3981 >MZ473747.1:21509-25330 3822
3982 >MZ473752.1:21509-25330 3822
3983 >MZ473867.1:21509-25330 3822
3984 >MZ473870.1:21509-25330 3822
3985 >MZ473871

4403 >OL697227.1:21524-25339 3816
4404 >OL698851.1:21563-25384 3822
4405 >OL701589.1:21512-25333 3822
4406 >OL701626.1:21512-25333 3822
4407 >OL701728.1:21533-25354 3822
4408 >OL701777.1:21533-25354 3822
4409 >OL701824.1:21533-25354 3822
4410 >OL704419.1:21525-25346 3822
4411 >OL705507.1:21560-25375 3816
4412 >OL705855.1:21525-25340 3816
4413 >OL706674.1:21526-25341 3816
4414 >OL706964.1:21509-25330 3822
4415 >OL707453.1:21509-25324 3816
4416 >OL707528.1:21509-25324 3816
4417 >OL708016.1:21521-25336 3816
4418 >OL708050.1:21521-25336 3816
4419 >OL708093.1:21521-25336 3816
4420 >OL708165.1:21521-25336 3816
4421 >OL708168.1:21521-25336 3816
4422 >OL708303.1:21521-25336 3816
4423 >OL708892.1:21509-25324 3816
4424 >OL709071.1:21509-25324 3816
4425 >OL709317.1:21509-25324 3816
4426 >OL710562.1:21538-25353 3816
4427 >OL710749.1:21526-25341 3816
4428 >OL711153.1:21526-25341 3816
4429 >OL711177.1:21526-25341 3816
4430 >OL711564.1:21526-25341 3816
4431 >OL714319.1:21517-25332 3816
4432 >OL714471

4852 >MZ350662.1:21509-25330 3822
4853 >MZ350705.1:21532-25353 3822
4854 >MZ351820.1:21517-25329 3813
4855 >MZ354003.1:21500-25312 3813
4856 >MZ354247.1:21519-25340 3822
4857 >MZ354480.1:21524-25345 3822
4858 >MZ354530.1:21524-25336 3813
4859 >MZ354760.1:21512-25324 3813
4860 >MZ358639.1:21551-25363 3813
4861 >MZ358738.1:21553-25365 3813
4862 >MZ362907.1:21524-25336 3813
4863 >MZ363784.1:21514-25335 3822
4864 >MZ363800.1:21546-25367 3822
4865 >MZ363804.1:21533-25354 3822
4866 >MZ367649.1:21542-25363 3822
4867 >MZ371332.1:21515-25327 3813
4868 >MZ371502.1:21515-25327 3813
4869 >MZ372100.1:21515-25336 3822
4870 >MZ372490.1:21515-25327 3813
4871 >MZ373363.1:21531-25352 3822
4872 >MZ373521.1:21533-25354 3822
4873 >MZ373541.1:21548-25369 3822
4874 >MZ373666.1:21527-25348 3822
4875 >MZ373681.1:21534-25355 3822
4876 >MZ373690.1:21527-25348 3822
4877 >MZ373831.1:21533-25354 3822
4878 >MZ373899.1:21528-25349 3822
4879 >MZ373913.1:21533-25354 3822
4880 >MZ376028.1:21554-25366 3813
4881 >MZ377573

5302 >MW863792.1:21525-25346 3822
5303 >MW863794.1:21560-25381 3822
5304 >MW863807.1:21560-25381 3822
5305 >MW863947.1:21560-25381 3822
5306 >MW863965.1:21559-25380 3822
5307 >MW863978.1:21559-25380 3822
5308 >MW864345.1:21509-25330 3822
5309 >MW864877.1:21500-25312 3813
5310 >MW864934.1:21500-25312 3813
5311 >MW865625.1:21500-25312 3813
5312 >MW868450.1:21532-25353 3822
5313 >MW868453.1:21533-25354 3822
5314 >MW868699.1:21553-25374 3822
5315 >MW869254.1:21554-25366 3813
5316 >MW870338.1:21509-25330 3822
5317 >MW876869.1:21545-25366 3822
5318 >MW876988.1:21564-25385 3822
5319 >MW877048.1:21520-25332 3813
5320 >MW880568.1:21500-25321 3822
5321 >MW880581.1:21500-25321 3822
5322 >MW880594.1:21509-25330 3822
5323 >MW882464.1:21535-25356 3822
5324 >MW882600.1:21536-25357 3822
5325 >MW882613.1:21536-25357 3822
5326 >MW882689.1:21538-25359 3822
5327 >MW882744.1:21538-25359 3822
5328 >MW882772.1:21509-25330 3822
5329 >MW883220.1:21500-25321 3822
5330 >MW883252.1:21561-25382 3822
5331 >MW883255

5745 >MW065074.1:21509-25330 3822
5746 >MW065113.1:21509-25330 3822
5747 >MW065237.1:21509-25330 3822
5748 >MW065257.1:21509-25330 3822
5749 >MW065305.1:21509-25330 3822
5750 >MW065381.1:21509-25330 3822
5751 >MW065403.1:21509-25330 3822
5752 >MW065415.1:21509-25330 3822
5753 >MW065440.1:21509-25330 3822
5754 >MW065445.1:21509-25330 3822
5755 >MW067701.1:21560-25381 3822
5756 >MW067705.1:21559-25380 3822
5757 >MW067714.1:21553-25374 3822
5758 >MW067805.1:21525-25346 3822
5759 >MW067806.1:21563-25384 3822
5760 >MW070041.1:21527-25348 3822
5761 >MW077476.1:21548-25369 3822
5762 >MW079836.1:21549-25370 3822
5763 >MW086915.1:21525-25346 3822
5764 >MW134016.1:21546-25367 3822
5765 >MW134126.1:21358-25179 3822
5766 >MW134333.1:21561-25382 3822
5767 >MW136860.1:21533-25354 3822
5768 >MW136863.1:21525-25346 3822
5769 >MW166145.1:21561-25382 3822
5770 >MW190216.1:21509-25330 3822
5771 >MW190267.1:21509-25330 3822
5772 >MW190287.1:21509-25330 3822
5773 >MW190350.1:21509-25330 3822
5774 >MW190356

In [21]:
!awk '/surface glycoprotein/{flag=1} flag==1 && /^>/ && !/surface glycoprotein/{flag=0} flag==1{print}' < cds_AZ.fna > cds_S_AZ.fna


In [22]:
!awk '/>/{if(NR>1){print m,nam,ln}; m++; nam=$1;ln=0} !/>/{ln+=length($0)} END {print m,nam,ln}' < cds_S_AZ.fna

1 >OR517517.1:21504-25310 3807
2 >OR558475.1:21504-25316 3813
3 >OR558600.1:21504-25313 3810
4 >OR559014.1:21554-25363 3810
5 >OR559535.1:21515-25324 3810
6 >OR564236.1:21504-25313 3810
7 >OR564238.1:21504-25316 3813
8 >OR564260.1:21504-25313 3810
9 >OR564308.1:21504-25313 3810
10 >OR569218.1:21515-25306 3792
11 >OR599659.1:21504-25313 3810
12 >OR599681.1:21504-25313 3810
13 >OR599683.1:21504-25313 3810
14 >OR619044.1:21504-25310 3807
15 >OR619205.1:21504-25313 3810
16 >OR624251.1:21504-25313 3810
17 >OR624297.1:21504-25313 3810
18 >OR649370.1:21452-25261 3810
19 >OR662235.1:21504-25316 3813
20 >OR681592.1:21452-25261 3810
21 >OR681631.1:21482-25291 3810
22 >OR681685.1:21515-25324 3810
23 >OR708403.1:21504-25313 3810
24 >OR708452.1:21504-25310 3807
25 >OR816353.1:21504-25313 3810
26 >OR816965.1:21504-25313 3810
27 >OR832482.1:21504-25313 3810
28 >OR833622.1:21515-25324 3810
29 >OR857897.1:21504-25313 3810
30 >OR857925.1:21504-25313 3810
31 >OR858110.1:21504-25313 3810
32 >OR864432.1:21

403 >PP581441.1:21495-25304 3810
404 >PP581637.1:21515-25324 3810
405 >PP581650.1:21482-25291 3810
406 >PP581657.1:21452-25261 3810
407 >PP581664.1:21482-25291 3810
408 >PP584651.1:21500-25309 3810
409 >PP600354.1:21482-25291 3810
410 >PP600355.1:21482-25291 3810
411 >PP600357.1:21476-25285 3810
412 >PP600358.1:21419-25228 3810
413 >PP600367.1:21470-25279 3810
414 >PP600368.1:21419-25228 3810
415 >PP600465.1:21482-25291 3810
416 >PP663918.1:21515-25324 3810
417 >PP709074.1:21495-25304 3810
418 >PP709077.1:21498-25307 3810
419 >PP709084.1:21504-25313 3810
420 >PP709085.1:21504-25313 3810
421 >PP709089.1:21504-25313 3810
422 >PP709098.1:21504-25313 3810
423 >PP709102.1:21504-25313 3810
424 >PP709123.1:21504-25313 3810
425 >PP709125.1:21504-25313 3810
426 >PP709143.1:21504-25313 3810
427 >PP716618.1:21515-25324 3810
428 >PP716622.1:21515-25324 3810
429 >PP716624.1:21515-25324 3810
430 >PP716625.1:21515-25324 3810
431 >PP716629.1:21452-25261 3810
432 >PP741889.1:21482-25291 3810
433 >PP762

820 >OQ619500.1:21523-25329 3807
821 >OQ619501.1:21510-25316 3807
822 >OQ619506.1:21520-25326 3807
823 >OQ619509.1:21505-25311 3807
824 >OQ619511.1:21515-25321 3807
825 >OQ619527.1:21490-25296 3807
826 >OQ619535.1:21547-25368 3822
827 >OQ619541.1:21505-25311 3807
828 >OQ619549.1:21509-25315 3807
829 >OQ619552.1:21538-25344 3807
830 >OQ619553.1:21546-25367 3822
831 >OQ619579.1:21529-25350 3822
832 >OQ619582.1:21507-25313 3807
833 >OQ619589.1:21524-25345 3822
834 >OQ619590.1:21529-25350 3822
835 >OQ619595.1:21519-25325 3807
836 >OQ619598.1:21532-25353 3822
837 >OQ619601.1:21519-25325 3807
838 >OQ619607.1:21523-25326 3804
839 >OQ619616.1:21520-25326 3807
840 >OQ619631.1:21546-25367 3822
841 >OQ619638.1:21516-25337 3822
842 >OQ619645.1:21505-25311 3807
843 >OQ619648.1:21523-25329 3807
844 >OQ619679.1:21533-25339 3807
845 >OQ619714.1:21518-25324 3807
846 >OQ619719.1:21507-25313 3807
847 >OQ619722.1:21527-25333 3807
848 >OQ619740.1:21523-25329 3807
849 >OQ619744.1:21553-25374 3822
850 >OQ619

1273 >OQ071881.1:21514-25320 3807
1274 >OQ071911.1:21515-25321 3807
1275 >OQ071998.1:21515-25321 3807
1276 >OQ071999.1:21515-25327 3813
1277 >OQ072010.1:21500-25306 3807
1278 >OQ073072.1:21515-25318 3804
1279 >OQ074033.1:21554-25360 3807
1280 >OQ074105.1:21554-25366 3813
1281 >OQ074282.1:21554-25366 3813
1282 >OQ074329.1:21554-25363 3810
1283 >OQ074331.1:21552-25358 3807
1284 >OQ074430.1:21554-25366 3813
1285 >OQ074436.1:21554-25360 3807
1286 >OQ074536.1:21504-25310 3807
1287 >OQ074714.1:21504-25316 3813
1288 >OQ074804.1:21504-25310 3807
1289 >OQ074822.1:21504-25310 3807
1290 >OQ074858.1:21504-25310 3807
1291 >OQ080273.1:21482-25294 3813
1292 >OQ080953.1:21452-25252 3801
1293 >OQ080974.1:21452-25255 3804
1294 >OQ081046.1:21515-25321 3807
1295 >OQ103746.1:21552-25358 3807
1296 >OQ103886.1:21554-25360 3807
1297 >OQ103914.1:21551-25357 3807
1298 >OQ103937.1:21554-25360 3807
1299 >OQ109401.1:21515-25327 3813
1300 >OQ109486.1:21482-25288 3807
1301 >OQ114459.1:21554-25357 3804
1302 >OQ114798

1717 >OP249117.1:21513-25325 3813
1718 >OP249122.1:21505-25317 3813
1719 >OP249148.1:21532-25353 3822
1720 >OP249174.1:21538-25350 3813
1721 >OP249180.1:21507-25319 3813
1722 >OP249187.1:21505-25317 3813
1723 >OP249197.1:21514-25335 3822
1724 >OP249199.1:21505-25317 3813
1725 >OP249209.1:21542-25354 3813
1726 >OP249222.1:21463-25269 3807
1727 >OP249223.1:21485-25297 3813
1728 >OP249224.1:21524-25336 3813
1729 >OP249228.1:21505-25317 3813
1730 >OP249240.1:21505-25317 3813
1731 >OP249254.1:21523-25335 3813
1732 >OP249260.1:21505-25317 3813
1733 >OP249265.1:21515-25327 3813
1734 >OP249273.1:21510-25322 3813
1735 >OP249289.1:21544-25356 3813
1736 >OP249290.1:21547-25353 3807
1737 >OP249291.1:21547-25368 3822
1738 >OP249295.1:21532-25353 3822
1739 >OP249298.1:21518-25324 3807
1740 >OP249311.1:21507-25313 3807
1741 >OP249312.1:21514-25335 3822
1742 >OP249313.1:21507-25319 3813
1743 >OP249329.1:21542-25348 3807
1744 >OP249363.1:21514-25326 3813
1745 >OP249370.1:21520-25332 3813
1746 >OP249371

2146 >ON518916.1:21488-25300 3813
2147 >ON518943.1:21524-25336 3813
2148 >ON518952.1:21547-25359 3813
2149 >ON518976.1:21536-25348 3813
2150 >ON519006.1:21502-25314 3813
2151 >ON519025.1:21530-25342 3813
2152 >ON519047.1:21535-25347 3813
2153 >ON519054.1:21513-25325 3813
2154 >ON519779.1:21552-25364 3813
2155 >ON519800.1:21554-25375 3822
2156 >ON520409.1:21554-25366 3813
2157 >ON520566.1:21554-25366 3813
2158 >ON522174.1:21504-25316 3813
2159 >ON522260.1:21504-25316 3813
2160 >ON522285.1:21504-25316 3813
2161 >ON522357.1:21501-25313 3813
2162 >ON522406.1:21504-25316 3813
2163 >ON522457.1:21504-25316 3813
2164 >ON522470.1:21504-25316 3813
2165 >ON523216.1:21504-25316 3813
2166 >ON534414.1:21482-25294 3813
2167 >ON541189.1:21502-25314 3813
2168 >ON541203.1:21504-25316 3813
2169 >ON541211.1:21542-25357 3816
2170 >ON541226.1:21515-25327 3813
2171 >ON541235.1:21513-25325 3813
2172 >ON541237.1:21519-25331 3813
2173 >ON541246.1:21502-25314 3813
2174 >ON541247.1:21488-25300 3813
2175 >ON541304

2565 >ON472463.1:21557-25372 3816
2566 >ON472468.1:21551-25366 3816
2567 >ON472472.1:21532-25347 3816
2568 >ON472479.1:21531-25346 3816
2569 >ON472488.1:21532-25347 3816
2570 >ON472490.1:21559-25374 3816
2571 >ON472496.1:21559-25374 3816
2572 >ON472504.1:21532-25347 3816
2573 >ON472547.1:21555-25370 3816
2574 >ON472560.1:21532-25347 3816
2575 >ON472561.1:21551-25366 3816
2576 >ON472574.1:21536-25351 3816
2577 >ON472581.1:21553-25368 3816
2578 >ON472585.1:21530-25345 3816
2579 >ON472586.1:21555-25370 3816
2580 >ON472608.1:21548-25363 3816
2581 >ON472613.1:21555-25370 3816
2582 >ON472623.1:21532-25347 3816
2583 >ON472632.1:21546-25361 3816
2584 >ON472633.1:21525-25340 3816
2585 >ON472648.1:21536-25351 3816
2586 >ON472652.1:21547-25362 3816
2587 >ON472660.1:21532-25347 3816
2588 >ON472701.1:21532-25347 3816
2589 >ON472716.1:21546-25361 3816
2590 >ON472736.1:21525-25340 3816
2591 >ON472738.1:21557-25372 3816
2592 >ON472745.1:21528-25343 3816
2593 >ON472750.1:21552-25367 3816
2594 >ON472753

2988 >OM665264.1:21551-25363 3813
2989 >OM666991.1:21501-25313 3813
2990 >OM673230.1:21522-25334 3813
2991 >OM673940.1:21520-25332 3813
2992 >OM683528.1:21526-25338 3813
2993 >OM683774.1:21522-25334 3813
2994 >OM685714.1:21551-25363 3813
2995 >OM685732.1:21551-25363 3813
2996 >OM685900.1:21551-25363 3813
2997 >OM685904.1:21551-25363 3813
2998 >OM686032.1:21551-25363 3813
2999 >OM686108.1:21551-25363 3813
3000 >OM686313.1:21549-25361 3813
3001 >OM686493.1:21549-25361 3813
3002 >OM686783.1:21549-25361 3813
3003 >OM690656.1:21497-25309 3813
3004 >OM690657.1:21497-25309 3813
3005 >OM690670.1:21497-25309 3813
3006 >OM693439.1:21419-25231 3813
3007 >OM693548.1:21512-25324 3813
3008 >OM693965.1:21416-25228 3813
3009 >OM694673.1:21512-25324 3813
3010 >OM694919.1:21479-25291 3813
3011 >OM696320.1:21449-25261 3813
3012 >OM696621.1:21449-25261 3813
3013 >OM699311.1:21551-25363 3813
3014 >OM699319.1:21549-25361 3813
3015 >OM699377.1:21551-25363 3813
3016 >OM699494.1:21551-25363 3813
3017 >OM699739

3422 >OL947528.1:21538-25353 3816
3423 >OL947567.1:21538-25353 3816
3424 >OL947588.1:21538-25353 3816
3425 >OL947935.1:21533-25348 3816
3426 >OL948731.1:21538-25353 3816
3427 >OL948846.1:21538-25353 3816
3428 >OL949811.1:21538-25353 3816
3429 >OL949984.1:21538-25353 3816
3430 >OL949985.1:21532-25347 3816
3431 >OL950153.1:21538-25353 3816
3432 >OL950273.1:21538-25353 3816
3433 >OL950289.1:21532-25347 3816
3434 >OL950452.1:21538-25353 3816
3435 >OL950585.1:21532-25347 3816
3436 >OL950600.1:21535-25350 3816
3437 >OL950659.1:21538-25353 3816
3438 >OL950704.1:21523-25338 3816
3439 >OL950734.1:21532-25347 3816
3440 >OL950739.1:21538-25353 3816
3441 >OL951657.1:21538-25353 3816
3442 >OL952070.1:21538-25353 3816
3443 >OL952072.1:21538-25353 3816
3444 >OL952193.1:21538-25353 3816
3445 >OL952197.1:21538-25359 3822
3446 >OL952242.1:21532-25347 3816
3447 >OL952379.1:21538-25353 3816
3448 >OL962187.1:21509-25324 3816
3449 >OL962253.1:21509-25324 3816
3450 >OL962297.1:21509-25324 3816
3451 >OL962339

3870 >OM469705.1:21509-25324 3816
3871 >OM484093.1:21539-25351 3813
3872 >OM484141.1:21536-25348 3813
3873 >OM484146.1:21548-25360 3813
3874 >OM484183.1:21548-25360 3813
3875 >OM492834.1:21526-25338 3813
3876 >OM493296.1:21520-25332 3813
3877 >OM497742.1:21416-25228 3813
3878 >OM497825.1:21449-25261 3813
3879 >OM497833.1:21479-25291 3813
3880 >OM499633.1:21449-25261 3813
3881 >OM499750.1:21479-25291 3813
3882 >OM510576.1:21449-25261 3813
3883 >OM517381.1:21522-25334 3813
3884 >OM517730.1:21526-25338 3813
3885 >OM518845.1:21526-25338 3813
3886 >OM519697.1:21449-25261 3813
3887 >OM519704.1:21449-25261 3813
3888 >OM519706.1:21449-25261 3813
3889 >OM519707.1:21479-25291 3813
3890 >OM519754.1:21416-25228 3813
3891 >OM520074.1:21449-25261 3813
3892 >OM520083.1:21416-25228 3813
3893 >OM520084.1:21479-25291 3813
3894 >OM520091.1:21479-25291 3813
3895 >OM520092.1:21451-25263 3813
3896 >OM520097.1:21479-25291 3813
3897 >OM520098.1:21479-25291 3813
3898 >OM520099.1:21479-25291 3813
3899 >OM520137

4309 >OL918002.1:21461-25276 3816
4310 >OL918179.1:21524-25339 3816
4311 >OL920845.1:21538-25353 3816
4312 >OL921061.1:21538-25353 3816
4313 >OL921361.1:21538-25353 3816
4314 >OL921372.1:21538-25353 3816
4315 >OL922141.1:21538-25353 3816
4316 >OL922426.1:21538-25353 3816
4317 >OL922442.1:21538-25353 3816
4318 >OL922901.1:21531-25346 3816
4319 >OL922988.1:21538-25353 3816
4320 >OL923913.1:21538-25353 3816
4321 >OL923916.1:21538-25353 3816
4322 >OL925916.1:21538-25353 3816
4323 >OL926000.1:21538-25353 3816
4324 >OL926035.1:21538-25353 3816
4325 >OL926142.1:21538-25353 3816
4326 >OL926237.1:21538-25353 3816
4327 >OL926284.1:21538-25353 3816
4328 >OL926485.1:21538-25353 3816
4329 >OL926651.1:21538-25353 3816
4330 >OL926662.1:21538-25353 3816
4331 >OL926807.1:21538-25353 3816
4332 >OL927612.1:21523-25338 3816
4333 >OL927976.1:21538-25353 3816
4334 >OL928015.1:21538-25353 3816
4335 >OL928021.1:21538-25353 3816
4336 >OL928046.1:21538-25353 3816
4337 >OL928048.1:21538-25353 3816
4338 >OL928052

4742 >MZ918155.1:21529-25350 3822
4743 >MZ918158.1:21525-25346 3822
4744 >MZ918171.1:21525-25346 3822
4745 >MZ918174.1:21525-25346 3822
4746 >MZ918180.1:21559-25380 3822
4747 >MZ918182.1:21526-25347 3822
4748 >MZ918187.1:21525-25346 3822
4749 >MZ918189.1:21525-25346 3822
4750 >MZ918200.1:21524-25345 3822
4751 >MZ918203.1:21524-25345 3822
4752 >MZ918209.1:21514-25335 3822
4753 >MZ918216.1:21530-25351 3822
4754 >MZ918221.1:21525-25346 3822
4755 >MZ918229.1:21525-25346 3822
4756 >MZ918260.1:21522-25343 3822
4757 >MZ918318.1:21525-25346 3822
4758 >MZ918321.1:21554-25375 3822
4759 >MZ918327.1:21525-25346 3822
4760 >MZ918341.1:21525-25346 3822
4761 >MZ918345.1:21525-25346 3822
4762 >MZ918348.1:21556-25377 3822
4763 >MZ918368.1:21524-25345 3822
4764 >MZ918391.1:21528-25349 3822
4765 >MZ918406.1:21559-25380 3822
4766 >MZ918414.1:21551-25372 3822
4767 >MZ918424.1:21528-25349 3822
4768 >MZ918428.1:21525-25346 3822
4769 >MZ918434.1:21525-25346 3822
4770 >MZ918462.1:21528-25349 3822
4771 >MZ918476

5169 >MZ906518.1:21525-25346 3822
5170 >MZ906524.1:21563-25384 3822
5171 >MZ906531.1:21547-25368 3822
5172 >MZ906541.1:21532-25353 3822
5173 >MZ906553.1:21529-25350 3822
5174 >MZ906554.1:21528-25349 3822
5175 >MZ906565.1:21528-25349 3822
5176 >MZ906568.1:21563-25384 3822
5177 >MZ906569.1:21563-25384 3822
5178 >MZ906571.1:21563-25384 3822
5179 >MZ906572.1:21563-25384 3822
5180 >MZ906573.1:21563-25384 3822
5181 >MZ906575.1:21563-25384 3822
5182 >MZ906587.1:21533-25354 3822
5183 >MZ906594.1:21551-25372 3822
5184 >MZ906611.1:21562-25383 3822
5185 >MZ906619.1:21538-25359 3822
5186 >MZ906621.1:21560-25381 3822
5187 >MZ906622.1:21554-25375 3822
5188 >MZ906625.1:21552-25373 3822
5189 >MZ906630.1:21560-25381 3822
5190 >MZ906632.1:21553-25374 3822
5191 >MZ906634.1:21525-25346 3822
5192 >MZ906635.1:21559-25380 3822
5193 >MZ906642.1:21535-25356 3822
5194 >MZ906659.1:21559-25380 3822
5195 >MZ906669.1:21556-25377 3822
5196 >MZ906676.1:21552-25373 3822
5197 >MZ906688.1:21556-25377 3822
5198 >MZ906691

5612 >MZ505916.1:21533-25354 3822
5613 >MZ505917.1:21533-25354 3822
5614 >MZ505918.1:21533-25354 3822
5615 >MZ505919.1:21533-25354 3822
5616 >MZ505921.1:21533-25354 3822
5617 >MZ505922.1:21533-25354 3822
5618 >MZ505923.1:21533-25354 3822
5619 >MZ505924.1:21533-25354 3822
5620 >MZ505925.1:21528-25349 3822
5621 >MZ505926.1:21533-25354 3822
5622 >MZ505927.1:21528-25349 3822
5623 >MZ505928.1:21533-25354 3822
5624 >MZ505929.1:21533-25354 3822
5625 >MZ505930.1:21533-25354 3822
5626 >MZ505931.1:21533-25354 3822
5627 >MZ505932.1:21533-25354 3822
5628 >MZ505933.1:21527-25348 3822
5629 >MZ505934.1:21533-25354 3822
5630 >MZ505935.1:21533-25354 3822
5631 >MZ505936.1:21533-25354 3822
5632 >MZ505938.1:21533-25354 3822
5633 >MZ505939.1:21533-25354 3822
5634 >MZ505941.1:21533-25354 3822
5635 >MZ505942.1:21533-25354 3822
5636 >MZ505943.1:21533-25354 3822
5637 >MZ505944.1:21533-25354 3822
5638 >MZ505945.1:21533-25354 3822
5639 >MZ505946.1:21533-25354 3822
5640 >MZ505947.1:21533-25354 3822
5641 >MZ505948

6051 >MZ433800.1:21533-25354 3822
6052 >MZ433801.1:21533-25354 3822
6053 >MZ433802.1:21524-25336 3813
6054 >MZ433804.1:21533-25354 3822
6055 >MZ433805.1:21533-25354 3822
6056 >MZ433806.1:21533-25354 3822
6057 >MZ433808.1:21533-25354 3822
6058 >MZ433809.1:21533-25354 3822
6059 >MZ433810.1:21533-25354 3822
6060 >MZ433811.1:21533-25354 3822
6061 >MZ433812.1:21533-25354 3822
6062 >MZ433813.1:21533-25354 3822
6063 >MZ433814.1:21533-25354 3822
6064 >MZ433819.1:21533-25354 3822
6065 >MZ433820.1:21533-25354 3822
6066 >MZ433821.1:21533-25354 3822
6067 >MZ433824.1:21533-25354 3822
6068 >MZ433825.1:21533-25354 3822
6069 >MZ433826.1:21530-25351 3822
6070 >MZ433827.1:21533-25354 3822
6071 >MZ433828.1:21533-25354 3822
6072 >MZ433829.1:21524-25345 3822
6073 >MZ433830.1:21533-25354 3822
6074 >MZ433831.1:21533-25354 3822
6075 >MZ433832.1:21526-25347 3822
6076 >MZ433833.1:21533-25354 3822
6077 >MZ433834.1:21533-25354 3822
6078 >MZ433836.1:21533-25354 3822
6079 >MZ433837.1:21533-25354 3822
6080 >MZ433838

6484 >MW640090.1:21509-25330 3822
6485 >MW640172.1:21509-25330 3822
6486 >MW643614.1:21509-25330 3822
6487 >MW643893.1:21509-25330 3822
6488 >MW644281.1:21509-25330 3822
6489 >MW667393.1:21509-25330 3822
6490 >MW667428.1:21509-25330 3822
6491 >MW667440.1:21509-25330 3822
6492 >MW667483.1:21509-25330 3822
6493 >MW667495.1:21509-25330 3822
6494 >MW669521.1:21509-25330 3822
6495 >MW672748.1:21551-25372 3822
6496 >MW672749.1:21533-25354 3822
6497 >MW672886.1:21528-25349 3822
6498 >MW673017.1:21533-25354 3822
6499 >MW673046.1:21552-25373 3822
6500 >MW673170.1:21551-25372 3822
6501 >MW673340.1:21528-25349 3822
6502 >MW673365.1:21527-25348 3822
6503 >MW673428.1:21526-25347 3822
6504 >MW694049.1:21533-25354 3822
6505 >MW694093.1:21529-25350 3822
6506 >MW696361.1:21509-25330 3822
6507 >MW696371.1:21509-25330 3822
6508 >MW696491.1:21509-25330 3822
6509 >MW696745.1:21509-25330 3822
6510 >MW698098.1:21509-25330 3822
6511 >MW707783.1:21509-25330 3822
6512 >MW707852.1:21509-25330 3822
6513 >MW708773

In [23]:
from Bio import SeqIO
input_file = 'cds_S_TX.fna'
records = SeqIO.parse(input_file, 'fasta')
records = list(records)
records[0:5]

[SeqRecord(seq=Seq('ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTT...TAA'), id='OR016361.1:21552-25361', name='OR016361.1:21552-25361', description='OR016361.1:21552-25361 surface glycoprotein [organism=Severe acute respiratory syndrome coronavirus 2] [isolate=SARS-CoV-2/human/USA/TX-CDC-VSX-A348471/2023]', dbxrefs=[]),
 SeqRecord(seq=Seq('ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTT...TAA'), id='OR075662.1:21504-25313', name='OR075662.1:21504-25313', description='OR075662.1:21504-25313 surface glycoprotein [organism=Severe acute respiratory syndrome coronavirus 2] [isolate=SARS-CoV-2/human/USA/TX-CDC-QDX80414699/2023]', dbxrefs=[]),
 SeqRecord(seq=Seq('ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTT...TAA'), id='OR075797.1:21504-25313', name='OR075797.1:21504-25313', description='OR075797.1:21504-25313 surface glycoprotein [organism=Severe acute respiratory syndrome coronavirus 2] [isolate=SARS-CoV-2/human/USA/TX-CDC-QDX80512776/2023]', dbxrefs=[]),
 SeqRecord

In [24]:
input_file = 'cds_S_AZ.fna'
records = SeqIO.parse(input_file, 'fasta')
records = list(records)
records[0:5]

[SeqRecord(seq=Seq('ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTT...TAA'), id='OR517517.1:21504-25310', name='OR517517.1:21504-25310', description='OR517517.1:21504-25310 surface glycoprotein [organism=Severe acute respiratory syndrome coronavirus 2] [isolate=SARS-CoV-2/human/USA/AZ-CDC-QDX84063420/2023]', dbxrefs=[]),
 SeqRecord(seq=Seq('ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTT...TAA'), id='OR558475.1:21504-25316', name='OR558475.1:21504-25316', description='OR558475.1:21504-25316 surface glycoprotein [organism=Severe acute respiratory syndrome coronavirus 2] [isolate=SARS-CoV-2/human/USA/AZ-CDC-QDX84335740/2023]', dbxrefs=[]),
 SeqRecord(seq=Seq('ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTT...TAA'), id='OR558600.1:21504-25313', name='OR558600.1:21504-25313', description='OR558600.1:21504-25313 surface glycoprotein [organism=Severe acute respiratory syndrome coronavirus 2] [isolate=SARS-CoV-2/human/USA/AZ-CDC-QDX84335734/2023]', dbxrefs=[]),
 SeqRecord

In [25]:
input_file = 'cds_S_US.fna'
records = SeqIO.parse(input_file, 'fasta')
records = list(records)
records[0:5]

[SeqRecord(seq=Seq('ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTT...TAA'), id='OR306471.1:21527-25348', name='OR306471.1:21527-25348', description='OR306471.1:21527-25348 surface glycoprotein [organism=Severe acute respiratory syndrome coronavirus 2] [isolate=SARS-CoV-2/human/USA/Mayo2147/2020]', dbxrefs=[]),
 SeqRecord(seq=Seq('ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTT...TAA'), id='OR306526.1:21527-25348', name='OR306526.1:21527-25348', description='OR306526.1:21527-25348 surface glycoprotein [organism=Severe acute respiratory syndrome coronavirus 2] [isolate=SARS-CoV-2/human/USA/Mayo2202/2020]', dbxrefs=[]),
 SeqRecord(seq=Seq('ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTT...TAA'), id='OR306536.1:21525-25346', name='OR306536.1:21525-25346', description='OR306536.1:21525-25346 surface glycoprotein [organism=Severe acute respiratory syndrome coronavirus 2] [isolate=SARS-CoV-2/human/USA/Mayo2212/2020]', dbxrefs=[]),
 SeqRecord(seq=Seq('ATGTTTGTTTTTCTTGTTTT

## Alignment of Sequences

In [123]:
from Bio import AlignIO
from Bio import SeqIO
from Bio import Seq
import os

#### Method 2: Using Muscle

In [None]:
#Executed in terminal
#muscle -align cds_S.fna  -output aligned.fasta

Output####

$ muscle -align cds_S.fna  -output aligned.fasta

muscle 5.1.linux64 []  131Gb RAM, 16 cores
Built Feb 24 2022 03:16:15
(C) Copyright 2004-2021 Robert C. Edgar.
https://drive5.com

Input: 4627 seqs, avg length 3812, max 3825


WARNING: >1k sequences, may be slow or use excessive memory, consider using -super5

00:07 1.1Gb  CPU has 16 cores, running 16 threads
^C:18 11.0Gb  0.0012% Calc posteriors

In [82]:
#Used  -super5 as suggested in the above warning
#muscle -super5 cds_S_US.fna -output aligned_US.fasta

$ muscle -super5 cds_S.fna -output aligned.fasta

muscle 5.1.linux64 []  131Gb RAM, 16 cores
Built Feb 24 2022 03:16:15
(C) Copyright 2004-2021 Robert C. Edgar.
https://drive5.com

Input: 4627 seqs, length avg 3812 max 3825

00:01 38Mb    100.0% Derep 1952 uniques, 2674 dupes
00:01 40Mb   CPU has 16 cores, running 16 threads                    
01:02:33 69Mb    100.0% UCLUST 1953 seqs EE<0.01, 16 centroids, 1936 members
01:03:17 197Mb   100.0% UCLUST 16 seqs EE<0.30, 1 centroids, 14 members     
01:03:20 197Mb   100.0% Make cluster MFAs                              
1 clusters pass 1                        
1 clusters pass 2
01:03:20 197Mb  
01:03:20 197Mb  Align cluster 1 / 1 (16 seqs)
01:03:20 197Mb  
01:12:54 5.9Gb   100.0% Calc posteriors
01:13:18 1.6Gb   100.0% Consistency (1/2)
01:13:28 1.4Gb   100.0% Consistency (2/2)
01:13:28 1.2Gb   100.0% UPGMA5           
01:13:34 1.2Gb   100.0% Refining
01:13:34 1.2Gb   100.0% Consensus sequences
Inserting 2674 dupes... done.

In [28]:
alignment_US = AlignIO.read('aligned_S_US.fasta', "fasta")
alignment_AZ = AlignIO.read('aligned_S_AZ.fasta', "fasta")
alignment_TX = AlignIO.read('aligned_S_TX.fasta', "fasta")
print(alignment_US)

Alignment with 5957 rows and 3981 columns
ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTG...TAA OR956036.1:21462-25274
ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTTTCTAGTCAGTG...TAA OM965473.1:21495-25307
ATGTTTGTTTTTCTTGTTTTNNNNNNNNNNGTCTCTAGTCAGTG...TAA MZ835113.2:21533-25354
ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTG...TAA OM972000.1:21497-25309
ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTTTCTAGTCAGTG...TAA OP864250.1:21501-25310
ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTATTCAGTG...TAA MW578209.1:21546-25367
ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTATTCAGTG...TAA MW868699.1:21553-25374
ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTG...TAA OR305370.1:21528-25349
ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTG...TAA PP645626.1:21508-25329
ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTG...TAA OM447168.1:21547-25359
ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTG...TAA OK018416.1:21509-25324
ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTG...TAA OM396741.1:21526-25338
ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTG...TAA OR307244.1:21527-25

In [29]:
len(alignment_AZ),len(alignment_TX),len(alignment_US)

(6810, 4272, 5957)

In [30]:
alignment_AZ[0].seq,alignment_TX[0].seq,alignment_US[0].seq

(Seq('ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTTTCTAGTCAGTGTGT-------...TAA'),
 Seq('ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGT-------...TAA'),
 Seq('ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGT-------...TAA'))

In [105]:
df_AZ= pd.read_csv('AZ_xlist.csv')
df_TX= pd.read_csv('TX_xlist.csv')
df_US= pd.read_csv('US_xlist.csv')

# Order Aligned Sequences Chronologically

In [53]:
def extract_dates_and_accessions(input_pattern, aligned_sequence_file):
    # Find all files matching the input pattern
    files = glob.glob(input_pattern)
    
    # Create an empty list to store results
    result_list = []
    
    for file in files:
        # Read the CSV file
        df = pd.read_csv(file)
        
        # Check if the required columns exist in the DataFrame
        if 'Collection_Date' in df.columns and 'Accession' in df.columns:
            # Extract the columns and rename them
            extracted_df = df[['Collection_Date', 'Accession']].rename(columns={'Collection_Date': 'Date'})
            
            # Convert the 'Date' column to datetime format, coerce errors to NaT
            extracted_df['Date'] = pd.to_datetime(extracted_df['Date'], format='%Y-%m-%d', errors='coerce')
            
            # Drop rows with NaT in the 'Date' column
            extracted_df = extracted_df.dropna(subset=['Date'])
            
            # Append the extracted data to the result list if it's not empty
            if not extracted_df.empty:
                result_list.append(extracted_df)
    
    # Concatenate all non-empty DataFrames in the result list
    if result_list:
        result_df = pd.concat(result_list, ignore_index=True)
    # Sort the DataFrame by the 'Date' column
        result_df = result_df.sort_values(by='Date').reset_index(drop=True)
           
        
        # Save the Date alone to a new CSV file
        output_date_file = "dates_" + os.path.splitext(os.path.basename(aligned_sequence_file))[0] + ".csv"
        result_df['Date'].to_csv(output_date_file, index=False)
        print(f"Dates saved to {output_date_file}")
        
        # Load the sequences from the sequence file
        alignment = list(SeqIO.parse(aligned_sequence_file, "fasta"))
        alignment_dict = {record.id.split(':')[0]: record for record in alignment}
        
        # Filter the DataFrame to only include rows with matching sequences
        result_df = result_df[result_df['Accession'].isin(alignment_dict.keys())]
        
        # Order the sequences according to the sorted accessions
        Align_ordered = [alignment_dict[acc] for acc in result_df['Accession']]
        
        # Convert ordered sequences to a list of strings
        Align_ordered_str = [str(record.seq) for record in Align_ordered]
        
        result_df['Seq'] = Align_ordered_str
        output_file = "dat_Acces_Seq_" + os.path.splitext(os.path.basename(aligned_sequence_file))[0] + ".csv"
        
        # Save the result dates and accession numbers to a new CSV file
        result_df.to_csv(output_file, index=False)
        print(f"Data saved to {output_file}")
        
        
    else:
        print("No valid data to save.")


In [54]:

# Example usage
input_pattern = "AZ_202*_sequences_*.csv"
sequence_file = "aligned_S_AZ.fasta"  # Replace with your actual sequence file
extract_dates_and_accessions(input_pattern, sequence_file)


Dates saved to dates_aligned_S_AZ.csv
Data saved to dat_Acces_Seq_aligned_S_AZ.csv


In [55]:

# Example usage
input_pattern = "TX_202*_sequences_*.csv"
sequence_file = "aligned_S_TX.fasta"  # Replace with your actual sequence file
extract_dates_and_accessions(input_pattern, sequence_file)


Dates saved to dates_aligned_S_TX.csv
Data saved to dat_Acces_Seq_aligned_S_TX.csv


In [56]:

# Example usage
input_pattern = "US_202*_sequences_*.csv"
sequence_file = "aligned_S_US.fasta"  # Replace with your actual sequence file
extract_dates_and_accessions(input_pattern, sequence_file)


Dates saved to dates_aligned_S_US.csv
Data saved to dat_Acces_Seq_aligned_S_US.csv


In [45]:
alignment_US = AlignIO.read('ordered_aligned_S_US.fasta', "fasta")
alignment_AZ = AlignIO.read('ordered_aligned_S_AZ.fasta', "fasta")
alignment_TX = AlignIO.read('ordered_aligned_S_TX.fasta', "fasta")

In [46]:
alignment_AZ[0:5],alignment_TX[0:5],alignment_US[0:5]

(<<class 'Bio.Align.MultipleSeqAlignment'> instance (5 records of length 3903) at 13a3a82c0>,
 <<class 'Bio.Align.MultipleSeqAlignment'> instance (5 records of length 3888) at 13a3aa7b0>,
 <<class 'Bio.Align.MultipleSeqAlignment'> instance (5 records of length 3981) at 13a3a8110>)

# Distance computation

In [59]:
import numpy as np
from IPython.display import clear_output

In [60]:
def get_distance(x, y):
    return sum(1 for ele_x, ele_y in zip(x, y) if ele_x != 'N' and ele_y != 'N' and ele_x != '-' and ele_y != '-' and ele_x != ele_y)
 

def calculate_distance_matrix(csv_file, output_file):
    """
    Calculate the pairwise distance matrix for the given alignment data and save the result.

    Parameters:
    alignment_file (str): The path to the .npy file containing the alignment data.
    output_file (str): The path to the .npy file where the resulting distance matrix will be saved.
    """
    # Load the CSV file
    df = pd.read_csv(csv_file)

    # Extract the sequences
    sequences = df['Seq'].tolist()
    n = len(sequences)

    # Initialize the distance matrix
    dis_array_ordered = np.zeros((n, n))
    
    # Calculate the pairwise distances
    for i, ele_1 in enumerate(sequences):
        t = i / n
        clear_output(wait=True)  
        print(t, flush=True)
        for j, ele_2 in enumerate(sequences):
            if j >= i:
                break  # Since the matrix is symmetrical, we don't need to calculate everything
            distance = get_distance(ele_1, ele_2)  
            dis_array_ordered[i, j] = distance
            dis_array_ordered[j, i] = distance

    # Save the resulting distance matrix
    np.save(output_file, dis_array_ordered)

    print(f"Distance matrix saved to {output_file}")

In [None]:
alignment_file='dat_Acces_Seq_aligned_S_TX.csv'#execution queued 09:46:48 2024-07-15 executed in 52m 41s, finished 11:27:56 2024-07-12
output_file='dis_ord_alig_TX.npy'
calculate_distance_matrix(alignment_file, output_file)

In [None]:
alignment_file='dat_Acces_Seq_aligned_S_US.csv'#execution queued 09:46:48 2024-07-15 executed in 52m 41s, finished 11:27:56 2024-07-12
output_file='dis_ord_alig_US.npy'
calculate_distance_matrix(alignment_file, output_file)

In [None]:
alignment_file='dat_Acces_Seq_aligned_S_AZ.csv'#execution queued 09:46:48 2024-07-15 executed in 52m 41s, finished 11:27:56 2024-07-12
output_file='dis_ord_alig_AZ.npy'
calculate_distance_matrix(alignment_file, output_file)

# Full Code

In [86]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob
from Bio import SeqIO
from Bio import AlignIO
import os
from IPython.display import clear_output
def extract_dates_and_accessionss(input_pattern, aligned_sequence_file):
    # Find all files matching the input pattern
    files = glob.glob(input_pattern)
    
    # Create an empty list to store results
    result_list = []
    
    for file in files:
        # Read the CSV file
        df = pd.read_csv(file)
        
        # Check if the required columns exist in the DataFrame
        if 'Collection_Date' in df.columns and 'Accession' in df.columns:
            # Extract the columns and rename them
            extracted_df = df[['Collection_Date', 'Accession']].rename(columns={'Collection_Date': 'Date'})
            
            # Convert the 'Date' column to datetime format, coerce errors to NaT
            extracted_df['Date'] = pd.to_datetime(extracted_df['Date'], format='%Y-%m-%d', errors='coerce')
            
            # Drop rows with NaT in the 'Date' column
            extracted_df = extracted_df.dropna(subset=['Date'])
            
            # Append the extracted data to the result list if it's not empty
            if not extracted_df.empty:
                result_list.append(extracted_df)
    
    # Concatenate all non-empty DataFrames in the result list
    if result_list:
        result_df = pd.concat(result_list, ignore_index=True)
        
        
       
        # Load the sequences from the sequence file
        alignment = list(SeqIO.parse(aligned_sequence_file, "fasta"))
        alignment_dict = {record.id.split(':')[0]: record for record in alignment}
        
        # Filter the DataFrame to only include rows with matching sequences
        result_df = result_df[result_df['Accession'].isin(alignment_dict.keys())]
        
        # Order the sequences according to the sorted accessions
        Align_ordered = [alignment_dict[acc] for acc in result_df['Accession']]
        
        # Convert ordered sequences to a list of strings
        Align_ordered_str = [str(record.seq) for record in Align_ordered]
        
        result_df['Seq'] = Align_ordered_str
        output_file = "dat_Acces_Seq_" + os.path.splitext(os.path.basename(aligned_sequence_file))[0] + ".csv"
        
        # Save the result dates and accession numbers to a new CSV file
        result_df.to_csv(output_file, index=False)
        print(f"Data saved to {output_file}")
        # Save the Date alone to a new CSV file
        output_date_file = "dates_" + os.path.splitext(os.path.basename(aligned_sequence_file))[0] + ".csv"
        result_df['Date'].to_csv(output_date_file, index=False)
        print(f"Dates saved to {output_date_file}")
        
        
        
    else:
        print("No valid data to save.")



def get_distance(x, y):
    return sum(1 for ele_x, ele_y in zip(x, y) if ele_x != 'N' and ele_y != 'N' and ele_x != '-' and ele_y != '-' and ele_x != ele_y)
 

def calculate_distance_matrix(csv_file, output_file):
    """
    Calculate the pairwise distance matrix for the given alignment data and save the result.

    Parameters:
    csv_file (str): The path to the .csv file containing the dates, accession numbers and aligned seqs.
    output_file (str): The path to the .npy file where the resulting distance matrix will be saved.
    """
    # Load the CSV file
    df = pd.read_csv(csv_file)

    # Extract the sequences
    sequences = df['Seq'].tolist()
    n = len(sequences)

    # Initialize the distance matrix
    dis_array_ordered = np.zeros((n, n))
    
    # Calculate the pairwise distances
    for i, ele_1 in enumerate(sequences):
        t = i / n
        clear_output(wait=True)  
        print(t, flush=True)
        for j, ele_2 in enumerate(sequences):
            if j >= i:
                break  # Since the matrix is symmetrical, we don't need to calculate everything
            distance = get_distance(ele_1, ele_2)  
            dis_array_ordered[i, j] = distance
            dis_array_ordered[j, i] = distance

    # Save the resulting distance matrix
    np.save(output_file, dis_array_ordered)

    print(f"Distance matrix saved to {output_file}")

In [90]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob
from Bio import SeqIO
from Bio import AlignIO
import os
from IPython.display import clear_output

def extract_dates_and_accessions(input_pattern, aligned_sequence_file):
    # Find all files matching the input pattern
    files = glob.glob(input_pattern)
    
    # Create an empty list to store results
    result_list = []
    
    for file in files:
        # Read the CSV file
        df = pd.read_csv(file)
        
        # Check if the required columns exist in the DataFrame
        if 'Collection_Date' in df.columns and 'Accession' in df.columns:
            # Extract the columns and rename them
            extracted_df = df[['Collection_Date', 'Accession']].rename(columns={'Collection_Date': 'Date'})
            
            # Convert the 'Date' column to datetime format, coerce errors to NaT
            extracted_df['Date'] = pd.to_datetime(extracted_df['Date'], format='%Y-%m-%d', errors='coerce')
            
            # Drop rows with NaT in the 'Date' column
            extracted_df = extracted_df.dropna(subset=['Date'])
            
            # Append the extracted data to the result list if it's not empty
            if not extracted_df.empty:
                result_list.append(extracted_df)
    
    # Concatenate all non-empty DataFrames in the result list
    if result_list:
        result_df = pd.concat(result_list, ignore_index=True)
        
        # Sort the DataFrame by the 'Date' column
        result_df = result_df.sort_values(by='Date').reset_index(drop=True)
        
        
        # Load the sequences from the sequence file
        alignment = list(SeqIO.parse(aligned_sequence_file, "fasta"))
        alignment_dict = {record.id.split(':')[0]: record for record in alignment}
        
        # Filter the DataFrame to only include rows with matching sequences
        result_df = result_df[result_df['Accession'].isin(alignment_dict.keys())]
        
        # Order the sequences according to the sorted accessions
        Align_ordered = [alignment_dict[acc] for acc in result_df['Accession']]
        
        # Convert ordered sequences to a list of strings
        Align_ordered_str = [str(record.seq) for record in Align_ordered]
        
        result_df['Seq'] = Align_ordered_str
        output_file = "dat_Acces_Seq_" + os.path.splitext(os.path.basename(aligned_sequence_file))[0] + ".csv"
        
        # Save the result dates and accession numbers to a new CSV file
        result_df.to_csv(output_file, index=False)
        #print(f"Data saved to {output_file}")
        
        # Save the Date alone to a new CSV file
        output_date_file = "dates_" + os.path.splitext(os.path.basename())[0] + ".csv"
        result_df['Date'].to_csv(output_date_file, index=False)
        #print(f"Dates saved to {output_date_file}")
       
        
    else:
        print("No valid data to save.")

def get_distance(x, y):
    return sum(1 for ele_x, ele_y in zip(x, y) if ele_x != 'N' and ele_y != 'N' and ele_x != '-' and ele_y != '-' and ele_x != ele_y)

def calculate_distance_matrix(csv_file, output_file):
    """
    Calculate the pairwise distance matrix for the given alignment data and save the result.

    Parameters:
    csv_file (str): The path to the .csv file containing the dates, accession numbers, and aligned sequences.
    output_file (str): The path to the .npy file where the resulting distance matrix will be saved.
    """
    # Load the CSV file
    df = pd.read_csv(csv_file)

    # Extract the sequences
    sequences = df['Seq'].tolist()
    n = len(sequences)

    # Initialize the distance matrix
    dis_array_ordered = np.zeros((n, n))
    
    # Calculate the pairwise distances
    for i, ele_1 in enumerate(sequences):
        t = i / n
        clear_output(wait=True)  
        print(f"Progress: {t*100:.2f}%", flush=True)
        for j in range(i):
            distance = get_distance(ele_1, sequences[j])  
            dis_array_ordered[i, j] = distance
            dis_array_ordered[j, i] = distance

    # Save the resulting distance matrix
    np.save(output_file, dis_array_ordered)
    #print(f"Distance matrix saved to {output_file}")


In [91]:
def get_dist_date_files(input_pattern,aligned_sequence_fasta_file):
    extract_dates_and_accessions(input_pattern,aligned_sequence_fasta_file)
    csv_file="dat_Acces_Seq_" + os.path.splitext(os.path.basename(aligned_sequence_fasta_file))[0] + ".csv"
    # Save the Date alone to a CSV file
    output_date_file = "dates_" + os.path.splitext(os.path.basename(aligned_sequence_fasta_file))[0] + ".csv"
    print(f"Dates saved to {output_date_file}")  
    # Save the Distance matrix to a .npy file
    dist_output_file = "dist_" + os.path.splitext(os.path.basename(aligned_sequence_fasta_file))[0] + ".npy"
    calculate_distance_matrix(csv_file, dist_output_file)
    print(f"Distance matrix saved to {dist_output_file}")

In [84]:
input_pattern = "TX_202*_sequences_*.csv" #execution queued 10:10:59 2024-07-15 executed in 50m 31s, finished 11:01:30 2024-07-15
aligned_sequence_fasta_file = "aligned_S_TX.fasta" # execution queued 11:38:25 2024-07-15 executed in 49m 39s, finished 12:28:04 2024-07-15
get_dist_date_files(input_pattern,aligned_sequence_fasta_file)

Progress: 99.98%
Distance matrix saved to dist_aligned_S_TX.npy
Distance matrix saved to dist_aligned_S_TX.npy


In [89]:
input_pattern = "US_202*_sequences_*.csv" #execution queued 10:10:59 2024-07-15
aligned_sequence_fasta_file = "aligned_S_US.fasta" # executed in 1h 35m 45s, finished 14:20:14 2024-07-15
get_dist_date_files(input_pattern,aligned_sequence_fasta_file)

Progress: 99.98%
Distance matrix saved to dist_aligned_S_US.npy
Distance matrix saved to dist_aligned_S_US.npy


In [92]:
input_pattern = "AZ_202*_sequences_*.csv" #execution queued 14:24:17 2024-07-15
aligned_sequence_fasta_file = "aligned_S_AZ.fasta" # executed in 1h 59m 55s, finished 16:24:12 2024-07-15
get_dist_date_files(input_pattern,aligned_sequence_fasta_file)

Progress: 99.99%
Distance matrix saved to dist_aligned_S_AZ.npy
