## Create Dataset

In [68]:
import pandas as pd
from Bio.SeqIO import parse
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

In [69]:
csv_contents = pd.read_csv("sequences.csv")
fasta_file = open("sequences.fasta")
records = parse(fasta_file,"fasta")

Now, we find the rows in the **sequences.csv** file, which have the entry in the **accession** column in common with the record id of the records in the fasta file, and we create a new column **Sequence** in the original dataframe filling them with the sequences in the records.

In [70]:
for record in records:
    cc = 0
    for index, row in csv_contents.iterrows():
        if row['Accession'] == str(record.id):
            csv_contents.at[cc, 'Sequence'] = str(record.seq)
            break
        cc += 1

Now, we can delete those entries which have **NA** in the Sequence column in the dataframe.

In [71]:
csv_contents = csv_contents[csv_contents.Sequence != 'NA']

Finally writing the altered dataframe into a **final.csv** file.

In [72]:
csv_contents.to_csv('final.csv',encoding='utf-8',index=False)

## Data Visualization

In [73]:
df = csv_contents

Check presence of **null** values

In [74]:
pd.isnull(df).any()

Accession           False
Release_Date        False
Species             False
Length               True
Geo_Location         True
Host                 True
Isolation_Source     True
Collection_Date      True
Sequence             True
dtype: bool

### Geo_Location

In [75]:
df["Geo_Location"].fillna("NA", inplace = True)

Number of Unique **Geo_Locations** in Dataset

In [76]:
df['Geo_Location'].nunique()

101

Number of Datapoints for each **Geo_Location**

In [77]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df['Geo_Location'].value_counts())

USA: WA                        452
USA                            201
USA: VA                         58
USA: UT                         54
USA: ID                         28
USA: MA                         19
USA: CT                         18
Spain                           16
China: Wuhan                    16
China: Zhejiang, Hangzhou       15
USA: GA                         15
China                           14
Iran                            11
China: Hubei, Wuhan             11
USA: CA                         10
Spain: Valencia                 10
China: Beijing                  10
China: HuaShang                 10
USA: PA                          9
Hong Kong                        9
USA: MI                          9
USA: IL                          9
USA: FL                          8
USA: MN                          8
USA: San Francisco, CA           7
USA: AZ                          7
China: Shenzhen                  7
USA: SC                          7
USA: NC             

### Release_Date

Number of Unique **Release_Dates**

In [78]:
df['Release_Date'].nunique()

55

Number of Datapoints for each **Release Date**

In [79]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df['Release_Date'].value_counts())

2020-04-13T00:00:00Z    261
2020-04-20T00:00:00Z    120
2020-03-31T00:00:00Z    112
2020-04-16T00:00:00Z    112
2020-04-06T00:00:00Z    100
2020-03-30T00:00:00Z     69
2020-04-14T00:00:00Z     52
2020-03-26T00:00:00Z     44
2020-04-17T00:00:00Z     42
2020-04-11T00:00:00Z     33
2020-03-27T00:00:00Z     24
2020-03-09T00:00:00Z     22
2020-04-07T00:00:00Z     20
2020-01-24T00:00:00Z     15
2020-02-12T00:00:00Z     13
2020-04-08T00:00:00Z     13
2020-03-13T00:00:00Z     11
2020-03-10T00:00:00Z     11
2020-04-03T00:00:00Z     11
2020-02-20T00:00:00Z     10
2020-03-24T00:00:00Z      9
2020-02-05T00:00:00Z      8
2020-03-12T00:00:00Z      8
2020-02-14T00:00:00Z      5
2020-01-28T00:00:00Z      5
2020-03-16T00:00:00Z      5
2020-03-02T00:00:00Z      5
2020-01-29T00:00:00Z      5
2020-02-11T00:00:00Z      4
2020-02-01T00:00:00Z      4
2020-03-04T00:00:00Z      4
2020-02-28T00:00:00Z      4
2020-01-31T00:00:00Z      3
2020-02-07T00:00:00Z      3
2020-02-24T00:00:00Z      3
2020-03-17T00:00:00Z