# IMPORT DATA FROM HTML

In [1]:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen

In [2]:
# Download the Wikipedia "Genome" article and parse it into a BeautifulSoup tree.
wiki_url = 'https://en.wikipedia.org/wiki/Genome'

# The context manager closes the HTTP response even if read() raises,
# which a manual close() after the read would not do.
with urlopen(wiki_url) as wiki_data:
    wiki_html = wiki_data.read()

page_soup = soup(wiki_html, 'html.parser')

In [3]:
# Collect every table carrying the 'wikitable sortable' class and keep the
# first one; indexing raises IndexError when the page has no such table.
sortable_tables = page_soup.find_all('table', class_='wikitable sortable')
genome_table = sortable_tables[0]
print(genome_table)

<table class="wikitable sortable">
<tbody><tr>
<th>Organism type
</th>
<th>Organism
</th>
<th colspan="2">Genome size <br/>(<a href="/wiki/Base_pair" title="Base pair">base pairs</a>)
</th>
<th>Approx. no. of genes
</th>
<th class="unsortable">Note
</th></tr>
<tr>
<td><a href="/wiki/Virus" title="Virus">Virus</a>
</td>
<td><a href="/wiki/Porcine_circovirus" title="Porcine circovirus">Porcine circovirus</a> type 1
</td>
<td align="right">1,759
</td>
<td>1.8kb
</td>
<td>
</td>
<td>Smallest viruses replicating autonomously in <a class="mw-redirect" href="/wiki/Eukaryotic" title="Eukaryotic">eukaryotic</a> cells.<sup class="reference" id="cite_ref-Equinexus_43-0"><a href="#cite_note-Equinexus-43">[43]</a></sup>
</td></tr>
<tr>
<td><a href="/wiki/Virus" title="Virus">Virus</a>
</td>
<td><a href="/wiki/Bacteriophage_MS2" title="Bacteriophage MS2">Bacteriophage MS2</a>
</td>
<td align="right">3,569
</td>
<td>3.5kb
</td>
<td>
</td>
<td>First sequenced RNA-genome<sup class="reference" id="cite_

In [4]:
# All <th> cells of the selected table, in document order.
headers = genome_table.find_all(name='th')
print(headers)

[<th>Organism type
</th>, <th>Organism
</th>, <th colspan="2">Genome size <br/>(<a href="/wiki/Base_pair" title="Base pair">base pairs</a>)
</th>, <th>Approx. no. of genes
</th>, <th class="unsortable">Note
</th>]


In [31]:
# Build one title per physical table column.
# A header cell with colspan="N" spans N columns, so it contributes its own
# title followed by N - 1 empty titles; this keeps the header row aligned
# with the data rows without hard-coding the position of the spanning column.
header_titles = []
for header in headers:
    title = header.text
    # Drop only the trailing newline left by the table markup, never a
    # real character of the title.
    if title.endswith('\n'):
        title = title[:-1]
    header_titles.append(title)

    # Attribute values are strings; a missing colspan means a single column.
    extra_columns = int(header.get('colspan', 1)) - 1
    header_titles.extend([''] * extra_columns)
print(header_titles)

['Organism type', 'Organism', 'Genome size (base pairs)', '', 'Approx. no. of genes', 'Note']


In [32]:
# Extract the text of every <td> cell, one list per table row.
# The first <tr> holds the <th> header cells, so it is skipped.
all_rows = genome_table.find_all('tr')[1:]
data = []

for row in all_rows:
    row_values = []
    for cell in row.find_all('td'):
        cell_text = cell.text
        # Remove a single trailing newline left by the markup, but never
        # chop a real character when the cell text has none.
        if cell_text.endswith('\n'):
            cell_text = cell_text[:-1]
        row_values.append(cell_text)
    data.append(row_values)

print(data)

[['Virus', 'Porcine circovirus type 1', '1,759', '1.8kb', '', 'Smallest viruses replicating autonomously in eukaryotic cells.[43]'], ['Virus', 'Bacteriophage MS2', '3,569', '3.5kb', '', 'First sequenced RNA-genome[44]'], ['Virus', 'SV40', '5,224', '5.2kb', '', '[45]'], ['Virus', 'Phage Φ-X174', '5,386', '5.4kb', '', 'First sequenced DNA-genome[46]'], ['Virus', 'HIV', '9,749', '9.7kb', '', '[47]'], ['Virus', 'Phage λ', '48,502', '48.5kb', '', 'Often used as a vector for the cloning of recombinant DNA.\n[48]\n[49]\n[50]\n'], ['Virus', 'Megavirus', '1,259,197', '1.3Mb', '', 'Until 2013 the largest known viral genome.[51]'], ['Virus', 'Pandoravirus salinus', '2,470,000', '2.47Mb', '', 'Largest known viral genome.[52]'], ['Bacterium', 'Nasuia deltocephalinicola (strain NAS-ALF)', '112,091', '112kb', '', 'Smallest non-viral genome.[53]'], ['Bacterium', 'Carsonella ruddii', '159,662', '160kb', '', ''], ['Bacterium', 'Buchnera aphidicola', '600,000', '600kb', '', '[54]'], ['Bacterium', 'Wiggle

In [35]:
# Normalise every cell: strip embedded newlines and remove all commas.
# Commas are removed from every column (numbers and free text alike); the
# CSV cell in this notebook joins values with ',' and applies no quoting.
clean_data = [
    [value.replace('\n', '').replace(',', '') for value in record]
    for record in data
]
print(clean_data)

[['Virus', 'Porcine circovirus type 1', '1759', '1.8kb', '', 'Smallest viruses replicating autonomously in eukaryotic cells.[43]'], ['Virus', 'Bacteriophage MS2', '3569', '3.5kb', '', 'First sequenced RNA-genome[44]'], ['Virus', 'SV40', '5224', '5.2kb', '', '[45]'], ['Virus', 'Phage Φ-X174', '5386', '5.4kb', '', 'First sequenced DNA-genome[46]'], ['Virus', 'HIV', '9749', '9.7kb', '', '[47]'], ['Virus', 'Phage λ', '48502', '48.5kb', '', 'Often used as a vector for the cloning of recombinant DNA.[48][49][50]'], ['Virus', 'Megavirus', '1259197', '1.3Mb', '', 'Until 2013 the largest known viral genome.[51]'], ['Virus', 'Pandoravirus salinus', '2470000', '2.47Mb', '', 'Largest known viral genome.[52]'], ['Bacterium', 'Nasuia deltocephalinicola (strain NAS-ALF)', '112091', '112kb', '', 'Smallest non-viral genome.[53]'], ['Bacterium', 'Carsonella ruddii', '159662', '160kb', '', ''], ['Bacterium', 'Buchnera aphidicola', '600000', '600kb', '', '[54]'], ['Bacterium', 'Wigglesworthia glossinidia'

# WRITE TO CSV

In [44]:
file_name = 'genome_table.csv'
# Explicit UTF-8: the scraped values contain non-ASCII characters (for
# example the Greek letters in the phage names), which the platform default
# encoding is not guaranteed to be able to encode.
# The handle is deliberately left open: the next cell writes the data rows
# through the same `f`.
f = open(file_name, 'w', encoding='utf-8')

# Comma-separated header line, terminated by a newline.
header_string = ','.join(header_titles) + '\n'

f.write(header_string)

75

In [47]:
# Write one comma-separated line per cleaned row.
# `with f:` closes the already-open handle when the block exits (also on
# error), so buffered rows are flushed to disk instead of being left in an
# unclosed file object.
with f:
    for row in clean_data:
        f.write(','.join(row) + '\n')