In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
import time
from datetime import datetime
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
from urllib.request import urlopen
import urllib
from bs4 import BeautifulSoup
import json

# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Tree&id=57723&lvl=3&lin=f&keep=1&srchmode=1&unlock
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/


# EoL Mammals Data from Costa Rica

In [173]:
# Read the tab-separated mammal records straight from GitHub and keep every
# column except the last one; the resulting frame is displayed as the cell output.
df = pd.read_csv(
    'https://raw.githubusercontent.com/vlachner2/Sunburst-D3/master/MammaliaCR.tsv',
    sep='\t',
).iloc[:, :-1]
df

Unnamed: 0,Taxon URL,Ancestry,Scientific Name,Common Name,Author Name
0,/pages/14083,Life | Cellular | Eukaryota | Opisthokonta | M...,Dasyprocta,Common agouti,
1,/pages/34548,Life | Cellular | Eukaryota | Opisthokonta | M...,Bos Linnaeus 1758,ox,Linnaeus
2,/pages/118008,Life | Cellular | Eukaryota | Opisthokonta | M...,<i>Sylvilagus</i> (<i>Tapeti</i>) <i>brasilien...,Tapeti,Linnaeus
3,/pages/122355,Life | Cellular | Eukaryota | Opisthokonta | M...,<i>Caluromys</i> (<i>Mallodelphys</i>) <i>derb...,Central American woolly opossum,Waterhouse
4,/pages/126665,Life | Cellular | Eukaryota | Opisthokonta | M...,<i>Vampyrodes caraccioli</i> (Thomas 1889),Great Stripe-faced Bat,Thomas
...,...,...,...,...,...
290,/pages/46559444,Life | Cellular | Eukaryota | Opisthokonta | M...,<i>Megaptera novaeangliae</i> (Borowski 1781),Humpback Whale,Borowski
291,/pages/47048443,Life | Cellular | Eukaryota | Opisthokonta | M...,<i>Micoureus alstoni</i> (J. A. Allen 1900),Alstons mouse opossum,J. A. Allen
292,/pages/47050396,Life | Cellular | Eukaryota | Opisthokonta | M...,<i>Reithrodontomys</i> (<i>Aporodon</i>) <i>mu...,,Gardner; Carleton
293,/pages/47050514,Life | Cellular | Eukaryota | Opisthokonta | M...,<i>Oecomys concolor</i> (Wagner 1845),Natterer's Oecomys,Wagner


In [174]:
# Show the (rows, columns) dimensions of the loaded table.
table_dimensions = df.shape
print(table_dimensions)

(295, 5)


In [175]:
# Collect the distinct scientific names, then report the container type and
# how many distinct names there are (one value per output line).
unique_sname = df['Scientific Name'].unique()
print(type(unique_sname), len(unique_sname), sep='\n')

<class 'numpy.ndarray'>
295


In [176]:
# For every column, print its label followed by the number of distinct
# values that Series.unique() returns for it.
df_col = df.columns
for column_name in df_col:
    distinct_values = df[column_name].unique()
    print(column_name, len(distinct_values))

Taxon URL 295
Ancestry 168
Scientific Name 295
Common Name 251
Author Name 97


In [177]:
# kingdom, division (phylum), class, order, family, genus, species, subspecies, varieties

# Position of the "Mammalia" entry inside the pipe-separated ancestry path.
# Rows that do not have "Mammalia" at exactly this position are filtered out
# below, and every level before it is discarded.
MAMMALIA_LEVEL = 17

# One column per ancestry level (shorter lineages are padded with missing
# values), followed by the scientific name as the last column.
ancestry_df = df['Ancestry'].str.split("|", expand=True)
ancestry_df = pd.concat([ancestry_df, df['Scientific Name']], axis=1)

# Remove HTML italics markup (<i> and </i>) in a single pass. `regex` is
# passed explicitly because its default differs between pandas versions.
ancestry_df['Scientific Name'] = ancestry_df['Scientific Name'].str.replace(
    r'</?i>', '', regex=True
)

# Trim surrounding whitespace from every string cell, leaving missing values
# untouched. Done column by column with Series.map because
# DataFrame.applymap is deprecated in recent pandas releases.
ancestry_df = ancestry_df.apply(
    lambda column: column.map(
        lambda cell: cell.strip() if isinstance(cell, str) else cell
    )
)

# Keep only the mammal rows and the columns from the Mammalia level onwards.
ancestry_df = ancestry_df.loc[ancestry_df.iloc[:, MAMMALIA_LEVEL] == "Mammalia"]
ancestry_df = ancestry_df.iloc[:, MAMMALIA_LEVEL:]
ancestry_df

Unnamed: 0,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,Scientific Name
0,Mammalia,Theria,Eutheria,Placentalia,Boreoeutheria,Euarchontoglires,Glires,Rodentia,Ctenohystrica,Hystricognathi,Caviomorpha,Cavioidea,Dasyproctidae,,,,,Dasyprocta
1,Mammalia,Theria,Eutheria,Placentalia,Boreoeutheria,Laurasiatheria,Scrotifera,Cetartiodactyla,Ruminantia,Bovidae,Bovinae,,,,,,,Bos Linnaeus 1758
2,Mammalia,Theria,Eutheria,Placentalia,Boreoeutheria,Euarchontoglires,Glires,Lagomorpha,Leporidae,Sylvilagus,,,,,,,,Sylvilagus (Tapeti) brasiliensis (Linnaeus 1758)
3,Mammalia,Theria,Metatheria,Marsupialia,Didelphimorphia,Didelphidae,Caluromys,,,,,,,,,,,Caluromys (Mallodelphys) derbianus (Waterhouse...
4,Mammalia,Theria,Eutheria,Placentalia,Boreoeutheria,Laurasiatheria,Scrotifera,Chiroptera,Phyllostomidae,Vampyrodes,,,,,,,,Vampyrodes caraccioli (Thomas 1889)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290,Mammalia,Theria,Eutheria,Placentalia,Boreoeutheria,Laurasiatheria,Scrotifera,Cetartiodactyla,Cetacea,Mysticeti,Balaenopteridae,Megaptera,,,,,,Megaptera novaeangliae (Borowski 1781)
291,Mammalia,Theria,Metatheria,Marsupialia,Didelphimorphia,Didelphidae,Micoureus,,,,,,,,,,,Micoureus alstoni (J. A. Allen 1900)
292,Mammalia,Theria,Eutheria,Placentalia,Boreoeutheria,Euarchontoglires,Glires,Rodentia,Mouse relatives,Myomorpha,Muroidea,Eumuroidea,Cricetidae,Neotominae,Reithrodontomyini,Reithrodontomys,,Reithrodontomys (Aporodon) musseri Gardner & C...
293,Mammalia,Theria,Eutheria,Placentalia,Boreoeutheria,Euarchontoglires,Glires,Rodentia,Mouse relatives,Myomorpha,Muroidea,Eumuroidea,Cricetidae,Sigmodontinae,Oryzomyalia,Oryzomyini,Oecomys,Oecomys concolor (Wagner 1845)


In [178]:
def create_entries(df):
    """Build a nested name/children/value hierarchy from a lineage table.

    Every column of ``df`` except the last is one level of the hierarchy
    (outermost level first); the last column holds the leaf label. A row whose
    lineage is shorter than the table is wide has missing values (``None`` or
    ``NaN``) in its trailing level columns and becomes a leaf at the level
    where its lineage ends.

    Parameters
    ----------
    df : pandas.DataFrame
        Level columns followed by one label column. Columns are accessed by
        position only, so the column labels are irrelevant.

    Returns
    -------
    list of dict
        Leaves are ``{"name": label, "value": 1}``; branches are
        ``{"name": level_value, "children": [...]}``. Within one level the
        leaves come first (in row order), followed by the branches in order
        of first appearance, so the result is deterministic across runs.
    """
    column_count = df.shape[1]
    if column_count == 0:
        return []
    if column_count == 1:
        # Only the label column is left: every remaining row is a leaf.
        return [{"name": label, "value": 1} for label in df.iloc[:, 0].tolist()]

    level = df.iloc[:, 0]
    # isna() recognises None as well as NaN/pd.NA, whichever padding value
    # the installed pandas version produced for short lineages.
    lineage_ended = level.isna()

    # Rows with no value at this level terminate here as leaves.
    entries = [
        {"name": label, "value": 1}
        for label in df.loc[lineage_ended].iloc[:, -1].tolist()
    ]

    # unique() keeps first-appearance order, unlike iterating over a set.
    for value in level[~lineage_ended].unique().tolist():
        in_branch = level.eq(value) & ~lineage_ended
        entries.append({
            "name": value,
            # Children are built from the matching rows without the
            # current level column.
            "children": create_entries(df.loc[in_branch].iloc[:, 1:]),
        })
    return entries

In [179]:
# Hang the top-level entries under a single synthetic "root" node.
dictionary = dict(name="root", children=create_entries(ancestry_df))

In [180]:
# Serialize the nested hierarchy to pretty-printed JSON text.
json_object = json.dumps(dictionary, indent=2)

# Write the JSON text to a file in the current working directory
# (the Colab backend when the notebook runs there).
with open('MammaliaCR.json', 'w', encoding='utf-8') as f:
    f.write(json_object)

# Offer the file as a browser download when running inside Google Colab.
# The import is guarded so the notebook still runs to completion elsewhere;
# in that case the file simply stays on disk.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; MammaliaCR.json was saved locally only.")
else:
    files.download('MammaliaCR.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>