### Importing libraries

In [1]:
import numpy as np
import pandas as pd
import country_converter as coco

### Loading data

In [2]:
# Load the dataset from 'data.csv', resolved relative to the working directory.
df = pd.read_csv('data.csv')

In [3]:
df.head()

Unnamed: 0,name,diet,period,lived_in,type,length,taxonomy,named_by,species,link
0,aardonyx,herbivorous,Early Jurassic 199-189 million years ago,South Africa,sauropod,8.0m,Dinosauria Saurischia Sauropodomorpha Prosauro...,Yates Bonnan Neveling Chinsamy and Blackbeard ...,celestae,https://www.nhm.ac.uk/discover/dino-directory/...
1,abelisaurus,carnivorous,Late Cretaceous 74-70 million years ago,Argentina,large theropod,9.0m,Dinosauria Saurischia Theropoda Neotheropoda C...,Bonaparte and Novas (1985),comahuensis,https://www.nhm.ac.uk/discover/dino-directory/...
2,achelousaurus,herbivorous,Late Cretaceous 83-70 million years ago,USA,ceratopsian,6.0m,Dinosauria Ornithischia Genasauria Cerapoda Ma...,Sampson (1995),horneri,https://www.nhm.ac.uk/discover/dino-directory/...
3,achillobator,carnivorous,Late Cretaceous 99-84 million years ago,Mongolia,large theropod,5.0m,Dinosauria Saurischia Theropoda Neotheropoda T...,Perle Norell and Clark (1999),giganteus,https://www.nhm.ac.uk/discover/dino-directory/...
4,acrocanthosaurus,carnivorous,Early Cretaceous 115-105 million years ago,USA,large theropod,12.0m,Dinosauria Saurischia Theropoda Neotheropoda T...,Stovall and Langston (1950),atokensis,https://www.nhm.ac.uk/discover/dino-directory/...


In [4]:
len(df)

309

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      309 non-null    object
 1   diet      309 non-null    object
 2   period    309 non-null    object
 3   lived_in  308 non-null    object
 4   type      309 non-null    object
 5   length    291 non-null    object
 6   taxonomy  309 non-null    object
 7   named_by  309 non-null    object
 8   species   304 non-null    object
 9   link      309 non-null    object
dtypes: object(10)
memory usage: 24.3+ KB


### Cleaning null values

In [6]:
df.isnull().sum()

name         0
diet         0
period       0
lived_in     1
type         0
length      18
taxonomy     0
named_by     0
species      5
link         0
dtype: int64

In [7]:
len(df.dropna(axis=0))

285

In [8]:
# Discard every row that has a missing value in any column, then renumber the
# surviving rows from 0 so the index has no gaps. Both steps modify df in place.
df.dropna(axis='index', how='any', inplace=True)
df.reset_index(inplace=True, drop=True)

In [9]:
df.head()

Unnamed: 0,name,diet,period,lived_in,type,length,taxonomy,named_by,species,link
0,aardonyx,herbivorous,Early Jurassic 199-189 million years ago,South Africa,sauropod,8.0m,Dinosauria Saurischia Sauropodomorpha Prosauro...,Yates Bonnan Neveling Chinsamy and Blackbeard ...,celestae,https://www.nhm.ac.uk/discover/dino-directory/...
1,abelisaurus,carnivorous,Late Cretaceous 74-70 million years ago,Argentina,large theropod,9.0m,Dinosauria Saurischia Theropoda Neotheropoda C...,Bonaparte and Novas (1985),comahuensis,https://www.nhm.ac.uk/discover/dino-directory/...
2,achelousaurus,herbivorous,Late Cretaceous 83-70 million years ago,USA,ceratopsian,6.0m,Dinosauria Ornithischia Genasauria Cerapoda Ma...,Sampson (1995),horneri,https://www.nhm.ac.uk/discover/dino-directory/...
3,achillobator,carnivorous,Late Cretaceous 99-84 million years ago,Mongolia,large theropod,5.0m,Dinosauria Saurischia Theropoda Neotheropoda T...,Perle Norell and Clark (1999),giganteus,https://www.nhm.ac.uk/discover/dino-directory/...
4,acrocanthosaurus,carnivorous,Early Cretaceous 115-105 million years ago,USA,large theropod,12.0m,Dinosauria Saurischia Theropoda Neotheropoda T...,Stovall and Langston (1950),atokensis,https://www.nhm.ac.uk/discover/dino-directory/...


### Preprocessing

In [10]:
df.columns

Index(['name', 'diet', 'period', 'lived_in', 'type', 'length', 'taxonomy',
       'named_by', 'species', 'link'],
      dtype='object')

In [11]:
df.nunique()

name        285
diet          5
period      147
lived_in     31
type          6
length       73
taxonomy     98
named_by    255
species     255
link        285
dtype: int64

---

In [12]:
df.diet.unique().tolist()

['herbivorous',
 'carnivorous',
 'omnivorous',
 'unknown',
 'herbivorous/omnivorous']

In [13]:
df[df.diet=='herbivorous/omnivorous']

Unnamed: 0,name,diet,period,lived_in,type,length,taxonomy,named_by,species,link
221,riojasaurus,herbivorous/omnivorous,Late Triassic 221-210 million years ago,Argentina,sauropod,5.15m,Dinosauria Saurischia Sauropodomorpha Prosauro...,Bonaparte (1969),incertus,https://www.nhm.ac.uk/discover/dino-directory/...


In [14]:
# Fold the mixed label into 'herbivorous'. The row is selected by value and
# written through a single .loc call: the previous `df.iloc[221].diet = ...`
# assigned to a temporary row object (chained assignment), which pandas does
# not guarantee to propagate back to df, and it depended on the row sitting
# at position 221.
df.loc[df['diet'] == 'herbivorous/omnivorous', 'diet'] = 'herbivorous'

In [15]:
df.diet.unique().tolist()

['herbivorous', 'carnivorous', 'omnivorous', 'unknown']

---

In [16]:
df.length.unique()

array(['8.0m', '9.0m', '6.0m', '5.0m', '12.0m', '15.0m', '1.5m', '21.0m',
       '7.0m', '2.0m', '3.5m', '18.0m', '1.3m', '0.5m', '35.0m', '3.0m',
       '2.3m', '1.0m', '14.0m', '24.0m', '10.0m', '30.0m', '23.0m',
       '7.6m', '1.1m', '4.0m', '11.0m', '1.7m', '2.1m', '1.8m', '0.65m',
       '0.25m', '8.1m', '20.0m', '26.0m', '6.4m', '13.0m', '6.2m', '3.4m',
       '4.2m', '4.7m', '0.8m', '4.6m', '12.5m', '8.6m', '21.5m', '1.2m',
       '4.5m', '6.5m', '22.0m', '8.2m', '0.6m', '5.7m', '4.4m', '7.4m',
       '1.6m', '5.3m', '1.4m', '28.0m', '25.0m', '6.8m', '0.9m', '2.4m',
       '4.1m', '2.5m', '5.15m', '6.6m', '0.45m', '2.37m', '5.5m', '8.5m',
       '7.5m', '17.0m'], dtype=object)

In [17]:
# Turn length strings such as '8.0m' into floats. Only a trailing 'm' suffix
# is stripped (instead of blindly dropping the last character), so a value
# without the suffix is not truncated, and the cell can be re-run after the
# column has already been converted to floats.
df['length'] = df['length'].astype(str).str.rstrip('m').astype(float)

---

In [18]:
df.type.unique()

array(['sauropod', 'large theropod', 'ceratopsian', 'euornithopod',
       'small theropod', 'armoured dinosaur'], dtype=object)

In [19]:
# df.period.unique()

In [20]:
def process_period(period):
    """Return the part of ``period`` that precedes its first digit.

    The string is scanned from the left; everything before the first
    character for which ``str.isdigit()`` is true is kept, and leading and
    trailing whitespace is stripped from the result. When ``period``
    contains no digit, the whole string (stripped) is returned.
    """
    first_digit_at = next(
        (pos for pos, char in enumerate(period) if char.isdigit()),
        len(period),
    )
    return period[:first_digit_at].strip()

In [21]:
# Replace each period string with the result of process_period (the text
# before its first digit).
df.period = [process_period(entry) for entry in df.period]

In [22]:
df.period.unique()

array(['Early Jurassic', 'Late Cretaceous', 'Early Cretaceous',
       'Late Jurassic', 'Mid Jurassic', 'Late Triassic'], dtype=object)

---

In [23]:
df

Unnamed: 0,name,diet,period,lived_in,type,length,taxonomy,named_by,species,link
0,aardonyx,herbivorous,Early Jurassic,South Africa,sauropod,8.0,Dinosauria Saurischia Sauropodomorpha Prosauro...,Yates Bonnan Neveling Chinsamy and Blackbeard ...,celestae,https://www.nhm.ac.uk/discover/dino-directory/...
1,abelisaurus,carnivorous,Late Cretaceous,Argentina,large theropod,9.0,Dinosauria Saurischia Theropoda Neotheropoda C...,Bonaparte and Novas (1985),comahuensis,https://www.nhm.ac.uk/discover/dino-directory/...
2,achelousaurus,herbivorous,Late Cretaceous,USA,ceratopsian,6.0,Dinosauria Ornithischia Genasauria Cerapoda Ma...,Sampson (1995),horneri,https://www.nhm.ac.uk/discover/dino-directory/...
3,achillobator,carnivorous,Late Cretaceous,Mongolia,large theropod,5.0,Dinosauria Saurischia Theropoda Neotheropoda T...,Perle Norell and Clark (1999),giganteus,https://www.nhm.ac.uk/discover/dino-directory/...
4,acrocanthosaurus,carnivorous,Early Cretaceous,USA,large theropod,12.0,Dinosauria Saurischia Theropoda Neotheropoda T...,Stovall and Langston (1950),atokensis,https://www.nhm.ac.uk/discover/dino-directory/...
...,...,...,...,...,...,...,...,...,...,...
280,yinlong,herbivorous,Mid Jurassic,China,ceratopsian,1.2,Dinosauria Ornithischia Genasauria Cerapoda Ma...,Xu Forster Clark and Mo (2006),downsi,https://www.nhm.ac.uk/discover/dino-directory/...
281,yuanmousaurus,herbivorous,Mid Jurassic,China,sauropod,17.0,Dinosauria Saurischia Sauropodomorpha Sauropod...,Lü Li Ji Wang Zhang and Dong (2006),jiangyiensis,https://www.nhm.ac.uk/discover/dino-directory/...
282,yunnanosaurus,omnivorous,Early Jurassic,China,sauropod,7.0,Dinosauria Saurischia Sauropodomorpha Prosauro...,Young (1942),huangi,https://www.nhm.ac.uk/discover/dino-directory/...
283,zalmoxes,herbivorous,Late Cretaceous,Romania,euornithopod,3.0,Dinosauria Ornithischia Genasauria Cerapoda Or...,Nopcsa (1902),robustus,https://www.nhm.ac.uk/discover/dino-directory/...


In [24]:
# Columns that are removed from df in the next cell.
cols_to_drop = ['taxonomy', 'named_by', 'link']

In [25]:
# Remove the columns listed in cols_to_drop from df, in place.
df.drop(columns=cols_to_drop, inplace=True)

In [26]:
df

Unnamed: 0,name,diet,period,lived_in,type,length,species
0,aardonyx,herbivorous,Early Jurassic,South Africa,sauropod,8.0,celestae
1,abelisaurus,carnivorous,Late Cretaceous,Argentina,large theropod,9.0,comahuensis
2,achelousaurus,herbivorous,Late Cretaceous,USA,ceratopsian,6.0,horneri
3,achillobator,carnivorous,Late Cretaceous,Mongolia,large theropod,5.0,giganteus
4,acrocanthosaurus,carnivorous,Early Cretaceous,USA,large theropod,12.0,atokensis
...,...,...,...,...,...,...,...
280,yinlong,herbivorous,Mid Jurassic,China,ceratopsian,1.2,downsi
281,yuanmousaurus,herbivorous,Mid Jurassic,China,sauropod,17.0,jiangyiensis
282,yunnanosaurus,omnivorous,Early Jurassic,China,sauropod,7.0,huangi
283,zalmoxes,herbivorous,Late Cretaceous,Romania,euornithopod,3.0,robustus


---

In [27]:
# Text columns whose values are converted to title case in the next cell.
cols_to_capitalize = ['name', 'diet', 'type', 'species']

In [28]:
# Title-case every value of each selected column (str.title() capitalises the
# first letter of each word, e.g. 'large theropod' -> 'Large Theropod').
for each in cols_to_capitalize:
    df[each] = [value.title() for value in df[each]]

In [29]:
df.head()

Unnamed: 0,name,diet,period,lived_in,type,length,species
0,Aardonyx,Herbivorous,Early Jurassic,South Africa,Sauropod,8.0,Celestae
1,Abelisaurus,Carnivorous,Late Cretaceous,Argentina,Large Theropod,9.0,Comahuensis
2,Achelousaurus,Herbivorous,Late Cretaceous,USA,Ceratopsian,6.0,Horneri
3,Achillobator,Carnivorous,Late Cretaceous,Mongolia,Large Theropod,5.0,Giganteus
4,Acrocanthosaurus,Carnivorous,Early Cretaceous,USA,Large Theropod,12.0,Atokensis


---
### Generating ISO-3 codes for the countries

In [30]:
# Distinct lived_in values, sorted alphabetically.
country_list = df.lived_in.unique().tolist()
country_list.sort()
# Wrapped in a NumPy array as the cell's displayed value.
np.array(country_list)

array(['Antarctica', 'Argentina', 'Australia', 'Brazil', 'Canada',
       'China', 'Egypt', 'France', 'Germany', 'India', 'Japan',
       'Kazakhstan', 'Madagascar', 'Malawi', 'Mongolia', 'Morocco',
       'Niger', 'North Africa', 'Romania', 'Russia', 'South Africa',
       'Spain', 'Switzerland', 'Tanzania', 'Tunisia', 'USA',
       'United Kingdom', 'Uruguay', 'Uzbekistan', 'Wales', 'Zimbabwe'],
      dtype='<U14')

In [31]:
# Converter instance reused for the country-name lookups in the cells below.
cc = coco.CountryConverter()

In [32]:
# Convert the distinct names to ISO3 codes. The messages printed by this call
# show which names the converter could not match; iso_codes itself is not
# used again in the visible cells.
iso_codes = cc.convert(country_list, to='ISO3')

North Africa not found in regex
Wales not found in regex


In [33]:
# 'North Africa' and 'Wales' were not matched by the converter, so both are relabelled below:
#   - 'North Africa' -> 'South Africa'
#   - 'Wales'        -> 'United Kingdom'

In [34]:
df[df.lived_in=='North Africa']

Unnamed: 0,name,diet,period,lived_in,type,length,species
53,Carcharodontosaurus,Carnivorous,Late Cretaceous,North Africa,Large Theropod,15.0,Saharicus


In [35]:
df[df.lived_in=='Wales']

Unnamed: 0,name,diet,period,lived_in,type,length,species
193,Pantydraco,Herbivorous,Early Jurassic,Wales,Euornithopod,3.0,Caducus


In [36]:
# Relabel the two names the converter could not match. Rows are selected by
# value instead of by the hard-coded row labels 53 and 193, so the result
# stays correct if the row order or the input data changes; `df.at` with a
# stale label would overwrite the wrong row, or add a new row if the label
# no longer exists.
# NOTE(review): 'South Africa' is kept from the original notebook as the
# stand-in for 'North Africa', but it is a different part of the continent —
# confirm this substitution is intended.
df.loc[df['lived_in'] == 'North Africa', 'lived_in'] = 'South Africa'
df.loc[df['lived_in'] == 'Wales', 'lived_in'] = 'United Kingdom'

In [37]:
df.lived_in.nunique()

29

In [38]:
# Add a lived_in_iso column holding the ISO3 code for each lived_in value.
df['lived_in_iso'] = cc.pandas_convert(series=df.lived_in, to='ISO3')

In [39]:
df

Unnamed: 0,name,diet,period,lived_in,type,length,species,lived_in_iso
0,Aardonyx,Herbivorous,Early Jurassic,South Africa,Sauropod,8.0,Celestae,ZAF
1,Abelisaurus,Carnivorous,Late Cretaceous,Argentina,Large Theropod,9.0,Comahuensis,ARG
2,Achelousaurus,Herbivorous,Late Cretaceous,USA,Ceratopsian,6.0,Horneri,USA
3,Achillobator,Carnivorous,Late Cretaceous,Mongolia,Large Theropod,5.0,Giganteus,MNG
4,Acrocanthosaurus,Carnivorous,Early Cretaceous,USA,Large Theropod,12.0,Atokensis,USA
...,...,...,...,...,...,...,...,...
280,Yinlong,Herbivorous,Mid Jurassic,China,Ceratopsian,1.2,Downsi,CHN
281,Yuanmousaurus,Herbivorous,Mid Jurassic,China,Sauropod,17.0,Jiangyiensis,CHN
282,Yunnanosaurus,Omnivorous,Early Jurassic,China,Sauropod,7.0,Huangi,CHN
283,Zalmoxes,Herbivorous,Late Cretaceous,Romania,Euornithopod,3.0,Robustus,ROU


---
### Saving the preprocessed data

In [40]:
# Write the preprocessed frame to disk; index=False leaves the row index out
# of the file.
df.to_csv('data_preprocessed.csv', index=False)

---