# Data Manipulation

This notebook goes through the process of cleaning and analysing data on world plastic waste.

## Imports

In [1]:
import pandas as pd

## Data sourcing

In [2]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Read the raw plastics dataset from the local data folder into a DataFrame.
country_data = pd.read_csv(filepath_or_buffer='./data/plastics.csv')

In [4]:
# Peek at the top of the dataframe; head() returns 5 rows unless told otherwise

country_data.head(n=1)

Unnamed: 0,variables,code,PCapita plastic waste (kg per person per day),gdp_pc,otalpopulatio,year,Mismanaged plastic waste (tonnes),coastal_pop,variables.1
0,Albania,ALB,0.069,$9927.18184117512,3204284,2011,29705,2530533,Albania


In [5]:
# Peek at the bottom of the dataframe; tail() returns 5 rows unless told otherwise

country_data.tail(n=2)

Unnamed: 0,variables,code,PCapita plastic waste (kg per person per day),gdp_pc,otalpopulatio,year,Mismanaged plastic waste (tonnes),coastal_pop,variables.1
184,Vietnam,VNM,0.103,$4408.16861192198,87848445,2011,1833819,55858245,Vietnam
185,Yemen,YEM,0.103,$4478.7435991429,NOT KNOWN,2011,169181,6048920,Yemen


In [6]:
# Show two randomly chosen rows (sample() returns a single row by default).
# A fixed random_state keeps the selection identical on every re-run.

country_data.sample(n=2, random_state=42)

Unnamed: 0,variables,code,PCapita plastic waste (kg per person per day),gdp_pc,otalpopulatio,year,Mismanaged plastic waste (tonnes),coastal_pop,variables.1
10,Bangladesh,BGD,0.034,$2442.72888765696,148692131,2011,787327,70874124,Bangladesh
152,Sierra Leone,SLE,0.041,$1199.98985027502,5867536,2011,36408,2887017,Sierra Leone


In [7]:
# Confirm that read_csv produced a pandas DataFrame, the tabular structure
# that the rest of this notebook works with

type(country_data)

pandas.core.frame.DataFrame

## Data cleaning

### Fixing Columns

In [8]:
# List the current column names before renaming the unclear ones
country_data.columns

Index(['variables', 'code', 'PCapita plastic waste (kg per person per day)',
       'gdp_pc', 'otalpopulatio', 'year', 'Mismanaged plastic waste (tonnes)',
       'coastal_pop', 'variables.1'],
      dtype='object')

**Most pandas methods return a modified copy instead of changing the original DataFrame. Assign the result back to a variable to keep the changes.**

In [9]:
# Swap the awkward source headers for short snake_case names; rename() returns
# a new frame, so the result is assigned back to country_data.
country_data = country_data.rename(
    mapper={
        "Mismanaged plastic waste (tonnes)": "plastic_mm",
        "otalpopulatio": "total_pop",
        "PCapita plastic waste (kg per person per day)": "plastic_pc",
        "variables": "country",
    },
    axis="columns",
)

### Drop unwanted columns

In [10]:
# Count how often each year occurs to see whether the column carries any information
country_data["year"].value_counts()

2011    186
Name: year, dtype: int64

In [11]:
# Count the rows in which the "variables.1" column agrees with "country"
country_data["country"].eq(country_data["variables.1"]).sum()

186

In [12]:
# Remove the duplicated country column and the year column
country_data = country_data.drop(columns=["variables.1", "year"])

In [13]:
# Spot-check one randomly chosen row after the rename/drop steps; the fixed
# random_state makes the same row appear on every re-run.
country_data.sample(random_state=42)

Unnamed: 0,country,code,plastic_pc,gdp_pc,total_pop,plastic_mm,coastal_pop
55,Finland,FIN,0.234,$39848.1344978775,5364546,4985,2927674


### Types

In [14]:
# Inspect the dtype pandas inferred for each column
country_data.dtypes

country         object
code            object
plastic_pc     float64
gdp_pc          object
total_pop       object
plastic_mm       int64
coastal_pop      int64
dtype: object

In [15]:
# Look at the raw total_pop values
country_data["total_pop"]

0        3204284
1       35468208
2       19081912
3          15358
4          88710
         ...    
181      3368786
182       239651
183     28979857
184     87848445
185    NOT KNOWN
Name: total_pop, Length: 186, dtype: object

In [16]:
# Look at the raw gdp_pc values
country_data["gdp_pc"]

0       $9927.18184117512
1       $12870.6026985154
2      $5897.682840598482
3                     NaN
4       $19212.7201307541
              ...        
181     $17082.4046586765
182     $2948.03293729183
183     $16544.9720552598
184     $4408.16861192198
185      $4478.7435991429
Name: gdp_pc, Length: 186, dtype: object

In [17]:
# Keep only the rows whose total_pop is something other than the 'NOT KNOWN' placeholder

not_known_filter = country_data["total_pop"].ne("NOT KNOWN")

country_data = country_data.loc[not_known_filter]

In [18]:
# Discard every row that has a missing (NaN) value in any column

country_data = country_data.dropna(how="any")

In [21]:
# Check how many rows and columns the dataframe has at this point
country_data.shape

(145, 7)

In [22]:
# Preview the first five rows of the dataframe
country_data.head()

Unnamed: 0,country,code,plastic_pc,gdp_pc,total_pop,plastic_mm,coastal_pop
0,Albania,ALB,0.069,$9927.18184117512,3204284,29705,2530533
1,Algeria,DZA,0.144,$12870.6026985154,35468208,520555,16556580
2,Angola,AGO,0.062,$5897.682840598482,19081912,62528,3790041
4,Antigua and Barbuda,ATG,0.66,$19212.7201307541,88710,1253,66843
5,Argentina,ARG,0.183,$18712.063077343602,40412376,157777,16449245


### Cleaning strings

In [26]:
# Strip the literal dollar sign from gdp_pc; regex=False stops "$" being
# interpreted as a regular-expression end-of-string anchor.
country_data["gdp_pc"] = country_data["gdp_pc"].str.replace(pat="$", repl="", regex=False)

### Changing types

In [28]:
# Check the column dtypes before converting the text columns to numbers
country_data.dtypes

country         object
code            object
plastic_pc     float64
gdp_pc          object
total_pop       object
plastic_mm       int64
coastal_pop      int64
dtype: object

In [29]:
# Parse the two text columns into numbers; to_numeric chooses a float or
# integer dtype based on the values it finds.
country_data = country_data.assign(
    gdp_pc=pd.to_numeric(country_data["gdp_pc"]),
    total_pop=pd.to_numeric(country_data["total_pop"]),
)

In [30]:
# Check the column dtypes again after the numeric conversion
country_data.dtypes

country         object
code            object
plastic_pc     float64
gdp_pc         float64
total_pop        int64
plastic_mm       int64
coastal_pop      int64
dtype: object

In [31]:
# Preview the first five rows after the numeric conversion
country_data.head()

Unnamed: 0,country,code,plastic_pc,gdp_pc,total_pop,plastic_mm,coastal_pop
0,Albania,ALB,0.069,9927.181841,3204284,29705,2530533
1,Algeria,DZA,0.144,12870.602699,35468208,520555,16556580
2,Angola,AGO,0.062,5897.682841,19081912,62528,3790041
4,Antigua and Barbuda,ATG,0.66,19212.720131,88710,1253,66843
5,Argentina,ARG,0.183,18712.063077,40412376,157777,16449245


In [32]:
# Keep two decimal places for the gdp_pc figures
country_data["gdp_pc"] = country_data["gdp_pc"].round(decimals=2)

In [33]:
# Preview the first five rows after rounding gdp_pc
country_data.head()

Unnamed: 0,country,code,plastic_pc,gdp_pc,total_pop,plastic_mm,coastal_pop
0,Albania,ALB,0.069,9927.18,3204284,29705,2530533
1,Algeria,DZA,0.144,12870.6,35468208,520555,16556580
2,Angola,AGO,0.062,5897.68,19081912,62528,3790041
4,Antigua and Barbuda,ATG,0.66,19212.72,88710,1253,66843
5,Argentina,ARG,0.183,18712.06,40412376,157777,16449245


In [35]:
# Write the cleaned table to disk, leaving out the row index
country_data.to_csv(path_or_buf="./data/clean_plastics.csv", index=False)

## Summarisation

In [37]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
country_data.describe()

Unnamed: 0,plastic_pc,gdp_pc,total_pop,plastic_mm,coastal_pop
count,145.0,145.0,145.0,145.0,145.0
mean,0.197503,19240.419103,43384160.0,213319.5,13432340.0
std,0.314226,20189.229524,154605600.0,824201.1,34764980.0
min,0.01,660.21,9827.0,93.0,11563.0
25%,0.087,4984.19,1341140.0,5480.0,639228.0
50%,0.144,12123.84,6192993.0,21754.0,2820558.0
75%,0.223,29221.99,31671590.0,91571.0,8971770.0
max,3.6,125140.84,1341335000.0,8819717.0,262892400.0


In [45]:
# Order countries from the highest to the lowest per-capita plastic waste
country_data.sort_values(by="plastic_pc", ascending=False)

Unnamed: 0,country,code,plastic_pc,gdp_pc,total_pop,plastic_mm,coastal_pop
172,Trinidad and Tobago,TTO,3.600,31260.91,1341465,94066,1358433
91,Kuwait,KWT,0.686,75204.15,2736732,11489,2293604
4,Antigua and Barbuda,ATG,0.660,19212.72,88710,1253,66843
143,Saint Kitts and Nevis,KNA,0.654,21412.20,52402,715,36102
74,Guyana,GUY,0.586,5847.86,754493,42002,513235
...,...,...,...,...,...,...,...
19,Brunei,BRN,0.026,80552.86,398920,93,359871
167,Tanzania,TZA,0.023,2090.58,44841226,48586,6688695
98,Madagascar,MDG,0.016,1385.69,20713819,34522,7062413
112,Mozambique,MOZ,0.015,917.76,23390765,45995,9566559
