# Data Wrangling in Python

## Import Packages

In [2]:
import pandas as pd
import os

In [3]:
# Show the kernel's current working directory; the relative path used to load
# the CSV further down is resolved against this location.
os.getcwd()

'/Users/dannylumian/Documents/GitHub/DS-Student-Resources/DS104-Data-Wrangling-and-Visualization/Workshops/02-Data-Wrangling-in-Python'

## Load in Data

In [5]:
# Location of the volcano events file, relative to the notebook's working directory.
volcanoes_csv_path = "../../Data/volcanoes.csv"
# Load the CSV into a DataFrame with pandas' default parsing options.
volcanoes = pd.read_csv(volcanoes_csv_path)

In [6]:
# Preview the first five rows of the freshly loaded data.
volcanoes.head(n=5)

Unnamed: 0,volcano_number,volcano_name,eruption_number,eruption_start_year,event_number,event_type,event_remarks,event_date_year,event_date_month,event_date_day
0,210020,Chaine des Puys,10011,-4040,100001,Explosion,,,,
1,210020,Chaine des Puys,10011,-4040,100002,Scoria,,,,
2,210020,Chaine des Puys,10011,-4040,100003,Pyroclastic flow,,,,
3,210020,Chaine des Puys,10011,-4040,100004,Lava flow(s),,,,
4,210020,Chaine des Puys,10011,-4040,100005,Cinder cone formation,,,,


In [7]:
# List every distinct value found in the event_type column.
volcanoes["event_type"].unique()

array(['Explosion', 'Scoria', 'Pyroclastic flow', 'Lava flow(s)',
       'Cinder cone formation', 'Pumice', 'Flames', 'Phreatic activity',
       'Ash', 'Lapilli', 'Lahar or mudflow', 'Bombs',
       'Lava dome formation', 'Phreatomagmatic eruption',
       'Property damage', 'Earthquakes (undefined)', 'Blocks',
       'Lava lake', 'Fatalities', 'Evacuations', 'Lava fountains',
       'Tsunami', 'Caldera formation', 'Eruption cloud',
       'Seismicity (volcanic)', 'Partial collapse at end of eruption',
       'Volcanic tremor', 'Debris avalanches', 'Deformation (inflation)',
       'Deformation (deflation)', 'Loud audible noises',
       'Deformation (undefined)', 'Glow', 'Fauna kill',
       'Edifice destroyed', 'Lightning', 'Tephra', 'Crater formation',
       'Island formation', 'Spine formation', 'Earthquake (tectonic)',
       'Directed explosion', 'Mud', 'Fumarolic or solfataric',
       'Volcanic smoke', 'Liquid sulfur', 'Jokulhaup',
       'Fissure formation', 'Water fountain'], dtype=object)

In [8]:
# Show only the first five distinct volcano names to keep the output short.
volcanoes["volcano_name"].unique()[:5]

array(['Chaine des Puys', 'Calatrava Volcanic Field', 'Vulsini',
       'Colli Albani', 'Campi Flegrei'], dtype=object)

In [10]:
# Report the smallest and largest volcano number, one value per line.
volcano_numbers = volcanoes["volcano_number"]
print(volcano_numbers.min(), volcano_numbers.max(), sep="\n")

210010
600000


## Add a New Column Filled with a Value

In [11]:
# Create a new column and broadcast the scalar 'Y' to every row.
volcanoes.loc[:, 'ErruptionYN'] = 'Y'

In [12]:
# Preview the first five rows; the new constant column appears last.
volcanoes.head(n=5)

Unnamed: 0,volcano_number,volcano_name,eruption_number,eruption_start_year,event_number,event_type,event_remarks,event_date_year,event_date_month,event_date_day,ErruptionYN
0,210020,Chaine des Puys,10011,-4040,100001,Explosion,,,,,Y
1,210020,Chaine des Puys,10011,-4040,100002,Scoria,,,,,Y
2,210020,Chaine des Puys,10011,-4040,100003,Pyroclastic flow,,,,,Y
3,210020,Chaine des Puys,10011,-4040,100004,Lava flow(s),,,,,Y
4,210020,Chaine des Puys,10011,-4040,100005,Cinder cone formation,,,,,Y


## Renaming Columns

In [13]:
# Rename event_remarks to notes, modifying the existing DataFrame in place.
volcanoes.rename({'event_remarks': 'notes'}, axis="columns", inplace=True)

In [14]:
# Preview the first five rows to confirm the column is now called notes.
volcanoes.head(n=5)

Unnamed: 0,volcano_number,volcano_name,eruption_number,eruption_start_year,event_number,event_type,notes,event_date_year,event_date_month,event_date_day,ErruptionYN
0,210020,Chaine des Puys,10011,-4040,100001,Explosion,,,,,Y
1,210020,Chaine des Puys,10011,-4040,100002,Scoria,,,,,Y
2,210020,Chaine des Puys,10011,-4040,100003,Pyroclastic flow,,,,,Y
3,210020,Chaine des Puys,10011,-4040,100004,Lava flow(s),,,,,Y
4,210020,Chaine des Puys,10011,-4040,100005,Cinder cone formation,,,,,Y


## Combining Columns

Both columns must contain strings (i.e., share the same data type) for the concatenation below to work.

### Check the Data Types

In [15]:
# Print each column's dtype and non-null count so the types of the columns
# to be combined can be checked before concatenating them.
volcanoes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41322 entries, 0 to 41321
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   volcano_number       41322 non-null  int64  
 1   volcano_name         41322 non-null  object 
 2   eruption_number      41322 non-null  int64  
 3   eruption_start_year  41322 non-null  int64  
 4   event_number         41322 non-null  int64  
 5   event_type           41322 non-null  object 
 6   notes                4880 non-null   object 
 7   event_date_year      10007 non-null  float64
 8   event_date_month     7132 non-null   float64
 9   event_date_day       5923 non-null   float64
 10  ErruptionYN          41322 non-null  object 
dtypes: float64(3), int64(4), object(4)
memory usage: 3.5+ MB


In [16]:
# Build "name,event type" strings: append a comma to each volcano name, then
# append the event type (mapped through str so it is guaranteed to be text).
event_type_text = volcanoes['event_type'].map(str)
volcanoes['NameType'] = volcanoes['volcano_name'].add(",").add(event_type_text)

In [17]:
# Preview the first five rows; the combined NameType column appears last.
volcanoes.head(n=5)

Unnamed: 0,volcano_number,volcano_name,eruption_number,eruption_start_year,event_number,event_type,notes,event_date_year,event_date_month,event_date_day,ErruptionYN,NameType
0,210020,Chaine des Puys,10011,-4040,100001,Explosion,,,,,Y,"Chaine des Puys,Explosion"
1,210020,Chaine des Puys,10011,-4040,100002,Scoria,,,,,Y,"Chaine des Puys,Scoria"
2,210020,Chaine des Puys,10011,-4040,100003,Pyroclastic flow,,,,,Y,"Chaine des Puys,Pyroclastic flow"
3,210020,Chaine des Puys,10011,-4040,100004,Lava flow(s),,,,,Y,"Chaine des Puys,Lava flow(s)"
4,210020,Chaine des Puys,10011,-4040,100005,Cinder cone formation,,,,,Y,"Chaine des Puys,Cinder cone formation"


## Separating Columns

In [18]:
# Split "name,event type" back into its two parts.
#
# Splitting on every comma produced a third column, which means some combined
# values hold more than one comma (presumably inside the volcano name, since
# none of the listed event types contains one). For those rows a plain split
# leaves a name fragment in column 1 and pushes the real event type into
# column 2.
#
# Splitting once, from the right, keeps the whole name in column 0 and the
# event type in column 1. reindex then restores an (empty) column 2 so the
# rename and drop steps later in the notebook still find it.
volcanoes1 = (
    volcanoes['NameType']
    .str.rsplit(',', n=1, expand=True)
    .reindex(columns=[0, 1, 2])
)

In [19]:
# Preview the split result; the pieces land in integer-labelled columns.
volcanoes1.head(n=5)

Unnamed: 0,0,1,2
0,Chaine des Puys,Explosion,
1,Chaine des Puys,Scoria,
2,Chaine des Puys,Pyroclastic flow,
3,Chaine des Puys,Lava flow(s),
4,Chaine des Puys,Cinder cone formation,


In [20]:
# Place the split columns side by side with the original frame (rows are
# matched on the index).
volcanoes2 = pd.concat(objs=[volcanoes, volcanoes1], axis="columns")

In [21]:
# Preview the first five rows; the split columns 0, 1 and 2 are appended at the end.
volcanoes2.head(n=5)

Unnamed: 0,volcano_number,volcano_name,eruption_number,eruption_start_year,event_number,event_type,notes,event_date_year,event_date_month,event_date_day,ErruptionYN,NameType,0,1,2
0,210020,Chaine des Puys,10011,-4040,100001,Explosion,,,,,Y,"Chaine des Puys,Explosion",Chaine des Puys,Explosion,
1,210020,Chaine des Puys,10011,-4040,100002,Scoria,,,,,Y,"Chaine des Puys,Scoria",Chaine des Puys,Scoria,
2,210020,Chaine des Puys,10011,-4040,100003,Pyroclastic flow,,,,,Y,"Chaine des Puys,Pyroclastic flow",Chaine des Puys,Pyroclastic flow,
3,210020,Chaine des Puys,10011,-4040,100004,Lava flow(s),,,,,Y,"Chaine des Puys,Lava flow(s)",Chaine des Puys,Lava flow(s),
4,210020,Chaine des Puys,10011,-4040,100005,Cinder cone formation,,,,,Y,"Chaine des Puys,Cinder cone formation",Chaine des Puys,Cinder cone formation,


In [22]:
# Give the first two integer-labelled split columns descriptive names (in place).
split_column_names = {0: "Name", 1: 'Type'}
volcanoes2.rename(columns=split_column_names, inplace=True)

In [23]:
# Preview the first five rows to confirm columns 0 and 1 are now Name and Type.
volcanoes2.head(n=5)

Unnamed: 0,volcano_number,volcano_name,eruption_number,eruption_start_year,event_number,event_type,notes,event_date_year,event_date_month,event_date_day,ErruptionYN,NameType,Name,Type,2
0,210020,Chaine des Puys,10011,-4040,100001,Explosion,,,,,Y,"Chaine des Puys,Explosion",Chaine des Puys,Explosion,
1,210020,Chaine des Puys,10011,-4040,100002,Scoria,,,,,Y,"Chaine des Puys,Scoria",Chaine des Puys,Scoria,
2,210020,Chaine des Puys,10011,-4040,100003,Pyroclastic flow,,,,,Y,"Chaine des Puys,Pyroclastic flow",Chaine des Puys,Pyroclastic flow,
3,210020,Chaine des Puys,10011,-4040,100004,Lava flow(s),,,,,Y,"Chaine des Puys,Lava flow(s)",Chaine des Puys,Lava flow(s),
4,210020,Chaine des Puys,10011,-4040,100005,Cinder cone formation,,,,,Y,"Chaine des Puys,Cinder cone formation",Chaine des Puys,Cinder cone formation,


## Dropping Columns

In [24]:
# Remove the leftover split column labelled 2 from the DataFrame in place.
volcanoes2.drop(columns=[2], inplace=True)

In [25]:
# Preview the first five rows to confirm column 2 is gone.
volcanoes2.head(n=5)

Unnamed: 0,volcano_number,volcano_name,eruption_number,eruption_start_year,event_number,event_type,notes,event_date_year,event_date_month,event_date_day,ErruptionYN,NameType,Name,Type
0,210020,Chaine des Puys,10011,-4040,100001,Explosion,,,,,Y,"Chaine des Puys,Explosion",Chaine des Puys,Explosion
1,210020,Chaine des Puys,10011,-4040,100002,Scoria,,,,,Y,"Chaine des Puys,Scoria",Chaine des Puys,Scoria
2,210020,Chaine des Puys,10011,-4040,100003,Pyroclastic flow,,,,,Y,"Chaine des Puys,Pyroclastic flow",Chaine des Puys,Pyroclastic flow
3,210020,Chaine des Puys,10011,-4040,100004,Lava flow(s),,,,,Y,"Chaine des Puys,Lava flow(s)",Chaine des Puys,Lava flow(s)
4,210020,Chaine des Puys,10011,-4040,100005,Cinder cone formation,,,,,Y,"Chaine des Puys,Cinder cone formation",Chaine des Puys,Cinder cone formation


## Keeping Columns

In [26]:
# Keep every row but only the Name and Type columns.
volcanoes3 = volcanoes2.loc[:, ['Name', 'Type']]

In [27]:
# Preview the first five rows of the two-column frame.
volcanoes3.head(n=5)

Unnamed: 0,Name,Type
0,Chaine des Puys,Explosion
1,Chaine des Puys,Scoria
2,Chaine des Puys,Pyroclastic flow
3,Chaine des Puys,Lava flow(s)
4,Chaine des Puys,Cinder cone formation


## Subsetting Rows

In [28]:
# Take the first two rows by position.
volcanoes4 = volcanoes3.iloc[:2]

In [29]:
# Display the subset; it holds only two rows, so both are shown.
volcanoes4.head(n=5)

Unnamed: 0,Name,Type
0,Chaine des Puys,Explosion
1,Chaine des Puys,Scoria
