# Example data transformations in Python using the Iris dataset

In [1]:
from IPython.display import VimeoVideo
# Embed the tutorial video "Data transformations in Python using the Iris dataset"
# (Vimeo ID 644139540) at 720x480.
VimeoVideo('644139540', width=720, height=480)

https://vimeo.com/644139540

This notebook uses a built-in seaborn dataset—the iris dataset—to highlight basic data transformations in Python.

Overview:
* Load data
* Inspect data
* Groupby operations
* Subset data
* Recode data

In [1]:
import pandas as pd
import seaborn as sns

In [2]:
# Load the iris dataset through seaborn's dataset loader, then preview the first rows.
iris = sns.load_dataset('iris')
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
# Column labels of the DataFrame.
iris.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [4]:
# Dimensions of the DataFrame as (rows, columns).
iris.shape

(150, 5)

In [5]:
# Per-column non-null counts and dtypes, plus overall memory usage.
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


## Groupby functions

In [7]:
# Mean sepal_length within each species; the result is a Series indexed by species.
gby = iris.groupby('species')['sepal_length'].mean()

In [8]:
# There are only three species, so head() displays the entire result.
gby.head()

species
setosa        5.006
versicolor    5.936
virginica     6.588
Name: sepal_length, dtype: float64

In [9]:
# Per-species mean of every measurement column; species becomes the index.
gby2 = iris.groupby('species').mean()

In [10]:
# Display the table of per-species means.
gby2

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,3.428,1.462,0.246
versicolor,5.936,2.77,4.26,1.326
virginica,6.588,2.974,5.552,2.026


In [35]:
# Number of non-null values in each column, per species.
# NOTE(review): the recorded execution count for this cell (35) is out of
# sequence for its position in the notebook, and count() reports whatever
# columns `iris` has at the moment the cell runs. Re-run the notebook top to
# bottom from a fresh kernel before relying on this result.
gby3 = iris.groupby('species').count()

In [36]:
# NOTE(review): the saved output lists sepal_length_recode and
# sepal_width_recode, columns that are only added to `iris` in the
# "Recode based on mean" section further down. The output was therefore
# produced after those cells ran, not in notebook order.
gby3

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width,sepal_length_recode,sepal_width_recode
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
setosa,50,50,50,50,50,50
versicolor,50,50,50,50,50,50
virginica,50,50,50,50,50,50


## Subset data

In [11]:
# Boolean-mask filter: keep only the rows whose species equals 'setosa'.
setosa = iris[iris['species']=='setosa']

In [12]:
# Compare dimensions before and after filtering (rows drop, columns stay the same).
print(f'Shape of full iris dataset: {iris.shape}\n')
print(f'Shape of subset setosa dataset: {setosa.shape}\n')

Shape of full iris dataset: (150, 5)

Shape of subset setosa dataset: (50, 5)



In [13]:
# Preview the first rows of the setosa-only subset.
setosa.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [14]:
# Numeric filter: keep rows with sepal_length strictly greater than 5.0
# (rows equal to 5.0 are excluded).
sepal_len_gt_5 = iris[iris['sepal_length']>5.0]

In [15]:
# Compare dimensions before and after the sepal_length filter.
print(f'Shape of full iris dataset: {iris.shape}\n')
print(f'Shape of subset sepal len > 5.0 dataset: {sepal_len_gt_5.shape}\n')

Shape of full iris dataset: (150, 5)

Shape of subset sepal len > 5.0 dataset: (118, 5)



## Create a subset with only setosa and virginica

In [16]:
# Step 1 of the two-species subset: isolate the setosa rows
# (same filter as the earlier `setosa` frame, under its own name).
setosa_only = iris[iris['species']=='setosa']
print(setosa_only.shape)

(50, 5)


In [17]:
# Step 2 of the two-species subset: isolate the virginica rows.
virginica_only = iris[iris['species']=='virginica']
print(virginica_only.shape)

(50, 5)


### Combine the two dataframes with `pd.concat`

In [18]:
# Stack the two subsets row-wise. Each row keeps the index label it had in
# `iris`, so the combined index is not contiguous.
merged_df = pd.concat([setosa_only, virginica_only])

In [19]:
# Index labels still come from the original frame (they jump from the
# setosa rows straight to the virginica labels).
merged_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [20]:
# Discard the index labels inherited from `iris` and renumber the rows from 0.
# The result is rebound to the same name rather than using inplace=True, which
# keeps the data flow explicit and leaves `merged_df` with identical contents.
merged_df = merged_df.reset_index(drop=True)

In [21]:
# After the reset the index is contiguous, starting at 0.
merged_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
95,6.7,3.0,5.2,2.3,virginica
96,6.3,2.5,5.0,1.9,virginica
97,6.5,3.0,5.2,2.0,virginica
98,6.2,3.4,5.4,2.3,virginica


## Alternative methods for building the same subset

In [22]:
# Alternative 1: exclude the single unwanted species instead of
# filtering twice and concatenating.
new_df = iris[iris['species'] != 'versicolor']
print(new_df.shape)

(100, 5)


In [23]:
# Alternative 2: keep rows whose species appears in an explicit list of wanted values.
new_df2 = iris[iris['species'].isin(['setosa', 'virginica'])]
print(new_df2.shape)

(100, 5)


## Recode based on mean

### Recode of Sepal Length

In [25]:
# Overall mean of sepal_length; used as the cut-off for the recode that follows.
sepal_length_mean = iris['sepal_length'].mean()
print(sepal_length_mean)

5.843333333333334


In [26]:
# Binary recode with an inline lambda: 1 when sepal_length is strictly above
# the overall mean, otherwise 0. This adds a new column to `iris` itself.
iris['sepal_length_recode'] = iris['sepal_length'].apply(lambda x: 1 if x > sepal_length_mean else 0)

In [27]:
# Confirm the new sepal_length_recode column is present.
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_length_recode
0,5.1,3.5,1.4,0.2,setosa,0
1,4.9,3.0,1.4,0.2,setosa,0
2,4.7,3.2,1.3,0.2,setosa,0
3,4.6,3.1,1.5,0.2,setosa,0
4,5.0,3.6,1.4,0.2,setosa,0


In [28]:
# How many rows fall into each recode category (0 = not above mean, 1 = above mean).
iris['sepal_length_recode'].value_counts()

0    80
1    70
Name: sepal_length_recode, dtype: int64

### Recode of sepal_width

In [30]:
# Overall mean of sepal_width; used as the cut-off by recode_sepal_width.
sepal_width_mean = iris['sepal_width'].mean()
print(sepal_width_mean)

3.0573333333333337


In [31]:
def recode_sepal_width(series, threshold=None):
    """Recode a single sepal-width value as a 0/1 indicator.

    Despite its name, ``series`` is one scalar value: the function is written
    to be passed to ``Series.apply``, which calls it once per element.

    Parameters
    ----------
    series : float
        One sepal-width measurement.
    threshold : float, optional
        Cut-off to compare against. When omitted, the notebook-level
        ``sepal_width_mean`` is looked up at call time, which is the
        original behaviour.

    Returns
    -------
    int
        1 if ``series`` is strictly greater than the threshold, otherwise 0.
        Values equal to the threshold, and NaN, map to 0.
    """
    if threshold is None:
        # Fall back to the global computed earlier in the notebook.
        threshold = sepal_width_mean
    return 1 if series > threshold else 0

In [32]:
# Same recode as before, this time with a named function applied to each
# sepal_width value. This adds another new column to `iris` itself.
iris['sepal_width_recode'] = iris['sepal_width'].apply(recode_sepal_width)

In [33]:
# Confirm both recode columns are now present.
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_length_recode,sepal_width_recode
0,5.1,3.5,1.4,0.2,setosa,0,1
1,4.9,3.0,1.4,0.2,setosa,0,0
2,4.7,3.2,1.3,0.2,setosa,0,1
3,4.6,3.1,1.5,0.2,setosa,0,1
4,5.0,3.6,1.4,0.2,setosa,0,1


In [34]:
# How many rows fall into each recode category (0 = not above mean, 1 = above mean).
iris['sepal_width_recode'].value_counts()

0    83
1    67
Name: sepal_width_recode, dtype: int64