In [1]:
import pandas as pd
import numpy as np

# Playground with a sample

In [2]:
# Sample data in column-oriented form: each key is a column name and each
# list holds that column's values, one entry per person.
people = dict(
    first=[
        "Karine",
        "Jeanne",
        "Eve",
        "Maceo",
        "Willow",
        "Vincent",
    ],
    last=[
        "Dagoury",
        "Dagoury",
        "Dagoury",
        "Dagoury",
        "Dagoury",
        "Dagoury",
    ],
    # Empty strings mark people without an email address.
    email=[
        "karine.dagoury@gmail.com",
        "jeannedgy@gmail.com",
        "eveceleste@gmail.com",
        "",
        "",
        "vincent.dagoury@gmail.com",
    ],
    birthYear=[1973, 2002, 2005, 2016, 2011, 1974],
)

In [3]:
# Build the frame from the dict of lists: keys become column labels.
df = pd.DataFrame(data=people)

In [4]:
df

Unnamed: 0,first,last,email,birthYear
0,Karine,Dagoury,karine.dagoury@gmail.com,1973
1,Jeanne,Dagoury,jeannedgy@gmail.com,2002
2,Eve,Dagoury,eveceleste@gmail.com,2005
3,Maceo,Dagoury,,2016
4,Willow,Dagoury,,2011
5,Vincent,Dagoury,vincent.dagoury@gmail.com,1974


## Column names come from the dict keys

In [5]:
df.columns

Index(['first', 'last', 'email', 'birthYear'], dtype='object')

## Renaming columns

In [6]:
# Assigning to .columns relabels every column at once, by position.
df.columns = [
    'first_name',
    'last_name',
    'email',
    'year',
]

In [7]:
df

Unnamed: 0,first_name,last_name,email,year
0,Karine,Dagoury,karine.dagoury@gmail.com,1973
1,Jeanne,Dagoury,jeannedgy@gmail.com,2002
2,Eve,Dagoury,eveceleste@gmail.com,2005
3,Maceo,Dagoury,,2016
4,Willow,Dagoury,,2011
5,Vincent,Dagoury,vincent.dagoury@gmail.com,1974


### Using a list comprehension

In [8]:
# Rebuild the label list with each column name upper-cased.
df.columns = [label.upper() for label in df.columns]

In [9]:
df

Unnamed: 0,FIRST_NAME,LAST_NAME,EMAIL,YEAR
0,Karine,Dagoury,karine.dagoury@gmail.com,1973
1,Jeanne,Dagoury,jeannedgy@gmail.com,2002
2,Eve,Dagoury,eveceleste@gmail.com,2005
3,Maceo,Dagoury,,2016
4,Willow,Dagoury,,2011
5,Vincent,Dagoury,vincent.dagoury@gmail.com,1974


### Using str.replace

In [10]:
# The .str accessor applies the replacement to every column label.
df.columns = df.columns.str.replace(pat='_', repl=' ')

In [11]:
df

Unnamed: 0,FIRST NAME,LAST NAME,EMAIL,YEAR
0,Karine,Dagoury,karine.dagoury@gmail.com,1973
1,Jeanne,Dagoury,jeannedgy@gmail.com,2002
2,Eve,Dagoury,eveceleste@gmail.com,2005
3,Maceo,Dagoury,,2016
4,Willow,Dagoury,,2011
5,Vincent,Dagoury,vincent.dagoury@gmail.com,1974


### Rename a single column in place

In [12]:
# Only the labels listed in the mapping are renamed; inplace=True mutates
# `df` itself instead of returning a renamed copy.
df.rename({'FIRST NAME': 'first'}, axis='columns', inplace=True)
df

Unnamed: 0,first,LAST NAME,EMAIL,YEAR
0,Karine,Dagoury,karine.dagoury@gmail.com,1973
1,Jeanne,Dagoury,jeannedgy@gmail.com,2002
2,Eve,Dagoury,eveceleste@gmail.com,2005
3,Maceo,Dagoury,,2016
4,Willow,Dagoury,,2011
5,Vincent,Dagoury,vincent.dagoury@gmail.com,1974


## Updating rows

### Updating one column of one filtered row

In [13]:
# Rebuild the frame from `people` so it has the original column labels again.
df = pd.DataFrame(data=people)

In [14]:
# .loc[row_mask, column] writes only to the 'first' cell of the rows whose
# first name is 'Karine'.
df.loc[df['first'].eq('Karine'), 'first'] = 'Edith'
df

Unnamed: 0,first,last,email,birthYear
0,Edith,Dagoury,karine.dagoury@gmail.com,1973
1,Jeanne,Dagoury,jeannedgy@gmail.com,2002
2,Eve,Dagoury,eveceleste@gmail.com,2005
3,Maceo,Dagoury,,2016
4,Willow,Dagoury,,2011
5,Vincent,Dagoury,vincent.dagoury@gmail.com,1974


### Updating one column by a function

In [39]:
# Overwrite the 'last' column with its upper-cased version (vectorized
# string method, no explicit loop).
df['last'] = df.loc[:, 'last'].str.upper()
df

Unnamed: 0,first,last,email,birthYear
0,Karine,DAGOURY,karine.dagoury@gmail.com,1973
1,Jeanne,DAGOURY,jeannedgy@gmail.com,2002
2,Eve,DAGOURY,eveceleste@gmail.com,2005
3,Maceo,DAGOURY,,2016
4,Willow,DAGOURY,,2011
5,Vincent,DAGOURY,vincent.dagoury@gmail.com,1974


In [16]:
# Call len() on each email string; the result is a new Series of lengths.
df.loc[:, 'email'].apply(len)

0    24
1    19
2    20
3     0
4     0
5    25
Name: email, dtype: int64

## Updating data with 'Apply'

### Using a function

In [17]:
# Define a function that "verifies" an email
def verify_email(email):
  """Return ``email`` converted to upper case.

  Despite its name, this placeholder performs no validation: it only
  upper-cases the string it is given.
  """
  shouted = email.upper()
  return shouted

In [18]:
# Add a 'verified' column holding verify_email's result for each email
df['verified'] = df['email'].apply(func=verify_email)
df

Unnamed: 0,first,last,email,birthYear,verified
0,Edith,DAGOURY,karine.dagoury@gmail.com,1973,KARINE.DAGOURY@GMAIL.COM
1,Jeanne,DAGOURY,jeannedgy@gmail.com,2002,JEANNEDGY@GMAIL.COM
2,Eve,DAGOURY,eveceleste@gmail.com,2005,EVECELESTE@GMAIL.COM
3,Maceo,DAGOURY,,2016,
4,Willow,DAGOURY,,2011,
5,Vincent,DAGOURY,vincent.dagoury@gmail.com,1974,VINCENT.DAGOURY@GMAIL.COM


### Using a lambda

In [19]:
# The anonymous function is called once per value of the 'last' column.
df['lower_last'] = df['last'].apply(lambda surname: surname.lower())
df

Unnamed: 0,first,last,email,birthYear,verified,lower_last
0,Edith,DAGOURY,karine.dagoury@gmail.com,1973,KARINE.DAGOURY@GMAIL.COM,dagoury
1,Jeanne,DAGOURY,jeannedgy@gmail.com,2002,JEANNEDGY@GMAIL.COM,dagoury
2,Eve,DAGOURY,eveceleste@gmail.com,2005,EVECELESTE@GMAIL.COM,dagoury
3,Maceo,DAGOURY,,2016,,dagoury
4,Willow,DAGOURY,,2011,,dagoury
5,Vincent,DAGOURY,vincent.dagoury@gmail.com,1974,VINCENT.DAGOURY@GMAIL.COM,dagoury


### Using a list comprehension

In [20]:
# A plain Python list with one entry per row can be assigned as a new column.
df['lower_first'] = [given_name.lower() for given_name in df['first']]
df

Unnamed: 0,first,last,email,birthYear,verified,lower_last,lower_first
0,Edith,DAGOURY,karine.dagoury@gmail.com,1973,KARINE.DAGOURY@GMAIL.COM,dagoury,edith
1,Jeanne,DAGOURY,jeannedgy@gmail.com,2002,JEANNEDGY@GMAIL.COM,dagoury,jeanne
2,Eve,DAGOURY,eveceleste@gmail.com,2005,EVECELESTE@GMAIL.COM,dagoury,eve
3,Maceo,DAGOURY,,2016,,dagoury,maceo
4,Willow,DAGOURY,,2011,,dagoury,willow
5,Vincent,DAGOURY,vincent.dagoury@gmail.com,1974,VINCENT.DAGOURY@GMAIL.COM,dagoury,vincent


### Use a UUID as the index

In [21]:
import uuid

In [22]:
# Generate one random UUID per row, store it in a 'uuid' column, then promote
# that column to be the index (set_index drops it from the regular columns).
df = (
    pd.DataFrame(people)
    .assign(uuid=lambda frame: [uuid.uuid4() for _ in range(len(frame))])
    .set_index(keys='uuid')
)
df

Unnamed: 0_level_0,first,last,email,birthYear
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c8c97215-6795-40cb-8d10-f1264d50335f,Karine,Dagoury,karine.dagoury@gmail.com,1973
ae114d06-9b01-4dd6-8765-0a30415a507e,Jeanne,Dagoury,jeannedgy@gmail.com,2002
988f3dfc-d233-44e9-8549-869e72b2e5f8,Eve,Dagoury,eveceleste@gmail.com,2005
e282d65d-e632-456b-83cd-866aea32fb63,Maceo,Dagoury,,2016
16ae2870-1da1-4b2c-8ebc-6067c0015aad,Willow,Dagoury,,2011
3a480673-1d1c-44e3-80e5-bcd39597b352,Vincent,Dagoury,vincent.dagoury@gmail.com,1974


# Compute with DataFrame

In [40]:
# Length of every email address, computed on a freshly rebuilt frame
df = pd.DataFrame(data=people)
df['email'].apply(func=len)

0    24
1    19
2    20
3     0
4     0
5    25
Name: email, dtype: int64

In [41]:
# Using a lambda, compute each person's age as (current year - birth year).
# This is a calendar-year difference: it does not look at the birth date.
import datetime

# Read the clock once, outside the lambda, so every row is measured against
# the same year instead of calling now() again for each row.
current_year = datetime.datetime.now().year
df['age'] = df['birthYear'].apply(lambda birth_year: current_year - birth_year)
df

Unnamed: 0,first,last,email,birthYear,age
0,Karine,Dagoury,karine.dagoury@gmail.com,1973,47
1,Jeanne,Dagoury,jeannedgy@gmail.com,2002,18
2,Eve,Dagoury,eveceleste@gmail.com,2005,15
3,Maceo,Dagoury,,2016,4
4,Willow,Dagoury,,2011,9
5,Vincent,Dagoury,vincent.dagoury@gmail.com,1974,46


In [42]:
# With axis=1 ('columns') each row is passed to len() as a Series,
# so the result is the number of columns in every row.
df.apply(func=len, axis=1)

0    5
1    5
2    5
3    5
4    5
5    5
dtype: int64

In [43]:
# With axis=0 ('index') each column is passed to len() as a Series,
# so the result is the number of rows in every column.
df.apply(func=len, axis='index')

first        6
last         6
email        6
birthYear    6
age          6
dtype: int64

In [27]:
df.shape

(6, 5)

In [28]:
# Column-wise minimum: pd.Series.min is called once per column
df.apply(pd.Series.min, axis='index')

first            Eve
last         Dagoury
email               
birthYear       1973
age                4
dtype: object

In [29]:
# Column-wise maximum: pd.Series.max is called once per column
df.apply(pd.Series.max, axis='index')

first                           Willow
last                           Dagoury
email        vincent.dagoury@gmail.com
birthYear                         2016
age                                 47
dtype: object

In [30]:
# Same column-wise minimum, written with a lambda that receives each column
df.apply(lambda column: column.min())

first            Eve
last         Dagoury
email               
birthYear       1973
age                4
dtype: object

# Applymap

In [44]:
# Apply a lambda to every individual cell of the frame: it checks the type
# of the value and returns its length for strings, '' for anything else.
# NOTE: newer pandas releases deprecate `applymap` in favour of
# `DataFrame.map`; kept as-is for the pandas version this notebook targets.
df = pd.DataFrame(data=people)
df.applymap(lambda value: len(value) if isinstance(value, str) else '')

Unnamed: 0,first,last,email,birthYear
0,6,7,24,
1,6,7,19,
2,3,7,20,
3,5,7,0,
4,6,7,0,
5,7,7,25,


# Map

In [32]:
# Series.map with a dict looks each value up in the dict; values that have
# no matching key come back as NaN.
df = pd.DataFrame(data=people)
df['first'].map(arg={'Karine': 'Edith'})

0    Edith
1      NaN
2      NaN
3      NaN
4      NaN
5      NaN
Name: first, dtype: object

In [33]:
# Unlike map, replace swaps only the values that match a key and keeps
# everything else unchanged. The result is a Series, not a DataFrame.
df2 = df['first'].replace(to_replace={'Karine': 'Edith'})

In [34]:
df2

0      Edith
1     Jeanne
2        Eve
3      Maceo
4     Willow
5    Vincent
Name: first, dtype: object

# Write the DataFrame to a CSV file

In [35]:
# Write the frame (index included) to a comma-separated UTF-8 file
# in the current working directory.
df.to_csv(
    path_or_buf='output.csv',
    sep=',',
    encoding='utf-8',
)

# Write the DataFrame to an Excel file

In [36]:
%pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


In [45]:
# Importing openpyxl here fails fast if the package is not installed;
# the name itself is not used directly in this cell.
import openpyxl

# Write the frame to a worksheet named 'data' in output.xlsx.
df.to_excel(
    excel_writer='output.xlsx',
    sheet_name='data',
)

In [46]:
df.head()

Unnamed: 0,first,last,email,birthYear
0,Karine,Dagoury,karine.dagoury@gmail.com,1973
1,Jeanne,Dagoury,jeannedgy@gmail.com,2002
2,Eve,Dagoury,eveceleste@gmail.com,2005
3,Maceo,Dagoury,,2016
4,Willow,Dagoury,,2011
