# Pandas, quick introduction

In [2]:
import pandas as pd

# Pandas introduces a tabular data structure, the DataFrame

In [3]:
# Build the example frame column by column; the row labels are passed separately
# as the index. The missing height (None) is stored as NaN.
df = pd.DataFrame(
    {
        'name': ['Anthony', 'Maria', 'Emma', 'Philip', 'Bill'],
        'age': [28, 31, 26, 41, 27],
        'height': [1.53, 1.76, 1.83, 1.81, None],
    },
    index=['A484', 'C012', 'A123', 'B663', 'A377'],
)

In [4]:
# A bare expression at the end of a cell is displayed as a table
df

Unnamed: 0,name,age,height
A484,Anthony,28,1.53
C012,Maria,31,1.76
A123,Emma,26,1.83
B663,Philip,41,1.81
A377,Bill,27,


In [5]:
# First 3 rows, in their original order
df.head(3)

Unnamed: 0,name,age,height
A484,Anthony,28,1.53
C012,Maria,31,1.76
A123,Emma,26,1.83


In [6]:
# 3 rows picked at random: the selection differs on every run unless a seed is
# given through the `random_state` argument
df.sample(3)

Unnamed: 0,name,age,height
C012,Maria,31,1.76
A484,Anthony,28,1.53
A377,Bill,27,


## DataFrame attributes

In [7]:
# (number of rows, number of columns)
df.shape

(5, 3)

In [8]:
# Each column can have a different dtype
# Numeric columns use native NumPy dtypes (here int64 and float64); the string
# column `name` is stored with the generic `object` dtype
df.dtypes

name       object
age         int64
height    float64
dtype: object

In [9]:
# Column labels
df.columns

Index(['name', 'age', 'height'], dtype='object')

In [10]:
# Row labels
df.index

Index(['A484', 'C012', 'A123', 'B663', 'A377'], dtype='object')

## Indexing rows and columns

In [11]:
# Default indexing is by column: a single label returns that column,
# still labelled by the row index
df['age']

A484    28
C012    31
A123    26
B663    41
A377    27
Name: age, dtype: int64

In [12]:
# Use a list to select multiple columns (like in NumPy's fancy indexing);
# the columns come back in the order given in the list
df[['age', 'name']]

Unnamed: 0,age,name
A484,28,Anthony
C012,31,Maria
A123,26,Emma
B663,41,Philip
A377,27,Bill


In [13]:
# Indexing by row label / column name: `.loc[row, column]`
df.loc['A484', 'height']

1.53

In [14]:
# Indexing by element position, like in NumPy: `.iloc[row_number, column_number]`.
# It's a bit of a code smell: positions change if rows or columns are reordered,
# so prefer labels when possible
df.iloc[0, 2]

1.53

## Examining a column

In [15]:
# Summary statistics; `count` is 4 rather than 5 because the missing height
# is left out
df['height'].describe()

count    4.000000
mean     1.732500
std      0.138173
min      1.530000
25%      1.702500
50%      1.785000
75%      1.815000
max      1.830000
Name: height, dtype: float64

In [16]:
# Distinct values of the column
df['name'].unique()

array(['Anthony', 'Maria', 'Emma', 'Philip', 'Bill'], dtype=object)

In [17]:
# Number of distinct values
df['name'].nunique()

5

## Filtering

In [18]:
# `df['age'] > 30` is a boolean column; indexing with it keeps only the rows
# where it is True
df[df['age'] > 30]

Unnamed: 0,name,age,height
C012,Maria,31,1.76
B663,Philip,41,1.81


In [19]:
# Build each condition separately, then combine them element-wise with `&`
# (the Python keyword `and` does not work on whole columns)
age_over_30 = df['age'] > 30
height_over_1_80 = df['height'] > 1.8
is_old_and_tall = age_over_30 & height_over_1_80
df[is_old_and_tall]

Unnamed: 0,name,age,height
B663,Philip,41,1.81


## Basic operations are by column (unlike NumPy)

In [20]:
# Minimum of a single column
df['age'].min()

26

In [21]:
# On the whole frame, the minimum is computed separately for each column
# (strings are compared alphabetically; the missing height is skipped)
df.min()

name      Anthony
age            26
height       1.53
dtype: object

In [67]:
# Note that Pandas operations ignore NaNs (they consider them as "missing"):
# the mean height is computed over the 4 known values only.
# Only the numeric columns are selected: averaging the string column `name`
# emits a FutureWarning in pandas 1.x and raises a TypeError in pandas >= 2.0
df[['age', 'height']].mean()

  df.mean()


age       30.6000
height     1.7325
dtype: float64

In [60]:
# `numeric_only=True` restricts the operation to the numeric columns,
# leaving out `name`
df.mean(numeric_only=True)

age       30.6000
height     1.7325
dtype: float64

In [77]:
# Sorting reorders the rows, but every row keeps its own index label and the
# column labels stay the same
df.sort_values(by='name')

Unnamed: 0,name,age,height
A484,Anthony,28,1.53
A377,Bill,27,
A123,Emma,26,1.83
C012,Maria,31,1.76
B663,Philip,41,1.81


In [64]:
# The original frame is unchanged: `sort_values` returned a new, sorted frame
df

Unnamed: 0,name,age,height
A484,Anthony,28,1.53
C012,Maria,31,1.76
A123,Emma,26,1.83
B663,Philip,41,1.81
A377,Bill,27,


## Operations on strings

In [72]:
# Use `.str` to access string operations, applied to each element of the column
# Third character of each name (positions start at 0)
df['name'].str[2]

A484    t
C012    r
A123    m
B663    i
A377    l
Name: name, dtype: object

In [78]:
# Upper-case version of each name
df['name'].str.upper()

A484    ANTHONY
C012      MARIA
A123       EMMA
B663     PHILIP
A377       BILL
Name: name, dtype: object

In [81]:
# Number of occurrences of 'a' in each name; the match is case-sensitive,
# so the capital 'A' of 'Anthony' is not counted
df['name'].str.count('a')

A484    0
C012    2
A123    1
B663    0
A377    0
Name: name, dtype: int64

In [83]:
# Lower-casing first makes the count case-insensitive; `.str` has to be repeated
# for each chained string operation
df['name'].str.lower().str.count('a')

A484    1
C012    2
A123    1
B663    0
A377    0
Name: name, dtype: int64

## Adding new columns

In [84]:
# The frame as it stands, with its three original columns
df

Unnamed: 0,name,age,height
A484,Anthony,28,1.53
C012,Maria,31,1.76
A123,Emma,26,1.83
B663,Philip,41,1.81
A377,Bill,27,


In [85]:
# Assigning to a column label that does not exist yet creates the column
# (this modifies `df` in place)
df['name_upper'] = df['name'].str.upper()

In [86]:
# The new column `name_upper` is appended after the existing ones
df

Unnamed: 0,name,age,height,name_upper
A484,Anthony,28,1.53,ANTHONY
C012,Maria,31,1.76,MARIA
A123,Emma,26,1.83,EMMA
B663,Philip,41,1.81,PHILIP
A377,Bill,27,,BILL
