# Pandas basics

- `pd.Series` and `pd.DataFrame`

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build a first Series from a plain list of mixed-type values.
# No index is supplied, so pandas numbers the items 0..3 by default.
mixed_values = ["a", 2, np.pi, 36]
ex_series = pd.Series(mixed_values)
print(ex_series)

0           a
1           2
2    3.141593
3          36
dtype: object


In [17]:
# `.values` exposes the underlying data of the Series; here we check its type
# rather than printing the values themselves
type(ex_series.values)

numpy.ndarray

In [4]:
# Retrieve the index (no labels were given, so pandas created a default RangeIndex)
print(ex_series.index)

RangeIndex(start=0, stop=4, step=1)


In [8]:
# Slicing syntax selects a run of items: 0:2 gives the first two (stop is excluded)
print(ex_series[0:2])

0    a
1    2
dtype: object


There are two other syntax options:
- `.loc[]`: access via index *values*
- `.iloc[]`: access via index *positions*

In [11]:
# A new Series with string labels instead of the default integer index, so
# that label-based access (.loc) and position-based access (.iloc) can be
# told apart.
course_labels = ["appetizer", "main course", "dessert", "beverage", "alcohol"]
menu_items = [
    "Mozzarella Caprese",
    "Wiener Schnitzel",
    "Schwarzwalder Kirschtorte",
    "Lemonade",
    "Whiskey",
]
ex_series = pd.Series(data=menu_items, index=course_labels)

In [12]:
# print() shows the plain-text representation, including the custom labels
print(ex_series)

appetizer             Mozzarella Caprese
main course             Wiener Schnitzel
dessert        Schwarzwalder Kirschtorte
beverage                        Lemonade
alcohol                          Whiskey
dtype: object


In [18]:
# A bare expression as the last line of a cell is displayed without print()
ex_series

appetizer             Mozzarella Caprese
main course             Wiener Schnitzel
dessert        Schwarzwalder Kirschtorte
beverage                        Lemonade
alcohol                          Whiskey
dtype: object

In [19]:
# .loc selects by index label; passing a list of labels returns a Series
ex_series.loc[["appetizer", "dessert"]]

appetizer           Mozzarella Caprese
dessert      Schwarzwalder Kirschtorte
dtype: object

In [22]:
# .iloc selects by 0-based position: positions 1 and 2 are the 2nd and 3rd items
ex_series.iloc[[1, 2]]

main course             Wiener Schnitzel
dessert        Schwarzwalder Kirschtorte
dtype: object

In [23]:
# Positional slice (stop excluded): 1:3 covers positions 1 and 2, same as [1, 2]
ex_series.iloc[1:3]

main course             Wiener Schnitzel
dessert        Schwarzwalder Kirschtorte
dtype: object

### DataFrame
DataFrames are 'two-dimensional, size-mutable, potentially heterogeneous tabular data'. Each DataFrame is essentially a collection of pandas Series.

There are multiple ways to create pandas DataFrames.

In [24]:
# First, a dict of raw data: city name -> population.
# Underscores are digit separators only; the integer values are unchanged.
dc_city_pop = {
    "Tokyo": 37_339_804,
    "Delhi": 31_181_376,
    "Shanghai": 27_795_702,
    "Sao Paulo": 22_237_472,
    "Mexico City": 21_918_936,
    "Dhaka": 21_741_090,
    "Cairo": 21_322_750,
    "Beijing": 20_896_820,
    "Mumbai": 20_667_656,
    "Osaka": 19_110_616,
}

In [26]:
# Convert the dict to a pd.Series: keys become the index, values become the data
ps_city_pop = pd.Series(dc_city_pop)
ps_city_pop

Tokyo          37339804
Delhi          31181376
Shanghai       27795702
Sao Paulo      22237472
Mexico City    21918936
Dhaka          21741090
Cairo          21322750
Beijing        20896820
Mumbai         20667656
Osaka          19110616
dtype: int64

In [27]:
# The two parts of a Series: its index (the labels) and its values (the data)
print(ps_city_pop.index)
print(ps_city_pop.values)

Index(['Tokyo', 'Delhi', 'Shanghai', 'Sao Paulo', 'Mexico City', 'Dhaka',
       'Cairo', 'Beijing', 'Mumbai', 'Osaka'],
      dtype='object')
[37339804 31181376 27795702 22237472 21918936 21741090 21322750 20896820
 20667656 19110616]


In [28]:
# Second data dict: city -> country correspondence.
# It lists the same ten cities as the population dict; note that several
# cities share a country (Japan, India and China each appear twice).
dc_city_countries = {
    "Tokyo": "Japan",
    "Delhi": "India",
    "Shanghai": "China",
    "Sao Paulo": "Brazil",
    "Mexico City": "Mexico",
    "Dhaka": "Bangladesh",
    "Cairo": "Egypt",
    "Beijing": "China",
    "Mumbai": "India",
    "Osaka": "Japan",
}

In [29]:
# Same conversion for the country mapping: city names become the index
ps_city_countries = pd.Series(dc_city_countries)
ps_city_countries

Tokyo               Japan
Delhi               India
Shanghai            China
Sao Paulo          Brazil
Mexico City        Mexico
Dhaka          Bangladesh
Cairo               Egypt
Beijing             China
Mumbai              India
Osaka               Japan
dtype: object

In [30]:
# Create our first DataFrame by concatenating two pd.Series.
# axis=1 places the Series side by side as columns, aligning the rows on their
# index (the city names). The Series are unnamed, so the resulting columns get
# the default labels 0 and 1.
df_cities = pd.concat([ps_city_pop, ps_city_countries], axis=1)
df_cities

Unnamed: 0,0,1
Tokyo,37339804,Japan
Delhi,31181376,India
Shanghai,27795702,China
Sao Paulo,22237472,Brazil
Mexico City,21918936,Mexico
Dhaka,21741090,Bangladesh
Cairo,21322750,Egypt
Beijing,20896820,China
Mumbai,20667656,India
Osaka,19110616,Japan


In [33]:
# The columns don't have proper names yet; inspect the current (default) labels
df_cities.columns

RangeIndex(start=0, stop=2, step=1)

In [35]:
# Assign the names positionally: one label per existing column, in order
df_cities.columns = ["population", "country"]
df_cities

Unnamed: 0,population,country
Tokyo,37339804,Japan
Delhi,31181376,India
Shanghai,27795702,China
Sao Paulo,22237472,Brazil
Mexico City,21918936,Mexico
Dhaka,21741090,Bangladesh
Cairo,21322750,Egypt
Beijing,20896820,China
Mumbai,20667656,India
Osaka,19110616,Japan


In [37]:
# Accessing rows/columns via .iloc[] (position-based).
# Note that our data is now two-dimensional: rows first, then columns.
df_cities.iloc[2:5]  # rows at positions 2, 3 and 4; all columns

Unnamed: 0,population,country
Shanghai,27795702,China
Sao Paulo,22237472,Brazil
Mexico City,21918936,Mexico


In [42]:
# Rows and a column: positions 2-4 of the column at position 1 ("country").
# Selecting a single column returns a Series rather than a DataFrame.
df_cities.iloc[2:5, 1]

Shanghai        China
Sao Paulo      Brazil
Mexico City    Mexico
Name: country, dtype: object

In [43]:
# Using .loc (label-based).
# A list of cities: note the double square brackets, i.e. a list inside .loc[]
df_cities.loc[["Shanghai", "Dhaka", "Osaka"]]

Unnamed: 0,population,country
Shanghai,27795702,China
Dhaka,21741090,Bangladesh
Osaka,19110616,Japan


In [44]:
# A list of cities plus a single column label: the result is a Series
df_cities.loc[["Shanghai", "Dhaka", "Osaka"], "country"]

Shanghai         China
Dhaka       Bangladesh
Osaka            Japan
Name: country, dtype: object

In [46]:
# A range of cities from the index. Unlike positional slices, label slices
# with .loc include BOTH endpoints ("Sao Paulo" is part of the result).
df_cities.loc["Tokyo":"Sao Paulo"]

Unnamed: 0,population,country
Tokyo,37339804,Japan
Delhi,31181376,India
Shanghai,27795702,China
Sao Paulo,22237472,Brazil


### Slicing based on condition

In [47]:
# Boolean mask: keep only the rows where the condition is True.
# For human readability you can use underscores as digit separators in numbers.
df_cities[df_cities.population > 30_000_000]

Unnamed: 0,population,country
Tokyo,37339804,Japan
Delhi,31181376,India


In [48]:
# Boolean indexing with .isin(): keep rows whose country is in the given list
df_cities[df_cities.country.isin(["Japan", "India", "Brazil"])]

Unnamed: 0,population,country
Tokyo,37339804,Japan
Delhi,31181376,India
Sao Paulo,22237472,Brazil
Mumbai,20667656,India
Osaka,19110616,Japan


In [53]:
# The tilde (~) negates the Boolean mask: keep rows whose country is NOT in the list
df_cities[
    ~df_cities.country.isin(["Japan", "India", "Brazil"])
]

Unnamed: 0,population,country
Shanghai,27795702,China
Mexico City,21918936,Mexico
Dhaka,21741090,Bangladesh
Cairo,21322750,Egypt
Beijing,20896820,China


In [56]:
# The index holds the city names, which are useful as data: move them into a
# regular column. Reassigning the result (rather than using inplace=True)
# avoids silently mutating the frame that earlier cells displayed.
df_cities = df_cities.reset_index(drop=False)
df_cities  # the new column gets the default name "index" - not a good column name

Unnamed: 0,index,population,country
0,Tokyo,37339804,Japan
1,Delhi,31181376,India
2,Shanghai,27795702,China
3,Sao Paulo,22237472,Brazil
4,Mexico City,21918936,Mexico
5,Dhaka,21741090,Bangladesh
6,Cairo,21322750,Egypt
7,Beijing,20896820,China
8,Mumbai,20667656,India
9,Osaka,19110616,Japan


You can rename specific columns using a dictionary with the old name as key and the new name as value.

In [57]:
# Rename the "index" column to "city" via an {old name: new name} mapping.
# Reassigning the result (rather than using inplace=True) avoids silently
# mutating the frame that earlier cells displayed.
df_cities = df_cities.rename(columns={"index": "city"})
df_cities

Unnamed: 0,city,population,country
0,Tokyo,37339804,Japan
1,Delhi,31181376,India
2,Shanghai,27795702,China
3,Sao Paulo,22237472,Brazil
4,Mexico City,21918936,Mexico
5,Dhaka,21741090,Bangladesh
6,Cairo,21322750,Egypt
7,Beijing,20896820,China
8,Mumbai,20667656,India
9,Osaka,19110616,Japan


In [58]:
# Raw rows for a second DataFrame: one inner list per row, in the order
# city, population, country.
data = [
    ["Tokyo", 37339804, "Japan"],
    ["Delhi", 31181376, "India"],
    ["Shanghai", 27795702, "China"],
]

In [59]:
# Inspect the nested list: one inner list per row
data

[['Tokyo', 37339804, 'Japan'],
 ['Delhi', 31181376, 'India'],
 ['Shanghai', 27795702, 'China']]

In [60]:
# Build a DataFrame directly from the list of rows, naming the columns explicitly
df_cities_ = pd.DataFrame(data=data, columns=["city", "population", "country"])
df_cities_

Unnamed: 0,city,population,country
0,Tokyo,37339804,Japan
1,Delhi,31181376,India
2,Shanghai,27795702,China


In [62]:
# Reorder columns by selecting them in the desired order
df_cities_ = df_cities_[["city", "country", "population"]]
df_cities_

Unnamed: 0,city,country,population
0,Tokyo,Japan,37339804
1,Delhi,India,31181376
2,Shanghai,China,27795702


In [63]:
# Metadata: index range, column names, non-null counts, dtypes and memory usage
df_cities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   city        10 non-null     object
 1   population  10 non-null     int64 
 2   country     10 non-null     object
dtypes: int64(1), object(2)
memory usage: 368.0+ bytes


In [64]:
# Shape as a tuple: (number of rows, number of columns)
df_cities.shape

(10, 3)

In [65]:
# First element of the shape tuple: the number of rows
df_cities.shape[0]

10