# Pandas basics: Intro
Pandas is the most frequently used data library in Python.

We start with the two basic objects:
1. `pd.Series`
2. `pd.DataFrame`

In [2]:
# Import libraries
import pandas as pd
import numpy as np

In [3]:
# Let's define our first pd.Series
# The elements have mixed types (str, int, float), so pandas stores them with dtype `object`
ps = pd.Series(["a", 1, np.pi, 37])

In [4]:
ps

0           a
1           1
2    3.141593
3          37
dtype: object

In [6]:
type(ps)

pandas.core.series.Series

In [7]:
# Access values of pd.Series only
print(ps.values)

['a' 1 3.141592653589793 37]


In [8]:
# It's a numpy.ndarray
type(ps.values)

numpy.ndarray

In [9]:
ps.index

RangeIndex(start=0, stop=4, step=1)

In [12]:
# Slice the first two elements (the stop value 2 is excluded)
ps[0:2]

0    a
1    1
dtype: object

In [13]:
# Build a series whose index consists of string labels instead of integers.
# Passing a dict uses its keys as the index labels and its values as the data
# (insertion order is kept).
ex_series = pd.Series({
    "Appetizer": "Mozarella Cheese",
    "Main course": "Wiener Schnitzel",
    "Dessert": "Schwarzwalder Kirschtorte",
    "Beverage": "Lemonade",
    "Alcohol": "Whiskey",
})

In [14]:
ex_series

Appetizer               Mozarella Cheese
Main course             Wiener Schnitzel
Dessert        Schwarzwalder Kirschtorte
Beverage                        Lemonade
Alcohol                          Whiskey
dtype: object

### Using `loc[]` and `iloc[]` to access elements
1. `.loc[]`: access via index *values*
2. `.iloc[]`: access via index *positions*

In [15]:
# Locate via .loc[]: pass a list of index labels
ex_series.loc[["Appetizer", "Main course"]]

Appetizer      Mozarella Cheese
Main course    Wiener Schnitzel
dtype: object

In [18]:
# Locate via .iloc[]: pass a list of integer positions (0 is the first element)
ex_series.iloc[[0, 1]]

Appetizer      Mozarella Cheese
Main course    Wiener Schnitzel
dtype: object

In [21]:
# We can retrieve a range of data using loc[] with a label slice.
# Unlike positional slicing, the end label ("Dessert") is included in the result.
ex_series.loc["Appetizer":"Dessert"]

Appetizer               Mozarella Cheese
Main course             Wiener Schnitzel
Dessert        Schwarzwalder Kirschtorte
dtype: object

## DataFrames
A DataFrame is a two-dimensional, size-mutable, heterogeneous tabular data structure. Each DataFrame is essentially a collection of `pd.Series` that share the same index.

In [25]:
# Let's create our first DataFrame
# Start from a plain dict that maps city name -> population
# NOTE(review): the figures look like placeholders rather than real census data
dc_city_pop = dict(
    Tokyo=30_000,
    Delhi=50_000_000,
    Shanghai=10_000_000,
)

In [26]:
dc_city_pop

{'Tokyo': 30000, 'Delhi': 50000000, 'Shanghai': 10000000}

In [27]:
# Build a pd.Series from the dict: the keys become the index, the values the data
ps_city_pop = pd.Series(dc_city_pop)
ps_city_pop

Tokyo          30000
Delhi       50000000
Shanghai    10000000
dtype: int64

In [28]:
ps_city_pop.values

array([   30000, 50000000, 10000000], dtype=int64)

In [29]:
ps_city_pop.index

Index(['Tokyo', 'Delhi', 'Shanghai'], dtype='object')

In [30]:
# Map each city (key) to the country it is located in (value)
dc_city_countries = dict(Tokyo='Japan', Delhi='India', Shanghai='China')

# Turn the mapping into a pd.Series indexed by city
ps_city_countries = pd.Series(dc_city_countries)
ps_city_countries

Tokyo       Japan
Delhi       India
Shanghai    China
dtype: object

In [31]:
# Concatenate the two pd.Series side by side (axis=1) to create a DataFrame.
# Rows are matched on the city index; the columns get the default labels 0 and 1.
df_cities = pd.concat([ps_city_pop, ps_city_countries], axis=1)
df_cities

Unnamed: 0,0,1
Tokyo,30000,Japan
Delhi,50000000,India
Shanghai,10000000,China


In [32]:
df_cities.columns

RangeIndex(start=0, stop=2, step=1)

In [34]:
# Change column names to something more descriptive
df_cities.columns = ['population', 'country']
df_cities

Unnamed: 0,population,country
Tokyo,30000,Japan
Delhi,50000000,India
Shanghai,10000000,China


In [37]:
# Accessing rows/columns by position: first two rows (0 and 1) of column 0
df_cities.iloc[0:2, 0]

Tokyo       30000
Delhi    50000000
Name: population, dtype: int64

In [38]:
# Access a single element of the DataFrame
df_cities.iloc[1, 0]

50000000

In [39]:
# We can also use .loc with index labels; passing a list returns a DataFrame
df_cities.loc[["Shanghai"]]

Unnamed: 0,population,country
Shanghai,10000000,China


In [46]:
# Indexing based on logical conditions: keep the rows whose population exceeds
# 20 million and pick the "country" column in the same .loc call
df_cities.loc[df_cities["population"] > 20_000_000, "country"]

Delhi    India
Name: country, dtype: object

In [48]:
# Get all cities located in China or India
df_cities[df_cities.country.isin(["China", "India"])]

Unnamed: 0,population,country
Delhi,50000000,India
Shanghai,10000000,China


In [51]:
# Negate the boolean mask with ~ to get all cities NOT located in China or India
df_cities[~df_cities.country.isin(["China", "India"])]

Unnamed: 0,population,country
Tokyo,30000,Japan


In [52]:
df_cities

Unnamed: 0,population,country
Tokyo,30000,Japan
Delhi,50000000,India
Shanghai,10000000,China


In [55]:
# Create column from index and generate new index:
# the city labels move into a regular column named "index" and the rows get a
# fresh integer RangeIndex.
# NOTE(review): inplace=True mutates df_cities, so running this cell a second
# time would insert yet another index column.
df_cities.reset_index(drop=False, inplace=True)

In [56]:
df_cities

Unnamed: 0,index,population,country
0,Tokyo,30000,Japan
1,Delhi,50000000,India
2,Shanghai,10000000,China


In [59]:
# Give the former index column a descriptive name.
# reset_index() labelled that column "index"; rename() silently ignores keys
# that do not exist, so the key must match that label exactly or nothing is
# renamed (and the later `df_cities.city` access would fail on a fresh kernel).
# Assigning the result instead of using inplace=True keeps the cell safe to re-run.
df_cities = df_cities.rename(columns={'index': 'city'})
df_cities

Unnamed: 0,city,population,country
0,Tokyo,30000,Japan
1,Delhi,50000000,India
2,Shanghai,10000000,China


In [63]:
# Metadata
df_cities.shape

(3, 3)

In [64]:
df_cities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   city        3 non-null      object
 1   population  3 non-null      int64 
 2   country     3 non-null      object
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes


In [66]:
df_cities.city

0       Tokyo
1       Delhi
2    Shanghai
Name: city, dtype: object