In [1]:
import pandas as pd
import numpy as np

In [2]:
# Show the pandas version this notebook was executed with
pd.__version__

'1.1.2'

In [3]:
# Build DataFrame from list of dictionaries ==> List[Dict]
# Every dictionary is one row; its keys become the column names.
cars_list = [
    dict(country='United States', drives_right=True, cars_per_cap=809),
    dict(country='Australia', drives_right=False, cars_per_cap=731),
    dict(country='Japan', drives_right=False, cars_per_cap=588),
    dict(country='India', drives_right=False, cars_per_cap=18),
    dict(country='Russia', drives_right=True, cars_per_cap=200),
    dict(country='Morocco', drives_right=True, cars_per_cap=70),
    dict(country='Egypt', drives_right=True, cars_per_cap=45),
]
cars = pd.DataFrame(data=cars_list)
cars

Unnamed: 0,country,drives_right,cars_per_cap
0,United States,True,809
1,Australia,False,731
2,Japan,False,588
3,India,False,18
4,Russia,True,200
5,Morocco,True,70
6,Egypt,True,45


In [4]:
# We can add a column by broadcasting a single default value to every row
cars['tmp'] = 0 # alternatively, pass a list with one value per row
cars

Unnamed: 0,country,drives_right,cars_per_cap,tmp
0,United States,True,809,0
1,Australia,False,731,0
2,Japan,False,588,0
3,India,False,18,0
4,Russia,True,200,0
5,Morocco,True,70,0
6,Egypt,True,45,0


In [5]:
# Build cars DataFrame from dictionary of lists ==> Dict[str, list]
# Every key is a column name; its list holds that column's values from top to bottom.
cars_dict = dict(
    country=['United States', 'Australia', 'Japan', 'India', 'Russia', 'Morocco', 'Egypt'],
    drives_right=[True, False, False, False, True, True, True],
    cars_per_cap=[809, 731, 588, 18, 200, 70, 45],
)
cars = pd.DataFrame(data=cars_dict)
cars

Unnamed: 0,country,drives_right,cars_per_cap
0,United States,True,809
1,Australia,False,731
2,Japan,False,588
3,India,False,18
4,Russia,True,200
5,Morocco,True,70
6,Egypt,True,45


In [6]:
# Print a summary of the frame: index range, column dtypes, non-null counts and memory usage
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   country       7 non-null      object
 1   drives_right  7 non-null      bool  
 2   cars_per_cap  7 non-null      int64 
dtypes: bool(1), int64(1), object(1)
memory usage: 247.0+ bytes


In [7]:
# The underlying data as a NumPy array; dtype is object here because the columns hold mixed types
cars.values

array([['United States', True, 809],
       ['Australia', False, 731],
       ['Japan', False, 588],
       ['India', False, 18],
       ['Russia', True, 200],
       ['Morocco', True, 70],
       ['Egypt', True, 45]], dtype=object)

In [8]:
# Column labels
cars.columns

Index(['country', 'drives_right', 'cars_per_cap'], dtype='object')

In [9]:
# Row labels: a RangeIndex (0..6) because no index was specified when building the frame
cars.index

RangeIndex(start=0, stop=7, step=1)

In [10]:
# Row labels, one per row, in the same order as the rows of cars
row_labels = ['US', 'AUS', 'JPN', 'IN', 'RU', 'MOR', 'EG']

# Replace the default RangeIndex with the custom labels (modifies cars in place)
cars.index = row_labels
cars

Unnamed: 0,country,drives_right,cars_per_cap
US,United States,True,809
AUS,Australia,False,731
JPN,Japan,False,588
IN,India,False,18
RU,Russia,True,200
MOR,Morocco,True,70
EG,Egypt,True,45


In [11]:
# The index now holds the custom string labels
cars.index

Index(['US', 'AUS', 'JPN', 'IN', 'RU', 'MOR', 'EG'], dtype='object')

In [12]:
# Save the frame as CSV; the row labels are written as the first (unnamed) column.
# NOTE(review): assumes the ./datasets directory already exists — confirm.
cars.to_csv('./datasets/cars.csv')

In [13]:
# If you do not specify index_col when reading the CSV, the saved row labels come back
# as an ordinary unnamed column and a fresh 0..n-1 index is created
cars = pd.read_csv('./datasets/cars.csv')
cars

Unnamed: 0.1,Unnamed: 0,country,drives_right,cars_per_cap
0,US,United States,True,809
1,AUS,Australia,False,731
2,JPN,Japan,False,588
3,IN,India,False,18
4,RU,Russia,True,200
5,MOR,Morocco,True,70
6,EG,Egypt,True,45


In [14]:
# index_col=0 tells read_csv to use the first CSV column (the saved row labels) as the index
cars = pd.read_csv('./datasets/cars.csv', index_col=0)
cars

Unnamed: 0,country,drives_right,cars_per_cap
US,United States,True,809
AUS,Australia,False,731
JPN,Japan,False,588
IN,India,False,18
RU,Russia,True,200
MOR,Morocco,True,70
EG,Egypt,True,45


## loc and iloc

In [15]:
# Getting a single value: give .loc the row label and the column label in one call.
# This avoids chained indexing (cars['country']['EG']), which performs two separate
# lookups and is unsafe to use for assignment.
cars.loc['EG', 'country']

'Egypt'

### Column-wise selection

In [16]:
# With a single pair of square brackets you get a pandas Series.
# You also cannot select multiple columns that way (it raises KeyError),
# so pass a list of column names instead, i.e. use double square brackets.
print('Slice with one element', type(cars['country']))
try:
    print('Slice with two element', type(cars['country', 'cars_per_cap']))
except KeyError:
    print("You cannot do slicing like this: `cars['country', 'cars_per_cap']`")
# Double brackets always return a DataFrame, even for a single column
print(type(cars[['country']]))
print(type(cars[['country', 'cars_per_cap']]))
cars[['country', 'cars_per_cap']]

Slice with one element <class 'pandas.core.series.Series'>
You cannot do slicing like this: `cars['country', 'cars_per_cap']`
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,country,cars_per_cap
US,United States,809
AUS,Australia,731
JPN,Japan,588
IN,India,18
RU,Russia,200
MOR,Morocco,70
EG,Egypt,45


### loc (label-based slicing)

In [17]:
# Pass a list of row labels (hence the double square brackets) to get a DataFrame back
cars.loc[['US', 'AUS', 'IN']]

Unnamed: 0,country,drives_right,cars_per_cap
US,United States,True,809
AUS,Australia,False,731
IN,India,False,18


In [18]:
# With loc you can select rows and columns by label at the same time
cars.loc[['US', 'AUS', 'IN'], ['country', 'cars_per_cap']]

Unnamed: 0,country,cars_per_cap
US,United States,809
AUS,Australia,731
IN,India,18


In [19]:
# ':' keeps every row; the list picks the columns
cars.loc[:, ['country', 'cars_per_cap']]

Unnamed: 0,country,cars_per_cap
US,United States,809
AUS,Australia,731
JPN,Japan,588
IN,India,18
RU,Russia,200
MOR,Morocco,70
EG,Egypt,45


### iloc (integer position-based slicing)

In [20]:
# iloc is purely position-based: it takes integers, lists of integers or integer slices.
# cars.iloc[[1], ['country', 'cars_per_cap']] # ==> not gonna work, because iloc accepts only integer positions, not labels
# cars.iloc[[1:3], :] # ==> not gonna work: a slice cannot be written inside a list (SyntaxError)
# Use a plain slice instead; the end position is exclusive, so 1:4 selects rows 1, 2 and 3.
cars.iloc[1:4, :]

Unnamed: 0,country,drives_right,cars_per_cap
AUS,Australia,False,731
JPN,Japan,False,588
IN,India,False,18


In [21]:
# Rows at positions 0-2 (the end of the slice is exclusive) and columns at positions 1 and 2
cars.iloc[0:3, [1, 2]]

Unnamed: 0,drives_right,cars_per_cap
US,True,809
AUS,False,731
JPN,False,588


## Logical slicing

In [22]:
# A boolean column can be used directly as a row mask: only rows where it is True are kept
cars[cars['drives_right']]

Unnamed: 0,country,drives_right,cars_per_cap
US,United States,True,809
RU,Russia,True,200
MOR,Morocco,True,70
EG,Egypt,True,45


In [23]:
# The comparison yields a boolean Series, which is then used as the row mask
cars[cars['cars_per_cap'] > 800]

Unnamed: 0,country,drives_right,cars_per_cap
US,United States,True,809


In [24]:
# Create medium: observations with cars_per_cap strictly between 100 and 500
cpc = cars['cars_per_cap']
# Element-wise AND of the two boolean Series (both bounds are exclusive)
between = np.logical_and(cpc > 100, cpc < 500)
medium = cars[between]

# Display medium
medium

Unnamed: 0,country,drives_right,cars_per_cap
RU,Russia,True,200


## Looping over DataFrame

In [25]:
# Iterating over a DataFrame directly yields its column names
for el in cars:
    print(el)

country
drives_right
cars_per_cap


In [26]:
# Iterate through rows. iterrows yields (label, row) pairs: label is the row's index
# value (a string in this case) and row is a pandas Series holding that row's values
for label, row in cars.iterrows():
    print(label)
    print(row, type(row), sep=' ')

US
country         United States
drives_right             True
cars_per_cap              809
Name: US, dtype: object <class 'pandas.core.series.Series'>
AUS
country         Australia
drives_right        False
cars_per_cap          731
Name: AUS, dtype: object <class 'pandas.core.series.Series'>
JPN
country         Japan
drives_right    False
cars_per_cap      588
Name: JPN, dtype: object <class 'pandas.core.series.Series'>
IN
country         India
drives_right    False
cars_per_cap       18
Name: IN, dtype: object <class 'pandas.core.series.Series'>
RU
country         Russia
drives_right      True
cars_per_cap       200
Name: RU, dtype: object <class 'pandas.core.series.Series'>
MOR
country         Morocco
drives_right       True
cars_per_cap         70
Name: MOR, dtype: object <class 'pandas.core.series.Series'>
EG
country         Egypt
drives_right     True
cars_per_cap       45
Name: EG, dtype: object <class 'pandas.core.series.Series'>


In [27]:
# Adding a new column to a DataFrame one row at a time. Inefficient way:
# every iteration performs a separate .loc assignment.
# NOTE: the values show up as floats (13.0, 9.0, ...) in the output below, presumably
# because the new column starts out filled with NaN after the first assignment.
for lab, row in cars.iterrows():
    cars.loc[lab, 'name_length'] = len(row['country'])
#     cars.loc[[lab], ['name_length']] = len(row['country']) # - works as well
cars

Unnamed: 0,country,drives_right,cars_per_cap,name_length
US,United States,True,809,13.0
AUS,Australia,False,731,9.0
JPN,Japan,False,588,5.0
IN,India,False,18,5.0
RU,Russia,True,200,6.0
MOR,Morocco,True,70,7.0
EG,Egypt,True,45,5.0


In [28]:
# You'd better compute the whole column at once instead of looping over rows.
# `.apply(func)` works for any function, but for strings the vectorised `.str`
# accessor already provides the same operations (`.str.len()`, `.str.upper()`).
cars['name_length'] = cars['country'].str.len()
cars['COUNTRY'] = cars['country'].str.upper()
cars

Unnamed: 0,country,drives_right,cars_per_cap,name_length,COUNTRY
US,United States,True,809,13,UNITED STATES
AUS,Australia,False,731,9,AUSTRALIA
JPN,Japan,False,588,5,JAPAN
IN,India,False,18,5,INDIA
RU,Russia,True,200,6,RUSSIA
MOR,Morocco,True,70,7,MOROCCO
EG,Egypt,True,45,5,EGYPT


## Transforming Data

In [29]:
# Load the homelessness dataset from a pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
homeless_data = pd.read_pickle('./datasets/homeless_data.pkl')
homeless_data.head()

Unnamed: 0,region,state,individuals,family_members,state_pop
0,East South Central,Alabama,2570.0,864.0,4887681
1,Pacific,Alaska,1434.0,582.0,735139
2,Mountain,Arizona,7259.0,2606.0,7158024
3,West South Central,Arkansas,2280.0,432.0,3009733
4,Pacific,California,109008.0,20964.0,39461588


In [30]:
# Sort by population (largest first); ties are broken by state name in ascending order
homeless_data.sort_values(['state_pop', 'state'], ascending=[False, True]).head()

Unnamed: 0,region,state,individuals,family_members,state_pop
4,Pacific,California,109008.0,20964.0,39461588
43,West South Central,Texas,19199.0,6111.0,28628666
9,South Atlantic,Florida,21443.0,9587.0,21244317
32,Mid-Atlantic,New York,39827.0,52070.0,19530351
38,Mid-Atlantic,Pennsylvania,8163.0,5349.0,12800922


Instead of NumPy's logical functions (e.g. `np.logical_and`), you can also combine conditions with the `|` (or) and `&` (and) operators. But don't forget to put parentheses around each condition!

In [31]:
# Each comparison is wrapped in parentheses because | binds more tightly than ==
homeless_data[(homeless_data['state'] == 'California') | (homeless_data['region'] == 'South Atlantic')]

Unnamed: 0,region,state,individuals,family_members,state_pop
4,Pacific,California,109008.0,20964.0,39461588
7,South Atlantic,Delaware,708.0,374.0,965479
8,South Atlantic,District of Columbia,3770.0,3134.0,701547
9,South Atlantic,Florida,21443.0,9587.0,21244317
10,South Atlantic,Georgia,6943.0,2556.0,10511131
20,South Atlantic,Maryland,4914.0,2230.0,6035802
33,South Atlantic,North Carolina,6451.0,2817.0,10381615
40,South Atlantic,South Carolina,3082.0,851.0,5084156
46,South Atlantic,Virginia,3928.0,2047.0,8501286
48,South Atlantic,West Virginia,1021.0,222.0,1804291


pandas also provides the `isin` method for keeping rows whose value is in a given list.

In [32]:
# Keep the rows whose state is any of the listed values
homeless_data[homeless_data['state'].isin(['New York', 'Florida'])]

Unnamed: 0,region,state,individuals,family_members,state_pop
9,South Atlantic,Florida,21443.0,9587.0,21244317
32,Mid-Atlantic,New York,39827.0,52070.0,19530351


In [33]:
# Element-wise string concatenation of two columns; the ': ' separator is applied to every row
homeless_data['tmp'] = homeless_data['region'] + ': ' + homeless_data['state']
homeless_data.head()

Unnamed: 0,region,state,individuals,family_members,state_pop,tmp
0,East South Central,Alabama,2570.0,864.0,4887681,East South Central: Alabama
1,Pacific,Alaska,1434.0,582.0,735139,Pacific: Alaska
2,Mountain,Arizona,7259.0,2606.0,7158024,Mountain: Arizona
3,West South Central,Arkansas,2280.0,432.0,3009733,West South Central: Arkansas
4,Pacific,California,109008.0,20964.0,39461588,Pacific: California


## Data aggregation

In [34]:
# Load the Walmart weekly sales dataset from a pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
walmart_sales = pd.read_pickle('./datasets/walmart_sales.pkl')
walmart_sales

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.50,False,5.727778,0.679451,8.106
1,1,A,2,2010-02-05,50605.27,False,5.727778,0.679451,8.106
2,1,A,3,2010-02-05,13740.12,False,5.727778,0.679451,8.106
3,1,A,4,2010-02-05,39954.04,False,5.727778,0.679451,8.106
4,1,A,5,2010-02-05,32229.38,False,5.727778,0.679451,8.106
...,...,...,...,...,...,...,...,...,...
413114,45,B,4,2012-10-26,24627.94,False,14.916667,1.025516,8.667
413115,45,B,5,2012-10-26,13256.59,False,14.916667,1.025516,8.667
413116,45,B,6,2012-10-26,1086.31,False,14.916667,1.025516,8.667
413117,45,B,7,2012-10-26,20356.73,False,14.916667,1.025516,8.667


In [35]:
# Date range covered by the dataset: earliest and latest date
print(walmart_sales['date'].min())
print(walmart_sales['date'].max())

2010-02-05 00:00:00
2012-10-26 00:00:00


You can aggregate data with a callback

In [36]:
# inter-quartile range
def iqr(column: pd.Series) -> float:
    """Return the inter-quartile range (Q3 - Q1) of a numeric Series.

    Parameters
    ----------
    column : pd.Series
        The values to summarise.

    Returns
    -------
    float
        The 0.75 quantile minus the 0.25 quantile. The result is a single
        scalar (not a Series), which is what allows this function to be
        passed to ``.agg`` / ``pivot_table`` as an aggregation callback.
    """
    return column.quantile(0.75) - column.quantile(0.25)

In [37]:
# Summarise each column with its median and inter-quartile range.
# Built-in reductions are passed by name ("median"), which newer pandas versions
# prefer over NumPy callables such as np.median; custom reductions (iqr) are
# passed as functions.
walmart_sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg(["median", iqr])

Unnamed: 0,temperature_c,fuel_price_usd_per_l,unemployment
median,16.75,0.911922,7.852
iqr,15.3,0.211866,1.672


In [38]:
# Order the sales chronologically, then attach two running statistics of
# weekly_sales: the running total (cum_weekly_sales) and the running maximum
# (cum_max_sales).
sales_sorted = walmart_sales.sort_values('date')
sales_sorted = sales_sorted.assign(
    cum_weekly_sales=sales_sorted['weekly_sales'].cumsum(),
    cum_max_sales=sales_sorted['weekly_sales'].cummax(),
)

# Show only the columns involved in the calculation
sales_sorted.loc[:, ['date', 'weekly_sales', 'cum_weekly_sales', 'cum_max_sales']]

Unnamed: 0,date,weekly_sales,cum_weekly_sales,cum_max_sales
0,2010-02-05,24924.50,2.492450e+04,24924.50
10287,2010-02-05,2357.00,2.728150e+04,24924.50
10286,2010-02-05,30328.88,5.761038e+04,30328.88
10285,2010-02-05,27.50,5.763788e+04,30328.88
10284,2010-02-05,8788.47,6.642635e+04,30328.88
...,...,...,...,...
157399,2012-10-26,2167.70,6.648959e+09,693099.36
157400,2012-10-26,7285.14,6.648966e+09,693099.36
157401,2012-10-26,15279.13,6.648981e+09,693099.36
157403,2012-10-26,18290.34,6.649000e+09,693099.36


In [39]:
# Keep one row for every unique (type, department) combination
type_and_department_unique = walmart_sales.drop_duplicates(subset=['type', 'department'])
type_and_department_unique

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.50,False,5.727778,0.679451,8.106
1,1,A,2,2010-02-05,50605.27,False,5.727778,0.679451,8.106
2,1,A,3,2010-02-05,13740.12,False,5.727778,0.679451,8.106
3,1,A,4,2010-02-05,39954.04,False,5.727778,0.679451,8.106
4,1,A,5,2010-02-05,32229.38,False,5.727778,0.679451,8.106
...,...,...,...,...,...,...,...,...,...
289586,30,C,33,2011-04-08,12.00,False,20.344444,0.956832,7.931
291874,30,C,19,2012-02-17,2.38,False,7.238889,0.927244,7.057
320587,34,A,65,2010-02-05,41057.25,False,1.911111,0.686319,9.521
348760,37,C,71,2010-12-24,39.00,False,15.055556,0.762401,8.476


In [40]:
# Each row is a unique (type, department) pair, so this counts the distinct departments per store type
type_and_department_unique['type'].value_counts(sort=True)

A    81
B    80
C    66
Name: type, dtype: int64

In [41]:
# Returns the share of each type among the unique (type, department) pairs.
# For instance, type A accounts for ~36% of them.
type_and_department_unique['type'].value_counts(normalize=True)

A    0.356828
B    0.352423
C    0.290749
Name: type, dtype: float64

In [42]:
# Mean temperature for every (type, department) group
walmart_sales.groupby(['type', 'department'])['temperature_c'].mean()

type  department
A     1             16.127015
      2             16.127015
      3             16.127015
      4             16.127015
      5             16.063817
                      ...    
C     95            19.730258
      96            19.730258
      97            19.730258
      98            19.730258
      99            19.789153
Name: temperature_c, Length: 227, dtype: float64

In [43]:
# Several aggregations per (type, department) group: the custom iqr callback and the
# built-in mean. The built-in is passed by name ('mean'), which newer pandas versions
# prefer over the NumPy callable np.mean; the resulting column label is still 'mean'.
walmart_sales.groupby(['type', 'department'])['temperature_c'].agg([iqr, 'mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,iqr,mean
type,department,Unnamed: 2_level_1,Unnamed: 3_level_1
A,1,15.258333,16.127015
A,2,15.258333,16.127015
A,3,15.258333,16.127015
A,4,15.258333,16.127015
A,5,15.247222,16.063817
...,...,...,...
C,95,15.120833,19.730258
C,96,15.120833,19.730258
C,97,15.120833,19.730258
C,98,15.120833,19.730258


Although the `groupby` method is useful, you can also aggregate data by group without it.

In [44]:
# Total weekly sales over the whole dataset (.sum() is the direct form of .agg(sum))
sales_all = walmart_sales['weekly_sales'].sum()
print('sales_all', sales_all)

sales_all 6649037445.509999


In [45]:
# Total weekly sales per store type, computed manually: select the rows of one type
# and the weekly_sales column in a single .loc call, then sum them.
sales_A = walmart_sales.loc[walmart_sales['type'] == 'A', 'weekly_sales'].sum()
sales_B = walmart_sales.loc[walmart_sales['type'] == 'B', 'weekly_sales'].sum()
sales_C = walmart_sales.loc[walmart_sales['type'] == 'C', 'weekly_sales'].sum()
print('sales_A', sales_A)
print('sales_B', sales_B)
print('sales_C', sales_C)

sales_A 4331014722.749999
sales_B 1912519195.2199998
sales_C 405503527.53999996


In [46]:
# Proportion of total sales contributed by each store type (A, B, C).
# A plain Python list cannot be divided by a number; build the NumPy array explicitly
# so the element-wise division does not depend on sales_all being a NumPy scalar.
sales_propn_by_type = np.array([sales_A, sales_B, sales_C]) / sales_all
print('sales_propn_by_type', sales_propn_by_type)

sales_propn_by_type [0.65137469 0.28763851 0.0609868 ]


The same thing but with groupby

In [47]:
# Per-type totals divided by the grand total: the same proportions as above in one expression
print('sales_propn_by_type', 
      walmart_sales.groupby('type')['weekly_sales'].sum() / walmart_sales['weekly_sales'].sum())

sales_propn_by_type type
A    0.651375
B    0.287639
C    0.060987
Name: weekly_sales, dtype: float64


### Pivot table

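* values - column(s) on which to perform the aggregation functions (becomes the MIDDLE level of the column index),
* index - column(s) whose values should be used as the row index,
* columns - column(s) whose values should be used to split the aggregated values into separate columns (becomes THE LAST level of the column index),
* aggfunc - aggregation function, or list of functions / function names (becomes the TOP LEVEL of the column index),
* fill_value - specifies the default value to use when there is a missing value,
* margins - set to True if you want an `All` row at the bottom (and an `All` column when `columns` is given),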

In [48]:
# Mean, median and inter-quartile range of weekly_sales per store type, plus an
# "All" row (margins=True). Built-in reductions are passed by name, which newer
# pandas versions prefer over NumPy callables; the custom iqr is passed as a function.
walmart_sales.pivot_table(values=['weekly_sales'], 
                          index=['type'], 
                          columns=None, 
                          aggfunc=['mean', 'median', iqr], 
                          fill_value=0, 
                          margins=True)

Unnamed: 0_level_0,mean,median,iqr
Unnamed: 0_level_1,weekly_sales,weekly_sales,weekly_sales
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A,20099.568043,10105.17,23042.09
B,12335.331875,6269.02,13615.7525
C,9519.532538,1149.67,12563.02
All,16094.726811,7682.47,18290.955
