# Pandas

In [1]:
import pandas as pd

In [2]:
# Load only the two needed columns, index the rows by country, and collapse
# the resulting one-column DataFrame into a Series. Squeezing along
# "columns" only keeps the result a Series even if the file held a single
# row (a bare `squeeze()` would collapse that case to a scalar).
dataSeries = pd.read_csv(
    "https://andybek.com/pandas-drinks",
    usecols=['wine_servings', 'country'],
    index_col='country',
).squeeze("columns")
dataSeries.head(5)

country
Afghanistan      NaN
Albania         54.0
Algeria         14.0
Andorra        312.0
Angola          45.0
Name: wine_servings, dtype: float64

In [3]:
# Drop the countries that have no recorded wine_servings value.
data = dataSeries.dropna()
data.head()

country
Albania               54.0
Algeria               14.0
Andorra              312.0
Angola                45.0
Antigua & Barbuda     45.0
Name: wine_servings, dtype: float64

In [4]:
# Total wine servings over every country that has a recorded value.
data.sum()

8221.0

In [5]:
# Sanity check: False confirms the null filter left no missing values.
data.hasnans

False

# index by min/max
* idxmin(): 
    - returns the label of the row with the minimum value
* idxmax(): 
    - returns the label of the row with the maximum value

* [note]: if the min/max value occurs more than once, only the label of the first occurrence is returned.

In [6]:
# Highest wine_servings value in the series (the value only, not the country).
data.max()

339.0

In [7]:
# Boolean mask: keep every row whose value equals the maximum, index label included.
data[data == data.max()]

country
Portugal    339.0
Name: wine_servings, dtype: float64

In [8]:
# Index label (country) of the row holding the maximum value.
data.idxmax()

'Portugal'

In [9]:
# Look the maximum value back up by its label; `.loc` makes it explicit
# that this is label-based (not positional) selection.
data.loc[data.idxmax()]

339.0

In [10]:
# Lowest wine_servings value in the series (the value only, not the country).
data.min()

1.0

In [11]:
# Label of the first row holding the minimum; the cells below show that many
# countries tie at this value, but only one label is returned here.
data.idxmin()

'Brunei'

In [12]:
# Count how many rows share the minimum value.
data[data == data.min()].value_counts()

wine_servings
1.0    28
Name: count, dtype: int64

In [13]:
# List every country tied at the minimum value.
data[data == data.min()]

country
Brunei                      1.0
Cambodia                    1.0
Canada                      1.0
Central African Republic    1.0
Chad                        1.0
Comoros                     1.0
DR Congo                    1.0
Egypt                       1.0
Fiji                        1.0
Gambia                      1.0
Ghana                       1.0
Guyana                      1.0
Haiti                       1.0
Jordan                      1.0
Kiribati                    1.0
Malawi                      1.0
Mali                        1.0
Morocco                     1.0
Namibia                     1.0
Nicaragua                   1.0
Niger                       1.0
Oman                        1.0
Papua New Guinea            1.0
Philippines                 1.0
Solomon Islands             1.0
Thailand                    1.0
Tanzania                    1.0
Vietnam                     1.0
Name: wine_servings, dtype: float64

# sorting
* sort_values()

In [14]:
# Sort by value, largest first; the index labels travel with their values.
data.sort_values(ascending=False)

country
Portugal                    339.0
Andorra                     312.0
Denmark                     278.0
Slovenia                    276.0
Luxembourg                  271.0
                            ...  
Jordan                        1.0
Solomon Islands               1.0
Central African Republic      1.0
Haiti                         1.0
Chad                          1.0
Name: wine_servings, Length: 162, dtype: float64

In [15]:
# NOTE(review): identical to the preceding cell (same call, same output);
# consider deleting it or using it to show the default ascending order.
data.sort_values(ascending=False)

country
Portugal                    339.0
Andorra                     312.0
Denmark                     278.0
Slovenia                    276.0
Luxembourg                  271.0
                            ...  
Jordan                        1.0
Solomon Islands               1.0
Central African Republic      1.0
Haiti                         1.0
Chad                          1.0
Name: wine_servings, Length: 162, dtype: float64

# largest/smallest 
* nlargest()
* nsmallest()

In [16]:
# Top ten by hand: sort descending, then take the first ten rows by position.
# `.iloc` states the positional intent explicitly instead of relying on how a
# bare `[:10]` slice is interpreted on a label-indexed Series.
data.sort_values(ascending=False).iloc[:10]

country
Portugal             339.0
Andorra              312.0
Denmark              278.0
Slovenia             276.0
Luxembourg           271.0
Croatia              254.0
Italy                237.0
Equatorial Guinea    233.0
Argentina            221.0
Greece               218.0
Name: wine_servings, dtype: float64

In [17]:
# Same top ten as the manual sort-and-slice above, in a single call.
data.nlargest(10)

country
Portugal             339.0
Andorra              312.0
Denmark              278.0
Slovenia             276.0
Luxembourg           271.0
Croatia              254.0
Italy                237.0
Equatorial Guinea    233.0
Argentina            221.0
Greece               218.0
Name: wine_servings, dtype: float64

In [18]:
# The ten lowest values; here all ten are tied at the minimum.
data.nsmallest(10)

country
Brunei                      1.0
Cambodia                    1.0
Canada                      1.0
Central African Republic    1.0
Chad                        1.0
Comoros                     1.0
DR Congo                    1.0
Egypt                       1.0
Fiji                        1.0
Gambia                      1.0
Name: wine_servings, dtype: float64

# sort by index
* sort_index()

In [19]:
# Order the rows by index label (country name) rather than by value.
data.sort_index()

country
Albania               54.0
Algeria               14.0
Andorra              312.0
Angola                45.0
Antigua & Barbuda     45.0
                     ...  
Vanuatu               11.0
Venezuela              3.0
Vietnam                1.0
Zambia                 4.0
Zimbabwe               4.0
Name: wine_servings, Length: 162, dtype: float64

# fifty plus wine servings

In [31]:
# Keep the countries with strictly more than 50 wine servings.
fifty_plus = data[data > 50]
# Take the first five rows in index order, then sort only those five by value.
fifty_plus.head(5).sort_values()

country
Albania       54.0
Austria      191.0
Australia    212.0
Argentina    221.0
Andorra      312.0
Name: wine_servings, dtype: float64

In [24]:
# Number of countries above the 50-serving threshold.
fifty_plus.count()

48

In [33]:
# The 20 lowest values among the countries above 50 servings.
lowest_fifty = fifty_plus.nsmallest(20)
# NOTE(review): nsmallest appears to return values already in ascending
# order, so this extra sort may be redundant — confirm before removing.
lowest_fifty.sort_values()

country
Bahamas                51.0
Seychelles             51.0
Albania                54.0
Lithuania              56.0
Poland                 56.0
Estonia                59.0
Gabon                  59.0
Latvia                 62.0
St. Lucia              71.0
Russian Federation     73.0
Cook Islands           74.0
Paraguay               74.0
Iceland                78.0
South Africa           81.0
USA                    84.0
Macedonia              86.0
Bulgaria               94.0
Finland                97.0
Spain                 112.0
Cyprus                113.0
Name: wine_servings, dtype: float64

In [28]:
# Arithmetic mean of the 20 selected values.
lowest_fifty.mean()

74.25

In [29]:
# Median of the 20 selected values (average of the two middle ones).
lowest_fifty.median()

73.5

In [30]:
# Standard deviation of the 20 selected values.
lowest_fifty.std()

19.07292100831631

# Basic arithmetic
* add(fill_value=0)
* sub()
* divide()
* multiply()

In [36]:
# Small five-row sample used to demonstrate Series arithmetic.
new_data = data.head(5)
new_data

country
Albania               54.0
Algeria               14.0
Andorra              312.0
Angola                45.0
Antigua & Barbuda     45.0
Name: wine_servings, dtype: float64

In [43]:
# Second operand: a Series whose labels do not occur in new_data.
to_add = pd.Series({'test_country': 100, 'test_country2': 250})
to_add

test_country     100
test_country2    250
dtype: int64

In [41]:
# Plain `+` aligns the two Series on their index labels; a label present in
# only one operand has no partner, so its result is NaN.
# NOTE(review): the stored output lists `test_country` but not
# `test_country2`, and this cell's execution count (41) is lower than the
# `to_add` cell's (43) — the output looks stale; re-run top to bottom.
new_data + to_add

Albania             NaN
Algeria             NaN
Andorra             NaN
Angola              NaN
Antigua & Barbuda   NaN
test_country        NaN
dtype: float64

In [46]:
# add() with fill_value=0 treats a label missing from one operand as 0, so
# every label from both Series keeps its value instead of turning into NaN.
series_added = new_data.add(to_add, fill_value=0)
series_added

Albania               54.0
Algeria               14.0
Andorra              312.0
Angola                45.0
Antigua & Barbuda     45.0
test_country         100.0
test_country2        250.0
dtype: float64

In [47]:
# Plain `-` has no fill value: labels absent from to_add become NaN, while
# the two shared labels subtract to 0.
series_added - to_add

Albania              NaN
Algeria              NaN
Andorra              NaN
Angola               NaN
Antigua & Barbuda    NaN
test_country         0.0
test_country2        0.0
dtype: float64

In [51]:
# sub() with fill_value=0 subtracts 0 where to_add has no matching label,
# so the country rows keep their values.
series_added.sub(to_add, fill_value=0)

Albania               54.0
Algeria               14.0
Andorra              312.0
Angola                45.0
Antigua & Barbuda     45.0
test_country           0.0
test_country2          0.0
dtype: float64

In [53]:
# fill_value=1 is the neutral element for division: rows missing from to_add
# are divided by 1 and stay unchanged; shared labels give value / value = 1.
series_added.divide(to_add, fill_value=1)

Albania               54.0
Algeria               14.0
Andorra              312.0
Angola                45.0
Antigua & Barbuda     45.0
test_country           1.0
test_country2          1.0
dtype: float64

In [55]:
# fill_value=1 is the neutral element for multiplication: rows missing from
# to_add are multiplied by 1; shared labels are squared (100*100, 250*250).
series_added.multiply(to_add, fill_value=1)

Albania                 54.0
Algeria                 14.0
Andorra                312.0
Angola                  45.0
Antigua & Barbuda       45.0
test_country         10000.0
test_country2        62500.0
dtype: float64

# calculate variance and std

In [56]:
# Built-in variance, used as the reference for the manual computation below.
data.var()

5796.525611532858

In [60]:
# Numerator of the variance: the sum of squared deviations from the mean.
# (The variable keeps its existing spelling because a later cell reads it.)
deviations = data - data.mean()
nominatior = (deviations ** 2).sum()
nominatior

933240.6234567902

In [64]:
# Denominator of the variance: number of observations minus one.
denominator = data.count() - 1
denominator

161

In [65]:
# Variance = sum of squared deviations / (n - 1); the result matches the
# value data.var() printed earlier.
varience = nominatior / denominator
varience

5796.525611532858

# std
* square root of variance

In [68]:
# Built-in standard deviation, used as the reference for the manual check below.
data.std()

76.13491716376171

In [69]:
# Manual standard deviation: the square root of the variance computed above.
varience ** 0.5

76.13491716376171

# cumulative sum

In [74]:
# Ascending sort by value (the default order), smallest first.
data.sort_values()

country
Thailand             1.0
Solomon Islands      1.0
Brunei               1.0
Haiti                1.0
Cambodia             1.0
                   ...  
Luxembourg         271.0
Slovenia           276.0
Denmark            278.0
Andorra            312.0
Portugal           339.0
Name: wine_servings, Length: 162, dtype: float64

In [77]:
# Grand total, for comparison with the last entry of the cumulative sum.
data.sum()

8221.0

In [76]:
# Running total over the ascending-sorted values; the final entry equals data.sum().
data.sort_values().cumsum()

country
Thailand              1.0
Solomon Islands       2.0
Brunei                3.0
Haiti                 4.0
Cambodia              5.0
                    ...  
Luxembourg         7016.0
Slovenia           7292.0
Denmark            7570.0
Andorra            7882.0
Portugal           8221.0
Name: wine_servings, Length: 162, dtype: float64

# cumulative product

In [78]:
# Product of all values, for comparison with the last entry of the cumulative product.
data.prod()

3.427611505218281e+183

In [79]:
# Running product in the series' existing row order; the final entry equals data.prod().
data.cumprod()

country
Albania               5.400000e+01
Algeria               7.560000e+02
Andorra               2.358720e+05
Angola                1.061424e+07
Antigua & Barbuda     4.776408e+08
                         ...      
Vanuatu              7.140857e+181
Venezuela            2.142257e+182
Vietnam              2.142257e+182
Zambia               8.569029e+182
Zimbabwe             3.427612e+183
Name: wine_servings, Length: 162, dtype: float64

# cumulative min max

In [80]:
# Overall minimum, for comparison with the tail of the cumulative minimum.
data.min()

1.0

In [81]:
# Running minimum: each row holds the smallest value seen up to that row.
data.cummin()

country
Albania              54.0
Algeria              14.0
Andorra              14.0
Angola               14.0
Antigua & Barbuda    14.0
                     ... 
Vanuatu               1.0
Venezuela             1.0
Vietnam               1.0
Zambia                1.0
Zimbabwe              1.0
Name: wine_servings, Length: 162, dtype: float64

In [82]:
# Overall maximum, for comparison with the tail of the cumulative maximum.
data.max()

339.0

In [83]:
# Running maximum: each row holds the largest value seen up to that row.
data.cummax()

country
Albania               54.0
Algeria               54.0
Andorra              312.0
Angola               312.0
Antigua & Barbuda    312.0
                     ...  
Vanuatu              339.0
Venezuela            339.0
Vietnam              339.0
Zambia               339.0
Zimbabwe             339.0
Name: wine_servings, Length: 162, dtype: float64

# difference between one element and the previous element
* diff(periods=1)
* the first discrete element-wise difference in a series
* works with time series data
* a = v1, b = v2, c = v3, d = v4
    - periods=1:
        - v2 - v1, v3 - v2
    - periods=-1
        - v1 - v2, v2 - v3
    - periods=2
        - v3 - v1, v4 - v2

In [84]:
# First five raw values, shown so the differences below can be checked by hand.
data.head(5)

country
Albania               54.0
Algeria               14.0
Andorra              312.0
Angola                45.0
Antigua & Barbuda     45.0
Name: wine_servings, dtype: float64

In [85]:
# Each row minus the row before it; the first row has no predecessor, so it is NaN.
data.diff().head(5)

country
Albania                NaN
Algeria              -40.0
Andorra              298.0
Angola              -267.0
Antigua & Barbuda      0.0
Name: wine_servings, dtype: float64