# Chapter 3 : pandas

# pandas DataFrames

In [1]:
import numpy as np
import scipy as sp
import pandas as pd

## Load the data file into data frame

In [2]:
from pandas.io.parsers import read_csv

df = read_csv("WHO_first9cols.csv")
print("Dataframe:\n", df)

Dataframe:
                       Country  CountryID  Continent  \
0                 Afghanistan          1          1   
1                     Albania          2          2   
2                     Algeria          3          3   
3                     Andorra          4          2   
4                      Angola          5          3   
5         Antigua and Barbuda          6          4   
6                   Argentina          7          5   
7                     Armenia          8          2   
8                   Australia          9          6   
9                     Austria         10          2   
10                 Azerbaijan         11          2   
11                    Bahamas         12          4   
12                    Bahrain         13          1   
13                 Bangladesh         14          7   
14                   Barbados         15          4   
15                    Belarus         16          2   
16                    Belgium         17          2  

In [3]:
print("Shape:\n", df.shape)
print("\n")
print("Length:\n", len(df))
print("\n")
print("Column Headers:\n", df.columns)
print("\n")
print("Data types:\n", df.dtypes)
print("\n")
print("Index:\n", df.index)
print("\n")
print("Values:\n", df.values)

Shape:
 (202, 9)


Length:
 202


Column Headers:
 Index(['Country', 'CountryID', 'Continent', 'Adolescent fertility rate (%)',
       'Adult literacy rate (%)',
       'Gross national income per capita (PPP international $)',
       'Net primary school enrolment ratio female (%)',
       'Net primary school enrolment ratio male (%)',
       'Population (in thousands) total'],
      dtype='object')


Data types:
 Country                                                    object
CountryID                                                   int64
Continent                                                   int64
Adolescent fertility rate (%)                             float64
Adult literacy rate (%)                                   float64
Gross national income per capita (PPP international $)    float64
Net primary school enrolment ratio female (%)             float64
Net primary school enrolment ratio male (%)               float64
Population (in thousands) total                        

# pandas Series

In [4]:
country_col = df["Country"]
print("Type df:\n", type(df), "\n")
print("Type country col:\n", type(country_col), "\n")

Type df:
 <class 'pandas.core.frame.DataFrame'> 

Type country col:
 <class 'pandas.core.series.Series'> 



In [5]:
print("Series shape:\n", country_col.shape, "\n")
print("Series index:\n", country_col.index, "\n")
print("Series values:\n", country_col.values, "\n")
print("Series name:\n", country_col.name, "\n")

Series shape:
 (202,) 

Series index:
 RangeIndex(start=0, stop=202, step=1) 

Series values:
 ['Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Australia' 'Austria'
 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus'
 'Belgium' 'Belize' 'Benin' 'Bermuda' 'Bhutan' 'Bolivia'
 'Bosnia and Herzegovina' 'Botswana' 'Brazil' 'Brunei Darussalam'
 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cambodia' 'Cameroon' 'Canada'
 'Cape Verde' 'Central African Republic' 'Chad' 'Chile' 'China' 'Colombia'
 'Comoros' 'Congo, Dem. Rep.' 'Congo, Rep.' 'Cook Islands' 'Costa Rica'
 "Cote d'Ivoire" 'Croatia' 'Cuba' 'Cyprus' 'Czech Republic' 'Denmark'
 'Djibouti' 'Dominica' 'Dominican Republic' 'Ecuador' 'Egypt'
 'El Salvador' 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Ethiopia' 'Fiji'
 'Finland' 'France' 'French Polynesia' 'Gabon' 'Gambia' 'Georgia'
 'Germany' 'Ghana' 'Greece' 'Grenada' 'Guatemala' 'Guinea' 'Guinea-Bissau'
 'Guyana' 'Haiti' 'Honduras' 'Ho

In [6]:
print("Last 2 countries:\n", country_col[-2:], "\n")
print("Last 2 countries type:\n", type(country_col[-2:]), "\n")

Last 2 countries:
 200      Zambia
201    Zimbabwe
Name: Country, dtype: object 

Last 2 countries type:
 <class 'pandas.core.series.Series'> 



In [7]:
last_col = df.columns[-1]
print("Last df column signs:\n", last_col, np.sign(df[last_col]), "\n")

Last df column signs:
 Population (in thousands) total 0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
5      1.0
6      1.0
7      1.0
8      1.0
9      1.0
10     1.0
11     1.0
12     1.0
13     1.0
14     1.0
15     1.0
16     1.0
17     1.0
18     1.0
19     NaN
20     1.0
21     1.0
22     1.0
23     1.0
24     1.0
25     1.0
26     1.0
27     1.0
28     1.0
29     1.0
      ... 
172    1.0
173    1.0
174    1.0
175    1.0
176    NaN
177    1.0
178    1.0
179    1.0
180    1.0
181    1.0
182    1.0
183    1.0
184    1.0
185    1.0
186    1.0
187    1.0
188    1.0
189    1.0
190    1.0
191    1.0
192    1.0
193    1.0
194    1.0
195    1.0
196    1.0
197    1.0
198    NaN
199    1.0
200    1.0
201    1.0
Name: Population (in thousands) total, Length: 202, dtype: float64 



In [8]:
np.sum([0, np.nan])

nan

In [9]:
df.dtypes

Country                                                    object
CountryID                                                   int64
Continent                                                   int64
Adolescent fertility rate (%)                             float64
Adult literacy rate (%)                                   float64
Gross national income per capita (PPP international $)    float64
Net primary school enrolment ratio female (%)             float64
Net primary school enrolment ratio male (%)               float64
Population (in thousands) total                           float64
dtype: object

In [10]:
print(np.sum(df[last_col] - df[last_col].values))

0.0


# Querying Data in pandas

In [12]:
import quandl

In [13]:
sunspots = quandl.get("SIDC/SUNSPOTS_A")

In [14]:
print("Head 2:\n", sunspots.head(2) )

Head 2:
             Yearly Mean Total Sunspot Number  Yearly Mean Standard Deviation  \
Date                                                                           
1700-12-31                               8.3                             NaN   
1701-12-31                              18.3                             NaN   

            Number of Observations  Definitive/Provisional Indicator  
Date                                                                  
1700-12-31                     NaN                               1.0  
1701-12-31                     NaN                               1.0  


In [15]:
print("Tail 2:\n", sunspots.tail(2))

Tail 2:
             Yearly Mean Total Sunspot Number  Yearly Mean Standard Deviation  \
Date                                                                           
2016-12-31                              39.8                             3.9   
2017-12-31                              21.7                             2.6   

            Number of Observations  Definitive/Provisional Indicator  
Date                                                                  
2016-12-31                  9940.0                               1.0  
2017-12-31                 11020.0                               0.0  


In [16]:
last_date = sunspots.index[-1]
print("Last value:\n",sunspots.loc[last_date])

Last value:
 Yearly Mean Total Sunspot Number       21.7
Yearly Mean Standard Deviation          2.6
Number of Observations              11020.0
Definitive/Provisional Indicator        0.0
Name: 2017-12-31 00:00:00, dtype: float64


In [17]:
print("Values slice by date:\n", sunspots["20020101": "20131231"])

Values slice by date:
             Yearly Mean Total Sunspot Number  Yearly Mean Standard Deviation  \
Date                                                                           
2002-12-31                             163.6                             9.8   
2003-12-31                              99.3                             7.1   
2004-12-31                              65.3                             5.9   
2005-12-31                              45.8                             4.7   
2006-12-31                              24.7                             3.5   
2007-12-31                              12.6                             2.7   
2008-12-31                               4.2                             2.5   
2009-12-31                               4.8                             2.5   
2010-12-31                              24.9                             3.4   
2011-12-31                              80.8                             6.7   
2012-12-31       

In [18]:
print("Slice from a list of indices:\n", sunspots.iloc[[2, 4, -4, -2]])

Slice from a list of indices:
             Yearly Mean Total Sunspot Number  Yearly Mean Standard Deviation  \
Date                                                                           
1702-12-31                              26.7                             NaN   
1704-12-31                              60.0                             NaN   
2014-12-31                             113.3                             8.0   
2016-12-31                              39.8                             3.9   

            Number of Observations  Definitive/Provisional Indicator  
Date                                                                  
1702-12-31                     NaN                               1.0  
1704-12-31                     NaN                               1.0  
2014-12-31                  5273.0                               1.0  
2016-12-31                  9940.0                               1.0  


In [19]:
print("Scalar with Iloc:", sunspots.iloc[0, 0])
print("Scalar with iat", sunspots.iat[1, 0])

Scalar with Iloc: 8.3
Scalar with iat 18.3


In [20]:
print("Boolean selection:\n", sunspots[sunspots > sunspots.mean()])

Boolean selection:
             Yearly Mean Total Sunspot Number  Yearly Mean Standard Deviation  \
Date                                                                           
1700-12-31                               NaN                             NaN   
1701-12-31                               NaN                             NaN   
1702-12-31                               NaN                             NaN   
1703-12-31                               NaN                             NaN   
1704-12-31                               NaN                             NaN   
1705-12-31                              96.7                             NaN   
1706-12-31                               NaN                             NaN   
1707-12-31                               NaN                             NaN   
1708-12-31                               NaN                             NaN   
1709-12-31                               NaN                             NaN   
1710-12-31          

In [21]:
print("Boolean selection with column label:\n", sunspots[sunspots['Number of Observations'] > sunspots['Number of Observations'].mean()])

Boolean selection with column label:
             Yearly Mean Total Sunspot Number  Yearly Mean Standard Deviation  \
Date                                                                           
1981-12-31                             198.9                            13.1   
1982-12-31                             162.4                            12.1   
1983-12-31                              91.0                             7.6   
1984-12-31                              60.5                             5.9   
1985-12-31                              20.6                             3.7   
1986-12-31                              14.8                             3.5   
1987-12-31                              33.9                             3.7   
1988-12-31                             123.0                             8.4   
1989-12-31                             211.1                            12.8   
1990-12-31                             191.8                            11.2   
19

# Statistics with pandas DataFrame

In [22]:
import quandl

# Data from http://www.quandl.com/SIDC/SUNSPOTS_A-Sunspot-Numbers-Annual
# PyPi url https://pypi.python.org/pypi/Quandl
sunspots = quandl.get("SIDC/SUNSPOTS_A")
print("Describe", sunspots.describe(),"\n")
print("Non NaN observations", sunspots.count(),"\n")
print("MAD", sunspots.mad(),"\n")
print("Median", sunspots.median(),"\n")
print("Min", sunspots.min(),"\n")
print("Max", sunspots.max(),"\n")
print("Mode", sunspots.mode(),"\n")
print("Standard Deviation", sunspots.std(),"\n")
print("Variance", sunspots.var(),"\n")
print("Skewness", sunspots.skew(),"\n")
print("Kurtosis", sunspots.kurt(),"\n")

Describe        Yearly Mean Total Sunspot Number  Yearly Mean Standard Deviation  \
count                        318.000000                      200.000000   
mean                          79.196855                        7.982500   
std                           61.985539                        3.818567   
min                            0.000000                        1.700000   
25%                           24.950000                        4.700000   
50%                           66.250000                        7.650000   
75%                          116.025000                       10.425000   
max                          269.300000                       19.100000   

       Number of Observations  Definitive/Provisional Indicator  
count              200.000000                        318.000000  
mean              1515.440000                          0.996855  
std               2548.854285                          0.056077  
min                150.000000                      

# Data Aggregation

In [23]:
import pandas as pd
from numpy.random import seed
from numpy.random import rand
from numpy.random import randint
import numpy as np

seed(42)

df = pd.DataFrame({'Weather' : ['cold', 'hot', 'cold', 'hot',
   'cold', 'hot', 'cold'],
   'Food' : ['soup', 'soup', 'icecream', 'chocolate',
   'icecream', 'icecream', 'soup'],
   'Price' : 10 * rand(7), 'Number' : randint(1, 9)})

print(df)

        Food  Number     Price Weather
0       soup       8  3.745401    cold
1       soup       8  9.507143     hot
2   icecream       8  7.319939    cold
3  chocolate       8  5.986585     hot
4   icecream       8  1.560186    cold
5   icecream       8  1.559945     hot
6       soup       8  0.580836    cold


In [24]:
weather_group = df.groupby('Weather')

i = 0

for name, group in weather_group:
   i = i + 1
   print("Group", i, name)
   print(group)

Group 1 cold
       Food  Number     Price Weather
0      soup       8  3.745401    cold
2  icecream       8  7.319939    cold
4  icecream       8  1.560186    cold
6      soup       8  0.580836    cold
Group 2 hot
        Food  Number     Price Weather
1       soup       8  9.507143     hot
3  chocolate       8  5.986585     hot
5   icecream       8  1.559945     hot


In [25]:
print("Weather group first\n", weather_group.first())
print("Weather group last\n", weather_group.last())
print("Weather group mean\n", weather_group.mean())

Weather group first
          Food  Number     Price
Weather                        
cold     soup       8  3.745401
hot      soup       8  9.507143
Weather group last
              Food  Number     Price
Weather                            
cold         soup       8  0.580836
hot      icecream       8  1.559945
Weather group mean
          Number     Price
Weather                  
cold          8  3.301591
hot           8  5.684558


In [26]:
wf_group = df.groupby(['Weather', 'Food'])
print("WF Groups", wf_group.groups)

WF Groups {('cold', 'icecream'): Int64Index([2, 4], dtype='int64'), ('cold', 'soup'): Int64Index([0, 6], dtype='int64'), ('hot', 'chocolate'): Int64Index([3], dtype='int64'), ('hot', 'icecream'): Int64Index([5], dtype='int64'), ('hot', 'soup'): Int64Index([1], dtype='int64')}


In [27]:
print("WF Aggregated\n", wf_group.agg([np.mean, np.median]))

WF Aggregated
                   Number            Price          
                    mean median      mean    median
Weather Food                                       
cold    icecream       8      8  4.440063  4.440063
        soup           8      8  2.163119  2.163119
hot     chocolate      8      8  5.986585  5.986585
        icecream       8      8  1.559945  1.559945
        soup           8      8  9.507143  9.507143


# Concatenating and appending DataFrames

In [28]:
print("df :3\n", df[:3])

df :3
        Food  Number     Price Weather
0      soup       8  3.745401    cold
1      soup       8  9.507143     hot
2  icecream       8  7.319939    cold


In [29]:
print("Concat Back together\n", pd.concat([df[:3], df[3:]]))

Concat Back together
         Food  Number     Price Weather
0       soup       8  3.745401    cold
1       soup       8  9.507143     hot
2   icecream       8  7.319939    cold
3  chocolate       8  5.986585     hot
4   icecream       8  1.560186    cold
5   icecream       8  1.559945     hot
6       soup       8  0.580836    cold


In [30]:
print("Appending rows\n", df[:3].append(df[5:]))

Appending rows
        Food  Number     Price Weather
0      soup       8  3.745401    cold
1      soup       8  9.507143     hot
2  icecream       8  7.319939    cold
5  icecream       8  1.559945     hot
6      soup       8  0.580836    cold


# joining DataFrames

In [32]:
dests = pd.read_csv('dest.csv')
print("Dests\n", dests)

tips = pd.read_csv('tips.csv')
print("Tips\n", tips)

print("Merge() on key\n", pd.merge(dests, tips, on='EmpNr'))
print("Dests join() tips\n", dests.join(tips, lsuffix='Dest', rsuffix='Tips'))

print("Inner join with merge()\n", pd.merge(dests, tips, how='inner'))
print("Outer join\n", pd.merge(dests, tips, how='outer'))

Dests
    EmpNr       Dest
0      5  The Hague
1      3  Amsterdam
2      9  Rotterdam
Tips
    EmpNr  Amount
0      5    10.0
1      9     5.0
2      7     2.5
Merge() on key
    EmpNr       Dest  Amount
0      5  The Hague    10.0
1      9  Rotterdam     5.0
Dests join() tips
    EmpNrDest       Dest  EmpNrTips  Amount
0          5  The Hague          5    10.0
1          3  Amsterdam          9     5.0
2          9  Rotterdam          7     2.5
Inner join with merge()
    EmpNr       Dest  Amount
0      5  The Hague    10.0
1      9  Rotterdam     5.0
Outer join
    EmpNr       Dest  Amount
0      5  The Hague    10.0
1      3  Amsterdam     NaN
2      9  Rotterdam     5.0
3      7        NaN     2.5


# Handlng missing Values

In [34]:
df = pd.read_csv('WHO_first9cols.csv')
# Select first 3 rows of country and Net primary school enrolment ratio male (%)
df = df[['Country', df.columns[-2]]][:2]
print("New df\n", df)
print("Null Values\n", pd.isnull(df))
print("Total Null Values\n", pd.isnull(df).sum())
print("Not Null Values\n", df.notnull())
print("Last Column Doubled\n", 2 * df[df.columns[-1]])
print("Last Column plus NaN\n", df[df.columns[-1]] + np.nan)
print("Zero filled\n", df.fillna(0))

New df
        Country  Net primary school enrolment ratio male (%)
0  Afghanistan                                          NaN
1      Albania                                         94.0
Null Values
    Country  Net primary school enrolment ratio male (%)
0    False                                         True
1    False                                        False
Total Null Values
 Country                                        0
Net primary school enrolment ratio male (%)    1
dtype: int64
Not Null Values
    Country  Net primary school enrolment ratio male (%)
0     True                                        False
1     True                                         True
Last Column Doubled
 0      NaN
1    188.0
Name: Net primary school enrolment ratio male (%), dtype: float64
Last Column plus NaN
 0   NaN
1   NaN
Name: Net primary school enrolment ratio male (%), dtype: float64
Zero filled
        Country  Net primary school enrolment ratio male (%)
0  Afghanistan                

# dealing with dates

In [35]:
print("Date range", pd.date_range('1/1/1900', periods=42, freq='D'))

Date range DatetimeIndex(['1900-01-01', '1900-01-02', '1900-01-03', '1900-01-04',
               '1900-01-05', '1900-01-06', '1900-01-07', '1900-01-08',
               '1900-01-09', '1900-01-10', '1900-01-11', '1900-01-12',
               '1900-01-13', '1900-01-14', '1900-01-15', '1900-01-16',
               '1900-01-17', '1900-01-18', '1900-01-19', '1900-01-20',
               '1900-01-21', '1900-01-22', '1900-01-23', '1900-01-24',
               '1900-01-25', '1900-01-26', '1900-01-27', '1900-01-28',
               '1900-01-29', '1900-01-30', '1900-01-31', '1900-02-01',
               '1900-02-02', '1900-02-03', '1900-02-04', '1900-02-05',
               '1900-02-06', '1900-02-07', '1900-02-08', '1900-02-09',
               '1900-02-10', '1900-02-11'],
              dtype='datetime64[ns]', freq='D')


In [36]:
import sys
try:
   print("Date range", pd.date_range('1/1/1677', periods=4, freq='D'))
except:
   etype, value, _ = sys.exc_info()
   print("Error encountered", etype, value)

Error encountered <class 'pandas._libs.tslib.OutOfBoundsDatetime'> Out of bounds nanosecond timestamp: 1677-01-01 00:00:00


In [37]:
offset = pd.DateOffset(seconds=2 ** 33/10 ** 9)
mid = pd.to_datetime('1/1/1970')
print("Start valid range", mid - offset)
print("End valid range", mid + offset)

Start valid range 1969-12-31 23:59:51.410065
End valid range 1970-01-01 00:00:08.589935


In [38]:
print("With format", pd.to_datetime(['19021112', '19031230'], format='%Y%m%d'))

With format DatetimeIndex(['1902-11-12', '1903-12-30'], dtype='datetime64[ns]', freq=None)


In [39]:
print("Illegal date", pd.to_datetime(['1902-11-12', 'not a date']) )

ValueError: Unknown string format

In [40]:
print("Illegal date coerced", pd.to_datetime(['1902-11-12', 'not a date'], errors='coerce'))

Illegal date coerced DatetimeIndex(['1902-11-12', 'NaT'], dtype='datetime64[ns]', freq=None)


# Pivot Tables

In [41]:
seed(42)
N = 7
df = pd.DataFrame({
   'Weather' : ['cold', 'hot', 'cold', 'hot',
   'cold', 'hot', 'cold'],
   'Food' : ['soup', 'soup', 'icecream', 'chocolate',
   'icecream', 'icecream', 'soup'],
   'Price' : 10 * rand(N), 'Number' : randint(1, 9)})

In [42]:
print("DataFrame\n", df)

DataFrame
         Food  Number     Price Weather
0       soup       8  3.745401    cold
1       soup       8  9.507143     hot
2   icecream       8  7.319939    cold
3  chocolate       8  5.986585     hot
4   icecream       8  1.560186    cold
5   icecream       8  1.559945     hot
6       soup       8  0.580836    cold


In [43]:
print(pd.pivot_table(df, columns=['Food'], aggfunc=np.sum))

Food    chocolate   icecream      soup
Number   8.000000  24.000000  24.00000
Price    5.986585  10.440071  13.83338


In [51]:
import datetime
import pandas as pd
import pandas_datareader.data as web
## !!! here it is 'pandas_datareader' rather than 'pandas-datareader'

In [55]:
start = datetime.datetime(2016, 1, 1) # or start = '1/1/2016'
end = datetime.date.today()
prices = web.DataReader('F', 'morningstar', start, end)
print(prices.head())  # print first rows of the prices data

                   Close   High    Low   Open    Volume
Symbol Date                                            
F      2016-01-01  14.09  14.09  14.09  14.09         0
       2016-01-04  13.97  14.00  13.75  13.87  38618524
       2016-01-05  13.72  14.00  13.51  13.97  50266484
       2016-01-06  13.11  13.56  13.05  13.56  61263777
       2016-01-07  12.70  13.04  12.60  12.90  57838021
