In [20]:
import numpy as np
import pandas as pd

In [21]:
# A small named Series holding the integers 1 through 5 on the default integer index.
a = pd.Series(range(1, 6), name='myname')

In [22]:
# Popularity scores of ten programming languages in 2014, labelled by language name.
pop = pd.Series(
    data=[100, 99.2, 90, 88.4, 85.3, 80, 75, 73, 69, 65],
    index=['Java', 'Python', 'C', 'C++', 'C#',
           'Javascript', 'Scala', 'Ruby', 'Perl', 'Go'],
)

In [23]:
# A plain integer slice on this string-labelled Series is positional:
# it returns the first five entries.
pop[:5]

Java      100.0
Python     99.2
C          90.0
C++        88.4
C#         85.3
dtype: float64

In [24]:
# The index holds the row labels of the Series (here, the language names).
pop.index

Index(['Java', 'Python', 'C', 'C++', 'C#', 'Javascript', 'Scala', 'Ruby',
       'Perl', 'Go'],
      dtype='object')

In [25]:
# The data of the Series, without its labels, as a NumPy array.
pop.values

array([100. ,  99.2,  90. ,  88.4,  85.3,  80. ,  75. ,  73. ,  69. ,
        65. ])

In [26]:
# Slicing with labels includes both endpoints: 'C++' is part of the result.
pop['Java':'C++']

Java      100.0
Python     99.2
C          90.0
C++        88.4
dtype: float64

In [27]:
# To be explicit about the kind of indexing, use .iloc for integer positions
# and .loc for labels.
print(pop.iloc[:2], pop.loc['Java':'C++'], sep='\n')

Java      100.0
Python     99.2
dtype: float64
Java      100.0
Python     99.2
C          90.0
C++        88.4
dtype: float64


In [28]:
# Advanced indexing works too: `pop > 80` is a boolean mask, and indexing with it
# keeps only the entries where the mask is True.
pop.loc[pop > 80]

Java      100.0
Python     99.2
C          90.0
C++        88.4
C#         85.3
dtype: float64

In [29]:
# A Series can also be created from a Python dict: the keys become the index
# labels and the values become the data. It is sometimes useful to think of a
# pandas Series as akin to a dictionary rather than a NumPy array.
# Here is a series of languages by their popularity in 2015, written as a dict.
pop_another = pd.Series(
    data={
        'Java': 100,
        'C': 99.2,
        'Python': 90,
        'C#': 88.4,
        'C++': 85.3,
        'Javascript': 80,
        'Ruby': 75,
        'Scala': 73,
        'Perl': 69,
        'Go': 65,
    },
    name="myseries",
)
pop_another

Java          100.0
C              99.2
Python         90.0
C#             88.4
C++            85.3
Javascript     80.0
Ruby           75.0
Scala          73.0
Perl           69.0
Go             65.0
Name: myseries, dtype: float64

In [30]:
# Pandas DataFrames extend NumPy two-dimensional arrays by giving labels to the columns
# and, if you provide an explicit index, also to the rows.
# I will show you several things about DataFrames:
#
# * how to create them from pandas Series, Python dicts, and NumPy arrays
# * how to work with indexes and columns
# * how to perform database-style operations between DataFrames, such as joins.

In [31]:
# Language popularity in 2014, built from a list of values plus an explicit index.
pop2014 = pd.Series(
    data=[100, 99.2, 90, 88.4, 85.3, 80, 75, 73, 69, 65],
    index=['Java', 'Python', 'C', 'C++', 'C#',
           'Javascript', 'Scala', 'Ruby', 'Perl', 'Go'],
)
# Language popularity in 2015, built from a dict whose keys become the index.
pop2015 = pd.Series(
    data={
        'Java': 100,
        'C': 99.2,
        'Python': 90,
        'C#': 88.4,
        'C++': 85.3,
        'Javascript': 80,
        'Ruby': 75,
        'Scala': 73,
        'Perl': 69,
        'Go': 65,
    },
)

In [32]:
# From two series, we can create a DataFrame. The indexes will be matched automatically. 
# That's the kind of magic that Pandas does for us.
# We included the two series in a Python dict, so the keys will be used as column names.
# So I call Pandas DataFrame and give it a Python dict with key 2014 for the first series
# and 2015, obviously, for the second.

In [33]:
# A dict of Series becomes a DataFrame with one column per key; the rows of the
# two Series are matched on their index labels.
twoyears = pd.DataFrame(data={'2014': pop2014, '2015': pop2015})
# As the last expression in the cell, the DataFrame is rendered by the notebook
# as a nice table.
twoyears

Unnamed: 0,2014,2015
C,90.0,99.2
C#,85.3,88.4
C++,88.4,85.3
Go,65.0,65.0
Java,100.0,100.0
Javascript,80.0,80.0
Perl,69.0,69.0
Python,99.2,90.0
Ruby,73.0,75.0
Scala,75.0,73.0


In [39]:
# We can do many things with the DataFrame.
# Here the rows are ordered by the values in the '2015' column using sort_values;
# ascending=False puts the largest values at the top.
# Numerical operations apply to entire columns at once, so the average popularity
# across 2014 and 2015 is computed in one expression and stored in a new column, 'avg'.
twoyears = (
    twoyears
    .sort_values(by='2015', ascending=False)
    .assign(avg=lambda frame: 0.5 * (frame['2015'] + frame['2014']))
)
twoyears

Unnamed: 0,2014,2015,avg
Java,100.0,100.0,100.0
C,90.0,99.2,94.6
Python,99.2,90.0,94.6
C#,85.3,88.4,86.85
C++,88.4,85.3,86.85
Javascript,80.0,80.0,80.0
Ruby,73.0,75.0,74.0
Scala,75.0,73.0,74.0
Perl,69.0,69.0,69.0
Go,65.0,65.0,65.0


In [56]:
# pandas is built on top of NumPy: the data of a DataFrame is available as a
# NumPy array through the .values attribute.
print(twoyears.values)
# The row labels and the column names can be extracted in the same way.
print(twoyears.index)
print(twoyears.columns)  # names of columns
# Indexing the DataFrame with brackets and a column name returns that column.
print(twoyears['2015'])
# To select a subset of rows, use .loc (by label) or .iloc (by position).
print(twoyears.loc['Java':'C++'])
print(twoyears.iloc[:2])
# Column first, then the first two rows of that column by position.
twoyears['2015'].iloc[:2]

[[100.   100.   100.  ]
 [ 90.    99.2   94.6 ]
 [ 99.2   90.    94.6 ]
 [ 85.3   88.4   86.85]
 [ 88.4   85.3   86.85]
 [ 80.    80.    80.  ]
 [ 73.    75.    74.  ]
 [ 75.    73.    74.  ]
 [ 69.    69.    69.  ]
 [ 65.    65.    65.  ]]
Index(['Java', 'C', 'Python', 'C#', 'C++', 'Javascript', 'Ruby', 'Scala',
       'Perl', 'Go'],
      dtype='object')
Index(['2014', '2015', 'avg'], dtype='object')
Java          100.0
C              99.2
Python         90.0
C#             88.4
C++            85.3
Javascript     80.0
Ruby           75.0
Scala          73.0
Perl           69.0
Go             65.0
Name: 2015, dtype: float64
         2014   2015     avg
Java    100.0  100.0  100.00
C        90.0   99.2   94.60
Python   99.2   90.0   94.60
C#       85.3   88.4   86.85
C++      88.4   85.3   86.85
       2014   2015    avg
Java  100.0  100.0  100.0
C      90.0   99.2   94.6


Java    100.0
C        99.2
Name: 2015, dtype: float64

In [62]:
# A DataFrame can also be built row by row: describe every row as a dict that maps
# column names to values, and hand pandas a list of those dicts.
presidents = pd.DataFrame([
    dict(name='Barack Obama', inauguration=2009, birthyear=1961),
    dict(name='George W. Bush', inauguration=2001, birthyear=1946),
    dict(name='Bill Clinton', inauguration=1993, birthyear=1946),
    dict(name='George H. W. Bush', inauguration=1989, birthyear=1924),
])
presidents

Unnamed: 0,birthyear,inauguration,name
0,1961,2009,Barack Obama
1,1946,2001,George W. Bush
2,1946,1993,Bill Clinton
3,1924,1989,George H. W. Bush


In [71]:
# One of the columns can serve as the index: set_index returns a new DataFrame
# whose row labels are the values of the 'name' column.
presidents_indexes = presidents.set_index('name')
print(presidents_indexes)
# With names as the index we can ask explicitly for a single value, for instance
# Bill Clinton's inauguration year, by giving .loc the row label and the column
# label together.
print(presidents_indexes.loc['Bill Clinton', 'inauguration'])
# The same lookup can be done in two steps. Remember that the .loc object is
# needed to index rows; with only a row label it returns the entire record for Clinton.
print(presidents_indexes.loc['Bill Clinton'])
# Then we can choose the right column from that record.
print(presidents_indexes.loc['Bill Clinton']['inauguration'])
# Or we could select the column first and then index the president we want.
print(presidents_indexes['inauguration'].loc['Bill Clinton'])

                   birthyear  inauguration
name                                      
Barack Obama            1961          2009
George W. Bush          1946          2001
Bill Clinton            1946          1993
George H. W. Bush       1924          1989
1993
birthyear       1946
inauguration    1993
Name: Bill Clinton, dtype: int64
1993
1993


In [72]:
# pandas implements several operations common in relational databases, such as joins.
# Only the simplest kind is shown here: a one-to-one join, which pandas performs
# with the merge function.
# A join needs a second table to combine with the table of presidents, so this one
# describes their fathers.
presidents_fathers = pd.DataFrame([
    dict(son='Barack Obama', father='Barack Obama, Sr.'),
    dict(son='George W. Bush', father='George H. W. Bush'),
    dict(son='George H. W. Bush', father='Prescott Bush'),
])

In [None]:
# In a join, which we do with Pandas merge, we combine two tables by matching values between two columns.
# In this case, we combine presidents and their fathers using the column, name, for the first DataFrame
# and the column, son, for the second.
# We specify these columns with the left_on and right_on arguments to merge.
# The resulting table has a redundant column that we can drop.