In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

### Pandas Series
#### Create a Series

In [2]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [3]:
# Create a Series from a one-dimensional NumPy array.
# The array holds a single dtype (float64 here, because NaN is a float);
# the Series receives the default integer index 0, 1, 2, ..., n-1.
arr = np.array([1, 3, 5, np.nan, 10])   # np.nan: the np.NaN alias was removed in NumPy 2.0
series1 = pd.Series(arr)
series1

0     1.0
1     3.0
2     5.0
3     NaN
4    10.0
dtype: float64

In [4]:
# Build a Series from a dictionary: every key becomes an index label and
# every value becomes the element stored under that label.
series2 = pd.Series({'a': 10, 'b': 20, 'c': 30})
print(series2)
print(series2.keys())     # keys() returns the index
print('c' in series2)     # membership is tested against the index labels

a    10
b    20
c    30
dtype: int64
Index(['a', 'b', 'c'], dtype='object')
True


In [5]:
# Build a Series from a plain Python list (default integer index 0..4).
s = pd.Series([1, 2, 3, 4, 5])
# Assigning to a label that does not exist yet appends a new entry,
# so the index jumps from 4 straight to 6.
s[6] = 10
print(s.index)
print(s.values)

Int64Index([0, 1, 2, 3, 4, 6], dtype='int64')
[ 1  2  3  4  5 10]


In [6]:
# A Series keeps its data (.values) and its labels (.index) as separate parts;
# print both parts and then the Series itself, one per line.
revenues = pd.Series([5555, 7000, 1980])
print(revenues.values, revenues.index, revenues, sep="\n")

[5555 7000 1980]
RangeIndex(start=0, stop=3, step=1)
0    5555
1    7000
2    1980
dtype: int64


In [16]:
# Create a Series from a NumPy float array (default integer index 0, 1, 2).
ser = pd.Series(np.arange(3.))
ser
# ser[-1]   # NOTE(review): presumably left disabled because, with an integer index,
#           # -1 is looked up as a label (which does not exist) rather than a position.

0    0.0
1    1.0
2    2.0
dtype: float64

In [15]:
# Same values as before, but with explicit string labels.
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
# Use .iloc for position-based access. Plain ser2[-1] relies on an integer key
# silently falling back to a position on a non-integer index, which pandas
# has deprecated.
ser2.iloc[-1]
ser2

2.0

a    0.0
b    1.0
c    2.0
dtype: float64

In [20]:
# Revenue per city, labelled by city name.
city_revenues = pd.Series(
    [4200, 8000, 6500, 5800],
    index=["Armsterdam", "Toronto", "Tokyo", "Beijing"],
)
print(city_revenues)
print(city_revenues["Tokyo"])   # same element as attribute access: city_revenues.Tokyo

Armsterdam    4200
Toronto       8000
Tokyo         6500
Beijing       5800
dtype: int64
6500


In [21]:
# Employee head-count per city, built from a dictionary
# (the dictionary keys become the index labels).
city_employee_count = pd.Series({
    "Armsterdam": 5,
    "Tokyo": 8,
    "Beijing": 14,
})
print(city_employee_count)
print(city_employee_count.keys())
"Tokyo" in city_employee_count

Armsterdam     5
Tokyo          8
Beijing       14
dtype: int64
Index(['Armsterdam', 'Tokyo', 'Beijing'], dtype='object')


True

In [31]:
# Turn a letter-indexed Series into a one-column DataFrame.
# Map the 26 lowercase letters to the numbers 1..26.
alist = [chr(code) for code in range(ord('a'), ord('z') + 1)]
arr = np.arange(1, 27)
adict = {letter: number for letter, number in zip(alist, arr)}
ser = pd.Series(adict)

# to_frame() keeps the letters as the frame's index and puts the values in a
# single column named 0; the index is NOT turned into a column by this call.
df = ser.to_frame()
df
df.index

Unnamed: 0,0
a,1
b,2
c,3
d,4
e,5
f,6
g,7
h,8
i,9
j,10


Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
       'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'],
      dtype='object')

In [22]:
# Combine the two city Series into one DataFrame, one column per Series.
# Rows are matched by index label; a city missing from one Series shows NaN there.
city_data = pd.DataFrame({
    "revenue": city_revenues,
    "employee_count": city_employee_count,
})
print(city_data)
print(city_data.index)      # row labels
print(city_data.columns)    # column labels
print(city_data.values)     # the underlying 2-D array
print(city_data.axes)       # [row labels, column labels]
print(city_data.axes[0])
print(city_data.axes[1])

            revenue  employee_count
Armsterdam     4200             5.0
Beijing        5800            14.0
Tokyo          6500             8.0
Toronto        8000             NaN
Index(['Armsterdam', 'Beijing', 'Tokyo', 'Toronto'], dtype='object')
Index(['revenue', 'employee_count'], dtype='object')
[[4.2e+03 5.0e+00]
 [5.8e+03 1.4e+01]
 [6.5e+03 8.0e+00]
 [8.0e+03     nan]]
[Index(['Armsterdam', 'Beijing', 'Tokyo', 'Toronto'], dtype='object'), Index(['revenue', 'employee_count'], dtype='object')]
Index(['Armsterdam', 'Beijing', 'Tokyo', 'Toronto'], dtype='object')
Index(['revenue', 'employee_count'], dtype='object')


In [23]:
# A second DataFrame, built from column lists plus an explicit row index.
further_city_data = pd.DataFrame(
    data={"revenue": [7000, 3400], "employee_count": [2, 3]},
    index=["New York", "Barcelona"],
)
# Show the new frame and the existing city_data one after the other.
display(further_city_data, city_data)

Unnamed: 0,revenue,employee_count
New York,7000,2
Barcelona,3400,3


Unnamed: 0,revenue,employee_count
Armsterdam,4200,5.0
Beijing,5800,14.0
Tokyo,6500,8.0
Toronto,8000,


In [24]:
# Stack the two frames row-wise (axis=0 is concat()'s default, spelled out here).
all_city_data = pd.concat(
    [city_data, further_city_data],
    axis=0,
    sort=False,
)
all_city_data

Unnamed: 0,revenue,employee_count
Armsterdam,4200,5.0
Beijing,5800,14.0
Tokyo,6500,8.0
Toronto,8000,
New York,7000,2.0
Barcelona,3400,3.0


In [25]:
# Country and capital flag per city, indexed by city name.
city_countries = pd.DataFrame(
    {
        "country": ["Holland", "Japan", "Holland", "Canada", "Spain"],
        "capital": [1, 1, 0, 0, 0],
    },
    index=["Armsterdam", "Tokyo", "Rotterdam", "Toronto", "Barcelona"],
)
city_countries
# Place the two frames side by side; every city from either frame is kept,
# with NaN where a frame has no row for that city.
cities = pd.concat([all_city_data, city_countries], axis="columns", sort=False)
cities

Unnamed: 0,country,capital
Armsterdam,Holland,1
Tokyo,Japan,1
Rotterdam,Holland,0
Toronto,Canada,0
Barcelona,Spain,0


Unnamed: 0,revenue,employee_count,country,capital
Armsterdam,4200.0,5.0,Holland,1.0
Beijing,5800.0,14.0,,
Tokyo,6500.0,8.0,Japan,1.0
Toronto,8000.0,,Canada,0.0
New York,7000.0,2.0,,
Barcelona,3400.0,3.0,Spain,0.0
Rotterdam,,,Holland,0.0


In [None]:
# join="inner" keeps only the cities that appear in both DataFrames.
cities = pd.concat(
    [all_city_data, city_countries],
    axis="columns",
    join="inner",
)
cities

In [None]:
# Country-level facts, indexed by country name.
countries = pd.DataFrame({
    "population_millions": [17, 127, 35],
    "continent": ["Europe", "Asia", "North America"]   # fixed data typo: was "Asis"
}, index = ["Holland", "Japan", "Canada"])
display(countries)
# merge() performs an inner join by default: only the cities whose country is
# known and appears in `countries` end up in the result.
display(pd.merge(cities, countries, left_on = "country",
         right_index = True))
# how = "left" keeps every city from `cities`, whether or not its country matches.
display(pd.merge(cities, countries, left_on = "country",
        right_index = True, how = "left"))

In [None]:
# For a DataFrame, the `in` operator tests the column labels, not the row index.
city_data.keys()
print("revenue" in city_data)
print("Armsterdam" in city_data)   # `in` checks the columns, not the row index

In [None]:
# Two ways to pick one element: by label, or by position.
print(city_revenues)
print(city_revenues["Toronto"])   # label-based access
# Position-based access goes through .iloc. A bare city_revenues[1] on a
# string-labelled Series depends on a positional fallback that pandas has deprecated.
print(city_revenues.iloc[1])

In [None]:
# .loc selects by label: rows from "Armsterdam" through "Tokyo", column "revenue".
city_data.loc["Armsterdam":"Tokyo", "revenue"]

There are two ways to assign index labels to a Series.
One is to set the index after the Series has been created.
The other is to pass the index when creating the Series.

In [None]:
# Assign labels after construction by overwriting the .index attribute.
series1 = pd.Series([1, 2, 3, 4])
series1.index = list('abcd')
series1

In [None]:
# Supply the labels (and the dtype) directly to the constructor.
series1 = pd.Series(
    data=np.array([1, 2, 4, 3]),
    index=list('mnop'),
    dtype=np.float64,
)
series1

In [None]:
# Index labels do not have to be unique: 'm' appears twice here.
series2 = pd.Series(
    data=np.array([1, 2, 4, 3, 77]),
    index=list('mnopm'),
    dtype=np.float64,
)
series2

In [None]:
# A few more ways to construct a Series.

# 1) From a NumPy range, with the default integer index.
series3 = pd.Series(np.arange(5))
series3

# 2) Values and index supplied as two separate arrays (the index counts down 9..5).
s_values = np.arange(5)
s_index = np.arange(9, 4, -1)
series4 = pd.Series(data=s_values, index=s_index)
series4

# 3) From a scalar: pd.Series(10) on its own holds a single entry, but when an
#    index is given the scalar is repeated once for every label.
series5 = pd.Series(10, index=list('abc'))
series5

##### Select value from Series

In [None]:
series1['m']    # select a single element by its index label

In [None]:
series2['m']   # the label 'm' is duplicated in series2, so every matching entry is selected

In [None]:
# Position-based selection (third element). Positions always exist alongside the
# user-defined labels, but they are addressed through .iloc; a bare series2[2] on a
# string-labelled Series relies on a positional fallback that pandas has deprecated.
series2.iloc[2]

In [None]:
# Second-to-last element by position. .iloc is used because a bare series2[-2] on a
# string-labelled Series relies on a positional fallback that pandas has deprecated.
series2.iloc[-2]

In [None]:
series2[:]   # a full slice selects every element

In [None]:
series2[1:3]      # integer slice: the stop position 3 is not included

In [None]:
# Five float values under unique single-letter labels.
series3 = pd.Series(
    data=np.array([1, 2, 4, 3, 77]),
    index=list('mnops'),
    dtype=np.float64,
)
series3

In [None]:
series3['m':'p']    # label slice: unlike an integer slice, the end label 'p' is included

In [None]:
# A small integer Series with one negative value, used for the arithmetic
# examples in the following cells.
series4 = pd.Series({'a': -1, 'b': 2, 'c': 3})
series4


Most of NumPy's array operations are also available on a pd.Series.

In [None]:
# Boolean filtering: keep only the elements greater than 1.
series4[series4 > 1]

In [None]:
# Adding a scalar applies to every element and produces a new Series.
series5 = series4 + 1
series5

In [None]:
# Element-wise division by a scalar.
series4 / 10

In [None]:
# NumPy functions can be applied to a Series directly: element-wise exponential.
np.exp(series4)

In [None]:
# Element-wise absolute value via NumPy.
np.fabs(series4)

In [None]:
# Median of the values in series4.
series4.median()

In [None]:
series4.get('c', 88)   # label 'c' exists, so its value is returned and the default 88 is not used

In [None]:
series4.get('d',99)    # label 'd' does not exist, so the default value 99 is returned
series4                # get() does not change the original series4

In [None]:
series5 = series4.copy()     # independent copy: changes to series5 do not affect series4
series5.index = ['aa', 'bb', 'cc']   # relabel the copy only
series5

Missing Value

In [None]:
# Build a Series from series4 against a longer label list; the extra label 'd'
# has no value in series4.
new_index = ['a','b', 'c', 'd']
series6 = pd.Series(series4, index = new_index)
series6          # NaN stands for a missing value

In [None]:
# Inspect series6, the Series that actually contains a missing value
# (series5 is a fully populated copy, so checking it shows nothing).
pd.isnull(series6)  # True where the value is NaN

In [None]:
# Inspect series6, the Series that actually contains a missing value
# (series5 is a fully populated copy, so checking it shows nothing).
pd.notnull(series6)  # True where the value is not NaN

In [None]:
# Fill the gap in series6, the Series that actually contains a missing value
# (series5 is a fully populated copy, so filling it changes nothing).
series6.fillna(0)  # fillna() returns a copy with each NaN replaced by the given value

Index labels are aligned automatically in arithmetic (+/-) operations

In [None]:
# Two Series whose labels only partly overlap ('idx2' and 'idx3' are shared).
series6 = pd.Series({'idx1': 10, 'idx2': 20, 'idx3': 30})
series7 = pd.Series({'idx4': 70, 'idx2': 50, 'idx3': 50})

# Addition matches elements by label, not by position: the shared labels are
# summed, while 'idx1' and 'idx4' exist on one side only and come out as NaN.
series6 + series7

In [None]:
print(series7.name)    # both a Series and its index have a 'name' attribute; it is None by default

In [None]:
print(series7.index.name)   # name of the index object

In [None]:
# Name the Series and its index, then read both names back.
series7.name = "Series"
series7.index.name = "index"
series7.name
series7.index.name

In [None]:
# Print the built-in documentation for pd.Series.
# NOTE(review): the output is presumably very long; consider clearing it before sharing.
help(pd.Series)