In [136]:
import numpy as np
import pandas as pd

ModuleNotFoundError: No module named 'pandas_datareader'

In [3]:
#A Series is a one-dimensional array-like object containing a sequence of values (of types similar to NumPy's) and an associated array of data labels, called its index
#by default, the index runs from 0 to len - 1
obj = pd.Series([2, 5, 6, 9])
obj

0    2
1    5
2    6
3    9
dtype: int64

In [4]:
#get the underlying values as a NumPy array
obj.values

array([2, 5, 6, 9], dtype=int64)

In [5]:
#get the index object (a RangeIndex when no labels were supplied)
obj.index

RangeIndex(start=0, stop=4, step=1)

In [11]:
#you can pass a list of labels, of the same length as the data, to use as the index of the Series
#labels do not have to be unique ('b' appears twice here)
obj = pd.Series([2, 5, 6, 9], index = ['a', 'b', 'c', 'b'])
obj

a    2
b    5
c    6
b    9
dtype: int64

In [12]:
#here is how to get the value at an index label (notice that all values sharing that label are returned together)
obj['b']

b    5
b    9
dtype: int64

In [13]:
#you can also pass a list of labels to get multiple values
obj[['a', 'b']]

a    2
b    5
b    9
dtype: int64

In [16]:
#check whether or not a label is in the index of a Series
#REMEMBER: you cannot use the same syntax to check for a value in the Series
'a' in obj

True

In [17]:
#A Series behaves much like an ordered, fixed-length dictionary mapping index labels to values
#so a plain dict can be handed straight to the constructor: its keys become the index
sdata = {
    'Ohio': 35000,
    'Pennsylvania': 48000,
    'New York': 30400,
    'Wisconsin': 90330,
}
obj2 = pd.Series(data=sdata)
obj2

New York        30400
Ohio            35000
Pennsylvania    48000
Wisconsin       90330
dtype: int64

In [22]:
#you can pass an explicit index to choose the labels and their order; any label with no value in the dict is filled with NaN (not a number)
#since Pennsylvania is not in the index list, it is excluded from the Series object
states = ['Ohio', 'New York', 'Wisconsin', 'Mississippi']
obj3 = pd.Series(sdata, index = states)
obj3

Ohio           35000.0
New York       30400.0
Wisconsin      90330.0
Mississippi        NaN
dtype: float64

In [23]:
#Series are automatically aligned by index label in arithmetic operations; this is similar to a join operation in a database
#notice that any label that is NaN or missing on either side yields NaN in the result
obj2 + obj3

Mississippi          NaN
New York         60800.0
Ohio             70000.0
Pennsylvania         NaN
Wisconsin       180660.0
dtype: float64

In [24]:
#both the Series object and its index have an attribute called 'name'
obj2.name = 'Population'
obj2.index.name = 'State'
obj2

State
New York        30400
Ohio            35000
Pennsylvania    48000
Wisconsin       90330
Name: Population, dtype: int64

In [28]:
#A DataFrame is a rectangular table of data: an ordered collection of columns, each of which may hold a different value type (numeric, string, boolean, etc.)
#It has both a row index and a column index
data = {
    'State': ['Ohio', 'Ohio', 'Nevada', 'Pennsylvania'],
    'year': ['2000', '2001', '2002', '2003'],
    'population': [13042, 43242, 50659, 85002],
}
#the columns argument fixes the order in which the dict's keys appear as columns
dframe = pd.DataFrame(data=data, columns=['year', 'State', 'population'])
dframe

Unnamed: 0,year,State,population
0,2000,Ohio,13042
1,2001,Ohio,43242
2,2002,Nevada,50659
3,2003,Pennsylvania,85002


In [30]:
#if you pass a column name that is not a key of the dictionary, it appears in the result filled with missing values
frame2 = pd.DataFrame(data, columns = ['year', 'State', 'population', 'debt'])
frame2

Unnamed: 0,year,State,population,debt
0,2000,Ohio,13042,
1,2001,Ohio,43242,
2,2002,Nevada,50659,
3,2003,Pennsylvania,85002,


In [31]:
#A column of a DataFrame can be retrieved as a Series using dictionary-like notation
frame2['State']

0            Ohio
1            Ohio
2          Nevada
3    Pennsylvania
Name: State, dtype: object

In [33]:
#the column can also be retrieved as an attribute
#frame2[column] works for any column name, but frame2.column only works when the column name is a valid Python identifier
frame2.State

0            Ohio
1            Ohio
2          Nevada
3    Pennsylvania
Name: State, dtype: object

In [48]:
#Rows can be retrieved by label with the special loc attribute (and by integer position with iloc)
#give the rows string labels first: without this assignment frame2 still has its default 0..3 index and .loc['two'] raises KeyError on a fresh kernel
frame2.index = ['one', 'two', 'three', 'four']
frame2.loc['two']

year           2001
State          Ohio
population    43242
debt            NaN
Name: two, dtype: object

In [49]:
#an existing column can be overwritten by assigning a scalar (repeated for every row) or an array (whose length must match the number of rows)
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,State,population,debt
one,2000,Ohio,13042,16.5
two,2001,Ohio,43242,16.5
three,2002,Nevada,50659,16.5
four,2003,Pennsylvania,85002,16.5


In [53]:
#assigning a list replaces the column values row by row; its length must equal the number of rows
frame2['debt'] = [4, 6, 2, 9]
frame2

Unnamed: 0,year,State,population,debt
one,2000,Ohio,13042,4
two,2001,Ohio,43242,6
three,2002,Nevada,50659,2
four,2003,Pennsylvania,85002,9


In [56]:
#when a Series is assigned to a column it is aligned on the DataFrame's index, much like a join:
#row labels missing from the Series become NaN, and Series labels not in the DataFrame are ignored
sr1 = pd.Series({'one': 4, 'five': 5, 'four': 9, 'three': 12, 'six': 14})
frame2['debt'] = sr1
frame2

Unnamed: 0,year,State,population,debt
one,2000,Ohio,13042,4.0
two,2001,Ohio,43242,
three,2002,Nevada,50659,12.0
four,2003,Pennsylvania,85002,9.0


In [64]:
#assigning to a column that doesn't exist creates a new column, but you cannot use the frame2.columnname syntax for this because the column does not exist yet
frame2['dafuq'] = sr1
frame2

Unnamed: 0,year,State,population,debt,dafuq
one,2000,Ohio,13042,4.0,4.0
two,2001,Ohio,43242,,
three,2002,Nevada,50659,12.0,12.0
four,2003,Pennsylvania,85002,9.0,9.0


In [65]:
#delete a column with the del keyword, as you would a dict entry
del frame2['dafuq']
frame2.columns

Index(['year', 'State', 'population', 'debt'], dtype='object')

In [67]:
#A nested dict is another common input for DataFrame: the outer keys become the columns and the inner keys become the row index
pop = {
    'Nevada': {2001: 2.4, 2002: 4.9, 2003: 1.6, 2004: 3.0},
    'Pennsylvania': {2001: 2.9, 2002: 4.1, 2003: 1.8, 2004: 5.0},
    'Ohio': {2001: 3.1, 2002: 4.5, 2003: 1.5, 2004: 7.8},
}
frame3 = pd.DataFrame(data=pop)
frame3

Unnamed: 0,Nevada,Ohio,Pennsylvania
2001,2.4,3.1,2.9
2002,4.9,4.5,4.1
2003,1.6,1.5,1.8
2004,3.0,7.8,5.0


In [68]:
#you can transpose this table (swap rows and columns) with the T attribute
frame3.T

Unnamed: 0,2001,2002,2003,2004
Nevada,2.4,4.9,1.6,3.0
Ohio,3.1,4.5,1.5,7.8
Pennsylvania,2.9,4.1,1.8,5.0


In [70]:
#if a DataFrame's index and columns have their name attributes set, these names are displayed as well
frame3.index.name = 'Year'
frame3.columns.name = 'State'
frame3

State,Nevada,Ohio,Pennsylvania
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001,2.4,3.1,2.9
2002,4.9,4.5,4.1
2003,1.6,1.5,1.8
2004,3.0,7.8,5.0


In [73]:
#pandas's Index objects are responsible for holding the axis labels and other metadata (like the axis name or names).
#Any array or other sequence of labels you use when constructing a Series or DataFrame is internally converted to an Index
#Index objects are immutable and therefore cannot be modified by the user
obj = pd.Series(range(3), index= ['one', 'two', 'three'])
Idx = obj.index
Idx

Index(['one', 'two', 'three'], dtype='object')

In [74]:
#In addition to being array-like, an Index also behaves like a fixed-size set:
#but unlike a Python set, a pandas Index can contain duplicate labels
frame3.columns

Index(['Nevada', 'Ohio', 'Pennsylvania'], dtype='object', name='State')

In [78]:
#the reindex method rearranges the data according to the new index, and any label not present before is filled with missing values (NaN)
#keep in mind that reindex returns a new DataFrame and leaves the original unmodified, so the result is assigned back to frame3 here
frame3 = frame3.reindex([2004, 2002, 2003, 2001, 2000])
frame3

State,Nevada,Ohio,Pennsylvania
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2004,3.0,7.8,5.0
2002,4.9,4.5,4.1
2003,1.6,1.5,1.8
2001,2.4,3.1,2.9
2000,,,


In [83]:
#method='ffill' fills the missing values introduced by reindexing by carrying the last preceding value forward
obj = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 5])
obj = obj.reindex(range(9), method = 'ffill')
obj

0      blue
1      blue
2    purple
3    purple
4    purple
5    yellow
6    yellow
7    yellow
8    yellow
dtype: object

In [87]:
#a 3x3 frame holding the integers 0-8, with string row labels and state names as columns
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=list('abf'),
    columns=['Ohio', 'Pennsylvania', 'New York'],
)
frame

Unnamed: 0,Ohio,Pennsylvania,New York
a,0,1,2
b,3,4,5
f,6,7,8


In [90]:
#reindexing the rows keeps 'a' and 'b', leaves out 'f' (not requested), and adds 'c' and 'd' filled with NaN
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Pennsylvania,New York
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,,,
d,,,


In [91]:
#reindex can also alter the columns, via the columns keyword
frame2= frame2.reindex(columns=['Ohio', 'Utah', 'New York'])
frame2

Unnamed: 0,Ohio,Utah,New York
a,0.0,,2.0
b,3.0,,5.0
c,,,
d,,,


In [96]:
#drop entries by label
#you can pass a single label or a list of labels to drop multiple entries at once
#drop returns a new DataFrame rather than modifying the original, so the result is assigned back
#both all-NaN rows are dropped in a single call: dropping only 'd' leaves row 'c' behind on a fresh run,
#which does not reproduce the a/b-only table recorded for this cell and the cells after it
frame2 = frame2.drop(['c', 'd'])
frame2

Unnamed: 0,Ohio,Utah,New York
a,0.0,,2.0
b,3.0,,5.0


In [98]:
#you can also drop a column
#to do so you must specify the column axis (axis=1)
frame2 = frame2.drop('Utah', axis=1)
frame2

Unnamed: 0,Ohio,New York
a,0.0,2.0
b,3.0,5.0


In [99]:
#you can also drop an entry in place, without assigning the result back, by passing inplace=True
frame2.drop('New York', axis=1, inplace=True)
frame2

Unnamed: 0,Ohio
a,0.0
b,3.0


In [101]:
#index labels can be used for slicing too; unlike positional slicing, the end label is included in the result
obj = pd.Series(range(6), index=list('abdfce'))
obj['b':'c']

b    1
d    2
f    3
c    4
dtype: int64

In [104]:
#Selection with loc (label-based) and iloc (integer-position-based)
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [105]:
#Select a row by label
data.loc['Colorado']

one      4
two      5
three    6
four     7
Name: Colorado, dtype: int32

In [106]:
#Select specific columns from that row
data.loc['Colorado', ['two', 'four']]

two     5
four    7
Name: Colorado, dtype: int32

In [108]:
#Select the same row and columns by integer position with iloc
data.iloc[1, [1, 3]]

two     5
four    7
Name: Colorado, dtype: int32

In [110]:
#Slicing works with loc and iloc as well; with loc the end label ('Utah') is included
data.loc[:'Utah', ['two', 'four']]

Unnamed: 0,two,four
Ohio,1,3
Colorado,5,7
Utah,9,11


In [118]:
#Operations between a DataFrame and a Series
#the Series index is matched against the DataFrame's column labels, and the subtraction is applied to every row
obj = pd.Series([2, 5, 6, 9], index=['oneee', 'tfdg', 'asfd', 'fourr'])
obj2 = pd.DataFrame(
    {
        'oneee': [2, 4, 5, 6],
        'tfdg': [3, 5, 1, 4],
        'asfd': [12, 10, 4, 4],
        'fourr': range(4),
    },
    index=['one', 'two', 'three', 'four'],
)
obj2 - obj

Unnamed: 0,asfd,fourr,oneee,tfdg
one,6,-9,0,-2
two,4,-8,2,0
three,-2,-7,3,-4
four,-2,-6,4,-1


In [120]:
#Element-wise operations on a DataFrame (abs is applied to every value)
#the sample is drawn from a seeded generator so that the frame, and every result derived from it in later cells, is reproducible across kernel restarts
rng = np.random.RandomState(0)
frame = pd.DataFrame(rng.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame.abs()

Unnamed: 0,b,d,e
Utah,0.774401,1.555731,2.176036
Ohio,1.007941,0.274884,0.815863
Texas,0.708354,0.275971,1.159676
Oregon,1.040725,1.160751,1.283872


In [122]:
#apply calls a function on each column of the frame and collects the results in a Series
def f(x):
    """Return the spread (maximum minus minimum) of the values in x."""
    return x.max() - x.min()

frame.apply(f)

b    0.332371
d    2.716481
e    3.459908
dtype: float64

In [123]:
#sort_index sorts a DataFrame by its row labels (alphabetically here) and returns a new, sorted object
frame.sort_index()

Unnamed: 0,b,d,e
Ohio,1.007941,-0.274884,-0.815863
Oregon,1.040725,-1.160751,1.283872
Texas,0.708354,0.275971,1.159676
Utah,0.774401,1.555731,-2.176036


In [124]:
#sort_index with axis=1 sorts by the column labels instead
frame.sort_index(axis = 1)

Unnamed: 0,b,d,e
Utah,0.774401,1.555731,-2.176036
Ohio,1.007941,-0.274884,-0.815863
Texas,0.708354,0.275971,1.159676
Oregon,1.040725,-1.160751,1.283872


In [125]:
#pass ascending=False to sort in descending order
frame.sort_index(axis =1, ascending=False)

Unnamed: 0,e,d,b
Utah,-2.176036,1.555731,0.774401
Ohio,-0.815863,-0.274884,1.007941
Texas,1.159676,0.275971,0.708354
Oregon,1.283872,-1.160751,1.040725


In [128]:
#sort by values (by choosing a particular column as the sort key)
#remember: by default, missing values (NaN) are placed at the end of the sorted sequence whether you sort ascending or descending
frame.sort_values(by='d')

Unnamed: 0,b,d,e
Oregon,1.040725,-1.160751,1.283872
Ohio,1.007941,-0.274884,-0.815863
Texas,0.708354,0.275971,1.159676
Utah,0.774401,1.555731,-2.176036


In [129]:
#you can also sort by multiple columns: rows are ordered by the first column, and the second is used only to break ties in the first
frame.sort_values(by=['b', 'e'])

Unnamed: 0,b,d,e
Texas,0.708354,0.275971,1.159676
Utah,0.774401,1.555731,-2.176036
Ohio,1.007941,-0.274884,-0.815863
Oregon,1.040725,-1.160751,1.283872


In [131]:
#rank assigns ranks from 1 up to the number of valid data points in an array
#Series and DataFrame both have a rank method; by default rank breaks ties by assigning each tied group the mean of its ranks:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
#notice that the 7s and the 4s are tied, so each gets the mean rank of its group (6.5 and 4.5)

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [135]:
#Ranks can also be assigned according to the order in which the values are observed in the data:
obj.rank(method='first')
#the 7 at label 0 precedes the 7 at label 2 in this series, so label 0 gets rank 6.0 and label 2 gets rank 7.0

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64