[View in Colaboratory](https://colab.research.google.com/github/sungreong/TIL/blob/master/pandas_example.ipynb)

# Refer
http://nbviewer.jupyter.org/github/pydata/pydata-book/blob/2nd-edition/ch05.ipynb#Indexing,-Selection,-and-Filtering

In [0]:

import pandas as pd
from pandas import Series , DataFrame
import numpy as np
import matplotlib.pyplot as plt
plt.rc("figure", figsize = (10, 6 ))
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.set_printoptions(precision=4, suppress=True)

# Series

In [2]:
obj = pd.Series( [ 4, 8 , -5 , 3 ])
obj

0    4
1    8
2   -5
3    3
dtype: int64

In [9]:
print( "'values'" , obj.values , "\n'index'" , obj.index)


'values' [ 4  8 -5  3] 
'index' RangeIndex(start=0, stop=4, step=1)


In [12]:
obj2 = pd.Series( [ 4 , 1, 2, 3] , index = ["d","b","c","a"])
obj2.index

Index(['d', 'b', 'c', 'a'], dtype='object')

In [25]:
obj2["a"]
obj2["d"] = 6
obj2[ ["c", "a", "d"]]

c    2
a    3
d    6
dtype: int64

In [29]:
obj2[ obj2 > 1  ]
obj2*2
np.exp( obj2 )

d    403.428793
b      2.718282
c      7.389056
a     20.085537
dtype: float64

In [33]:
## index select 체크 
"b" in obj2 
"e" in obj2
print( "b" in obj2 , "\n" , "e" in obj2)

True 
 False


In [46]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series( sdata )
obj3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [47]:
## index를 사용한 select 

states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [48]:

# null 값 check 
pd.isnull(obj4)
obj.isnull()

# notnull 값 체크 
pd.notnull(obj4)
obj4.notnull()


California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [49]:
## 같은 인덱스끼리 더해지는 것을 확인 할 수 있음

obj3 + obj4


California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [56]:
# name 지정해주기 
obj4.name ="population"
obj4.index.name="state"
print(obj4 ,"\n")
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
print(obj)

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64 

Bob      4
Steve    8
Jeff    -5
Ryan     3
dtype: int64


## DataFrame

In [0]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)

In [78]:
frame.head(3)
frame.describe()
frame.count()
frame.columns
frame.index
frame.values
frame.dtypes



pop      float64
state     object
year       int64
dtype: object

In [75]:
## columns 순서 바꾸기 
pd.DataFrame( data , columns =["year", "state","pop"])

Unnamed: 0,2,3,1


In [76]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four',
                             'five', 'six'])

frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [92]:
## column select method 
frame2["state"]
frame2.state 
frame2.loc[:, "state"]



all equal 


In [97]:
## row select method
frame2.loc["three" ]
frame2.loc["three" , ]


year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [112]:
## column create
frame2["debt"] = 16.5
frame2["debt"] = np.arange(6.)
## 2 
val = pd.Series( [ -1.2 , -1.5, 1.6] , index=["two", "four", "five"])
frame2["debt2"]= val
## 3 
frame2["eastern"] = frame2.state == "Ohio"
frame2

Unnamed: 0,year,state,pop,debt,debt2,eastern
one,2000,Ohio,1.5,0.0,,True
two,2001,Ohio,1.7,1.0,-1.2,True
three,2002,Ohio,3.6,2.0,,True
four,2001,Nevada,2.4,3.0,-1.5,False
five,2002,Nevada,2.9,4.0,1.6,False
six,2003,Nevada,3.2,5.0,,False


In [113]:
## remove column
del frame2["eastern"]
frame2.columns


AttributeError: ignored

In [119]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

frame3 = pd.DataFrame(pop)
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [117]:
## index를 이용한 select 및 줄 추가 
pd.DataFrame(pop, index=[2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [130]:
## same select
frame3["Ohio"][:-1] # 3개중 마지막꺼 빼고 뽑기
a= frame3["Ohio"][:2] # 3개중 2개 뽑기 
print(a)
a.dtypes
a.index

2000    1.5
2001    1.7
Name: Ohio, dtype: float64


Int64Index([2000, 2001], dtype='int64')

In [124]:
##  기존에 데이터를 활용하여 새로운 데이터 프레임 만들기
pdata = {'Ohio': frame3['Ohio'][:-1],
         'Nevada': frame3['Nevada'][:2]}
pd.DataFrame(pdata)

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7


In [142]:
## index columns name 지정하기 
frame3.index.name ="year" 
frame3.columns.name = "state"
frame3
print("index", frame3.index ,"\ncolumns" , frame3.columns , "\nvalues\n" , frame3.values)



index Int64Index([2000, 2001, 2002], dtype='int64', name='year') 
columns Index(['Nevada', 'Ohio'], dtype='object', name='state') 
values
 [[nan 1.5]
 [2.4 1.7]
 [2.9 3.6]]


## Index Objects

In [144]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index
index[1:]


'b'

In [154]:
labels = pd.Index( np.arange(3))
print(labels)
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
obj2
obj2.index is labels

Int64Index([0, 1, 2], dtype='int64')


True

In [152]:
print(frame3)
'Ohio' in frame3.columns
2003 in frame3.index

state  Nevada  Ohio
year               
2000      NaN   1.5
2001      2.4   1.7
2002      2.9   3.6


False

In [153]:
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

# Essential Functionality

## Reindexing

In [162]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
print(obj)
## reindex 
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
pd.Series( obj , index=['a', 'b', 'c', 'd', 'e'])

#pd.DataFrame( obj , index=['a', 'b', 'c', 'd', 'e'] )

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64


a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [165]:
obj3  = pd.Series( [ "blue", "purple", "yellow"] , index = [ 0, 2, 4])
obj3.reindex( range(8) , method="ffill")

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
6    yellow
7    yellow
dtype: object

In [169]:
## reindex row
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
print(frame2)
frame.reindex(['a', 'b', 'c', 'd'], method="ffill")

   Ohio  Texas  California
a   0.0    1.0         2.0
b   NaN    NaN         NaN
c   3.0    4.0         5.0
d   6.0    7.0         8.0


Unnamed: 0,Ohio,Texas,California
a,0,1,2
b,0,1,2
c,3,4,5
d,6,7,8


In [170]:
## reindex columns 
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [172]:
##  select row column and create 
frame.loc[ ['b', 'a', 'c', 'd'] , states]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


Unnamed: 0,Texas,Utah,California
b,,,
a,1.0,,2.0
c,4.0,,5.0
d,7.0,,8.0


## Dropping Entries from an Axis

In [185]:
## row drop
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop("c")
new_obj
obj.drop( [ "d", "c" ])
print(obj)
## inplace 를 활용해 영구적으로 떨구기 
obj.drop("c", inplace= True)
print(obj)

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64


In [186]:
## DataFrame row  drop

data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

## index를 활용해  drop
data.drop(['Colorado', 'Ohio'] ,axis= 0)
data.drop(['Colorado', 'Ohio'] ,axis= "index")

## DataFrame column  drop / axis = 1 or "columns"
data.drop("two" , axis = 1 )
data.drop(["two", "four"] , axis = "columns")



Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


## Indexing, Selection, and Filtering

In [203]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
## same
obj['b']
obj[2]

## 2:4
obj[2:4]
obj["b":"d"]
obj[[False , True, True, True]] 

## 특정 select  및 위치 
obj[["d","a","b"]]
obj[[3 , 0 , 1 ]]



## 특정 조건 만족하는 것만 뽑기  bool 로 뽑기 
obj[ obj < 2 ]

b    1.0
c    2.0
d    3.0
dtype: float64

In [206]:
## 특정 select 값 바꾸기 
obj["b":"d"] = 5 
obj

a    0.0
b    5.0
c    5.0
d    5.0
dtype: float64

In [219]:
## DataFrame  prefer .iloc and .loc 

data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

data["two"]
data[["two","three"]]
data.loc[:, ["two", "three"]]
data.iloc[:, 2:4 ]
data.iloc[:, [2,3] ]
data.iloc[:, 1:-1 ]
data.iloc[ [1,2], 1:-1]


Unnamed: 0,two,three
Colorado,5,6
Utah,9,10


In [239]:
data[0:2]
data[ :2]

## index condition select 

print(data["three"] > 7 ) 
data[data["three"] > 7]



Ohio        False
Colorado    False
Utah         True
New York     True
Name: three, dtype: bool


Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [240]:

data < 5
data[data < 5] = 0
data


Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


## Select with loc and iloc

In [241]:
data.loc['Colorado', ['two', 'three']]
data.iloc[2, [3, 0, 1]]
data.iloc[2]
data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [249]:
data.loc[ :'Utah', "two"]

## data.three 로 index select도 만족하면서 iloc 가 
print(data.iloc[ : , :3])
data.iloc[ : , :3][data.three > 6]

          one  two  three
Ohio        0    0      0
Colorado    0    5      6
Utah        8    9     10
New York   12   13     14


Unnamed: 0,one,two,three
Utah,8,9,10
New York,12,13,14


## Integer Indexes

In [250]:
ser = pd.Series(np.arange(3.))
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
ser2[-1]

2.0

In [255]:
ser[:1]
ser.loc[:1]
ser.loc[:"b"]
ser.iloc[:1]


0    0.0
dtype: float64

## Arithmetic and Data Alignment

In [258]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
               index=['a', 'c', 'e', 'f', 'g'])

## index matching 
s1+s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [262]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])

print(df1)
print(df2)
## column 과 index 매칭 후 +
df1+df2

            b    c    d
Ohio      0.0  1.0  2.0
Texas     3.0  4.0  5.0
Colorado  6.0  7.0  8.0
          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0


Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [265]:
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})
print(df1)
print(df2)
## index 는 맞지만 columns 이 다르므로 NaN
df1 - df2

   A
0  1
1  2
   B
0  3
1  4


Unnamed: 0,A,B
0,,
1,,


## Arithmetic methods with fill values

In [268]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                   columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                   columns=list('abcde'))

df2.loc[1, "b"] = np.nan
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [270]:
df1.add(df2 , fill_value = 0 )
df2.add(df1, fill_value= 0 )

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [272]:
## same 
1/ df1
df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [275]:
print(df2.columns)
print(df1)
df1.reindex( columns = df2.columns , fill_value = 100)

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0


Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,100
1,4.0,5.0,6.0,7.0,100
2,8.0,9.0,10.0,11.0,100


## Operations between DataFrame and Series

In [286]:
arr = np.arange( 12. ).reshape(( 3,4))

print(arr)
print(arr[: ,1])
print(arr[:, [1]])
arr[:, 1:3]

[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]]
[1. 5. 9.]
[[1.]
 [5.]
 [9.]]


array([[ 1.,  2.],
       [ 5.,  6.],
       [ 9., 10.]])

In [287]:
## 열별로 계산 됨 
print(arr[0])
arr-arr[0]

[0. 1. 2. 3.]


array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [297]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])

frame.iloc[0]
frame.loc["Utah"]

series = frame.iloc[0]
print(series.index)
series


Index(['b', 'd', 'e'], dtype='object')


b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [295]:
print(frame)
frame - series

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0


Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [296]:
## index eh 
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
frame + series2


Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [301]:
series3 =  frame["d"]
print(series3)
print(series3.index)
frame.sub( series3 , axis = "index")

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64
Index(['Utah', 'Ohio', 'Texas', 'Oregon'], dtype='object')


Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


In [302]:
## Function Application and Mapping
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
np.abs(frame)


Unnamed: 0,b,d,e
Utah,0.822751,0.308537,0.707284
Ohio,0.091926,0.525274,0.938485
Texas,1.877533,0.623135,1.054589
Oregon,0.971341,0.712499,0.931871


In [306]:
print(frame)
sub = lambda x: x.max() - x.min()
## 행별로 defualt 행별로 
frame.apply(sub)
frame.apply(sub , axis = "index")
## 열별로 
frame.apply(sub , axis="columns")

               b         d         e
Utah    0.822751 -0.308537  0.707284
Ohio    0.091926 -0.525274 -0.938485
Texas  -1.877533  0.623135  1.054589
Oregon  0.971341 -0.712499 -0.931871


Utah      1.131288
Ohio      1.030410
Texas     2.932122
Oregon    1.903212
dtype: float64

In [312]:
def sub(x) : 
    return x.max() - x.min()

def f(x) : 
    return pd.Series( [ x.min() , x.max() , sub(x) ], index = ["min", "max", "sub"])
frame.apply(f)


Unnamed: 0,b,d,e
min,-1.877533,-0.712499,-0.938485
max,0.971341,0.623135,1.054589
sub,2.848875,1.335634,1.993073


In [313]:
frame.apply(f , axis="columns")

Unnamed: 0,min,max,sub
Utah,-0.308537,0.822751,1.131288
Ohio,-0.938485,0.091926,1.03041
Texas,-1.877533,1.054589,2.932122
Oregon,-0.931871,0.971341,1.903212


In [314]:
format = lambda x: "%.2f" % x 
frame.applymap( format )

Unnamed: 0,b,d,e
Utah,0.82,-0.31,0.71
Ohio,0.09,-0.53,-0.94
Texas,-1.88,0.62,1.05
Oregon,0.97,-0.71,-0.93


In [315]:
frame["e"].map(format)

Utah       0.71
Ohio      -0.94
Texas      1.05
Oregon    -0.93
Name: e, dtype: object

## Sorting and Ranking

In [316]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [318]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
## index sort 
frame.sort_index()


Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [319]:
## column sort
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [320]:
## 내림차순
frame.sort_index( axis = 1 , ascending= False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [325]:
## value sorting 
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
print( obj.sort_values( ascending= False ) )
print( obj.sort_values(  ) )

2    7.0
0    4.0
5    2.0
4   -3.0
1    NaN
3    NaN
dtype: float64
4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64


In [327]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame.sort_values( by ="b")


Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7


In [329]:
## by의 순서에 따라 sorting 순서가 바뀜
frame.sort_values( by =["a","b"])

frame.sort_values( by =["b","a"])

Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7




---



In [334]:
## 순위로 따지기  7이 두개가 있으니 .5 default method = "mean" 이라서 
obj = pd.Series([7, -5, 7, 4, 2, 0, 4  ])
obj.rank()


0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [335]:
## 먼저 나오면  순위가 빠른 것으로 
obj.rank(method="first")

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [338]:
# Assign tie values the maximum rank in the group
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [345]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
print(frame)
print(" columns RANK")
frame.rank(axis='columns')


   a    b    c
0  0  4.3 -2.0
1  1  7.0  5.0
2  0 -3.0  8.0
3  1  2.0 -2.5
 columns RANK


Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,1.0,3.0,2.0
2,2.0,1.0,3.0
3,2.0,3.0,1.0


In [346]:
print("rows RANK")
frame.rank()

rows RANK


Unnamed: 0,a,b,c
0,1.5,3.0,2.0
1,3.5,4.0,3.0
2,1.5,1.0,4.0
3,3.5,2.0,1.0


## Axis Indexes with Duplicate Labels

In [350]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj.index.is_unique


False

In [351]:
obj["a"]

a    0
a    1
dtype: int64

In [352]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df
df.loc['b']

Unnamed: 0,0,1,2
b,1.539375,-1.764758,-0.636085
b,1.1448,-0.770633,0.209191


## Summarizing and Computing Descriptive Statistics

In [353]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [358]:
## columns 별로
df.sum()


a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [0]:
## 축이 열
df.sum(axis='columns')

In [359]:
df.mean( axis = "columns" , skipna= False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [363]:
print(df)
## 
df.idxmax( axis = 0 )

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3


one    b
two    d
dtype: object

In [364]:
df.idxmax( axis = 1 )
df.idxmax( axis = "columns")

a    one
b    one
c    NaN
d    one
dtype: object

In [365]:
df.cumsum()
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [368]:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [370]:
!pip install pandas-datareader


Collecting pandas-datareader
[?25l  Downloading https://files.pythonhosted.org/packages/cc/5c/ea5b6dcfd0f55c5fb1e37fb45335ec01cceca199b8a79339137f5ed269e0/pandas_datareader-0.7.0-py2.py3-none-any.whl (111kB)
[K    100% |████████████████████████████████| 112kB 4.1MB/s 
[?25hCollecting lxml (from pandas-datareader)
[?25l  Downloading https://files.pythonhosted.org/packages/03/a4/9eea8035fc7c7670e5eab97f34ff2ef0ddd78a491bf96df5accedb0e63f5/lxml-4.2.5-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K    100% |████████████████████████████████| 5.8MB 931kB/s 
Installing collected packages: lxml, pandas-datareader
Successfully installed lxml-4.2.5 pandas-datareader-0.7.0


In [379]:
cd sample_data


/content/sample_data


In [0]:
price = pd.read_csv('./california_housing_train.csv')

In [392]:
returns = price.pct_change()
returns.tail()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
16995,8e-05,0.007448,0.625,0.55035,-0.059666,1.089862,0.973262,0.213936,0.463863
16996,8e-05,0.002711,-0.307692,0.05954,0.340102,0.316428,0.260163,0.068219,-0.290844
16997,0.000241,0.028262,-0.527778,0.139634,0.005682,0.041876,-0.019355,0.2039,0.311392
16998,0.0,-0.000956,0.117647,-0.001868,0.039548,0.043408,0.048246,-0.346914,-0.171815
16999,0.000402,-0.030144,1.736842,-0.318862,-0.456522,-0.379045,-0.435146,0.522806,0.102564


In [394]:
returns["total_rooms"].corr(returns["population"])
returns.total_rooms.corr(returns.population)

0.5847652192498696

In [396]:
returns.corr()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
longitude,1.0,0.360663,-0.003579,0.003396,0.00376,0.012562,0.005949,-0.027941,-0.068158
latitude,0.360663,1.0,-0.028908,-0.011818,-0.008066,-0.029664,-0.017955,-0.13797,-0.290282
housing_median_age,-0.003579,-0.028908,1.0,0.005127,-0.018173,-0.007534,-0.022258,-0.027673,-0.029959
total_rooms,0.003396,-0.011818,0.005127,1.0,0.686381,0.584765,0.640172,0.073666,0.027961
total_bedrooms,0.00376,-0.008066,-0.018173,0.686381,1.0,0.799048,0.91257,0.046859,0.016233
population,0.012562,-0.029664,-0.007534,0.584765,0.799048,1.0,0.817101,0.090746,0.020313
households,0.005949,-0.017955,-0.022258,0.640172,0.91257,0.817101,1.0,0.066049,0.02588
median_income,-0.027941,-0.13797,-0.027673,0.073666,0.046859,0.090746,0.066049,1.0,0.458393
median_house_value,-0.068158,-0.290282,-0.029959,0.027961,0.016233,0.020313,0.02588,0.458393,1.0


In [398]:
returns.cov()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
longitude,1.167265e-09,1.990482e-07,-1.238886e-07,2e-06,1e-06,3e-06,1e-06,-5.572255e-07,-2e-06
latitude,1.990482e-07,0.0002609412,-0.0004731202,-0.002534,-0.001063,-0.003675,-0.002073,-0.001300961,-0.003081
housing_median_age,-1.238886e-07,-0.0004731202,1.0265,0.068952,-0.150248,-0.058536,-0.161135,-0.0163663,-0.019942
total_rooms,1.54013e-06,-0.002534125,0.06895225,176.202483,74.347421,59.526721,60.720273,0.5707988,0.243848
total_bedrooms,1.048323e-06,-0.001063209,-0.1502484,74.347421,66.586968,50.002531,53.209815,0.2232038,0.087026
population,3.291227e-06,-0.003674784,-0.05853619,59.526721,50.002531,58.809682,44.774574,0.4062226,0.102342
households,1.452214e-06,-0.00207252,-0.1611354,60.720273,53.209815,44.774574,51.0578,0.2754894,0.121493
median_income,-5.572255e-07,-0.001300961,-0.0163663,0.570799,0.223204,0.406223,0.275489,0.3407375,0.175793
median_house_value,-1.529869e-06,-0.003080674,-0.01994184,0.243848,0.087026,0.102342,0.121493,0.1757929,0.431626


In [399]:
returns.corrwith( returns.population)


longitude             0.012562
latitude             -0.029664
housing_median_age   -0.007534
total_rooms           0.584765
total_bedrooms        0.799048
population            1.000000
households            0.817101
median_income         0.090746
median_house_value    0.020313
dtype: float64

## Unique Values, Value Counts, and Membership

In [0]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()

In [403]:
obj.value_counts()


c    3
a    3
b    2
d    1
dtype: int64

In [404]:
pd.value_counts( obj.values , sort= False )

a    3
c    3
d    1
b    2
dtype: int64

In [407]:
print(obj)
mask = obj.isin( ["b","c"])
obj[mask]

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object


0    c
5    b
6    b
7    c
8    c
dtype: object

In [408]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2])

In [412]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4,10],
                     'Qu2': [2, 3, 1, 2, 3, 5],
                     'Qu3': [1, 5, 2, 4, 4, 7]})
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4
5,10,5,7


In [415]:
result = data.apply( pd.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,1.0,1.0
7,0.0,0.0,1.0
10,1.0,0.0,0.0


Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,1000.0,2.0,1.0
3,2.0,2.0,1000.0
4,2.0,1000.0,2.0
5,1000.0,1000.0,1.0
