# 索引和复合索引

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Relative path to the Starbucks store CSV; resolved against the current working directory.
file_path = './starbucks_store_worldwide.csv'

In [3]:
# Load the store table into a DataFrame and preview its first row.
df = pd.read_csv(file_path)
print(df.head(1))

       Brand  Store Number     Store Name Ownership Type     Street Address  \
0  Starbucks  47370-257954  Meritxell, 96       Licensed  Av. Meritxell, 96   

               City State/Province Country Postcode Phone Number  \
0  Andorra la Vella              7      AD    AD500    376818720   

                  Timezone  Longitude  Latitude  
0  GMT+1:00 Europe/Andorra       1.53     42.51  


In [7]:
# Count non-null 'Brand' entries per (Country, State/Province) group.
# Grouping by two keys gives the result a two-level MultiIndex.
# Equivalent spellings that produce the same frame:
#   df[['Brand']].groupby(by=[df['Country'], df['State/Province']]).count()
#   df.groupby(by=['Country', 'State/Province']).count()[['Brand']]
grouped1 = df.groupby(by=['Country', 'State/Province'])[['Brand']].count()

## 索引的方法和属性

####  .index 获取索引

In [8]:
# .index returns the row labels; for this two-key groupby result it is a MultiIndex.
print(grouped1.index)

MultiIndex([('AD',  '7'),
            ('AE', 'AJ'),
            ('AE', 'AZ'),
            ('AE', 'DU'),
            ('AE', 'FU'),
            ('AE', 'RK'),
            ('AE', 'SH'),
            ('AE', 'UQ'),
            ('AR',  'B'),
            ('AR',  'C'),
            ...
            ('US', 'UT'),
            ('US', 'VA'),
            ('US', 'VT'),
            ('US', 'WA'),
            ('US', 'WI'),
            ('US', 'WV'),
            ('US', 'WY'),
            ('VN', 'HN'),
            ('VN', 'SG'),
            ('ZA', 'GT')],
           names=['Country', 'State/Province'], length=545)


In [10]:
# 2x4 frame filled with 1.0: rows labelled 'A' and 'B', columns 'a' through 'd'.
df1 = pd.DataFrame(
    data=np.ones(shape=(2, 4)),
    index=list('AB'),
    columns=['a', 'b', 'c', 'd'],
)
print(df1)

     a    b    c    d
A  1.0  1.0  1.0  1.0
B  1.0  1.0  1.0  1.0


In [11]:
# For a frame with a single level of row labels, .index is a plain Index.
print(df1.index)

Index(['A', 'B'], dtype='object')


#### .index=['x','y'] 指定索引

In [12]:
# Assigning to .index relabels the rows of df1 in place
# (the new list must supply exactly one label per row).
df1.index = ['a','b']
print(df1)

     a    b    c    d
a  1.0  1.0  1.0  1.0
b  1.0  1.0  1.0  1.0


#### .reindex([...]) 按给定标签重建索引

In [17]:
df2 = df1.reindex(['a','f'])  # select rows of df1 by label; labels absent from df1 yield all-NaN rows
print(df2)

     a    b    c    d
a  1.0  1.0  1.0  1.0
f  NaN  NaN  NaN  NaN


#### .set_index() 将某一列作为索引

In [19]:
# set_index('a') returns a new frame whose index is column 'a'; by default the
# column is removed from the data, and duplicate index values are allowed.
df3 = df1.set_index('a')
print(df3)
print(df3.index)

       b    c    d
a                 
1.0  1.0  1.0  1.0
1.0  1.0  1.0  1.0
Float64Index([1.0, 1.0], dtype='float64', name='a')


In [21]:
df3 = df1.set_index('a',drop=False) # drop=False keeps 'a' as a regular column in addition to the index
print(df3)

       a    b    c    d
a                      
1.0  1.0  1.0  1.0  1.0
1.0  1.0  1.0  1.0  1.0


#### .index.unique() 返回索引的唯一值

In [23]:
# index.unique() returns an Index holding each distinct label once.
# NOTE: despite its name, df4 is an Index object, not a DataFrame.
df4 = df3.index.unique()
print(df4)

Float64Index([1.0], dtype='float64', name='a')


In [27]:
# .loc[row_label, column_label] assigns a single cell: row 'a', column 'a'.
# This makes column 'a' non-constant so the MultiIndex built next has distinct entries.
df1.loc['a','a'] = 100.0
print(df1)

       a    b    c    d
a  100.0  1.0  1.0  1.0
b    1.0  1.0  1.0  1.0


In [28]:
# Passing a list of columns to set_index builds a MultiIndex with one level
# per column, in the order given ('a' outer, 'b' inner).
df5 = df1.set_index(['a','b'])
print(df5)
print(df5.index)

             c    d
a     b            
100.0 1.0  1.0  1.0
1.0   1.0  1.0  1.0
MultiIndex([(100.0, 1.0),
            (  1.0, 1.0)],
           names=['a', 'b'])


### 总结：索引是一个可迭代对象，可以用len求其长度，也可以用list()将其转换为列表等

In [30]:
# Small demo frame: two integer columns ('a' ascending, 'b' descending) and
# two label columns ('c', 'd') that later cells promote to index levels.
a = pd.DataFrame(
    {
        'a': range(7),
        'b': range(7, 0, -1),
        'c': ['one'] * 3 + ['two'] * 4,
        'd': list('hjklmno'),
    }
)

In [31]:
# Display the demo frame before any index manipulation.
print(a)

   a  b    c  d
0  0  7  one  h
1  1  6  one  j
2  2  5  one  k
3  3  4  two  l
4  4  3  two  m
5  5  2  two  n
6  6  1  two  o


In [47]:
# Promote columns 'c' (outer level) and 'd' (inner level) to a MultiIndex;
# the result is still a DataFrame.
b = a.set_index(['c','d'])
print(b)
print(type(b))

       a  b
c   d      
one h  0  7
    j  1  6
    k  2  5
two l  3  4
    m  4  3
    n  5  2
    o  6  1
<class 'pandas.core.frame.DataFrame'>


In [48]:
c = b['a']  # selecting a single column yields a Series that keeps the MultiIndex
print(c)
print(type(c))

c    d
one  h    0
     j    1
     k    2
two  l    3
     m    4
     n    5
     o    6
Name: a, dtype: int64
<class 'pandas.core.series.Series'>


In [37]:
# Indexing a MultiIndex Series by an outer-level label returns the sub-Series
# indexed by the remaining (inner) level.
print(c['one'])

d
h    0
j    1
k    2
Name: a, dtype: int64


In [36]:
print(c['one']['j'])  # equivalent: c['one', 'j']

1


In [38]:
# Same data as c, but with the level order reversed: 'd' is outer, 'c' is inner.
d = a.set_index(['d','c'])['a']
print(d)
# Question: 'one' now lives in the inner level, so how do we select its rows?

d  c  
h  one    0
j  one    1
k  one    2
l  two    3
m  two    4
n  two    5
o  two    6
Name: a, dtype: int64


In [45]:
print(d.swaplevel()['one'])  # swaplevel() swaps the inner and outer index levels, making 'c' the outer one

d
h    0
j    1
k    2
Name: a, dtype: int64


In [54]:
print(b)  # b is a DataFrame, so rows are selected with .loc; a Series can be indexed directly with []
print('*'*20)
# First .loc picks the outer label 'one', second picks the inner label 'h'.
# b.loc[('one', 'h')] selects the same row in a single lookup.
print(b.loc['one'].loc['h'])

       a  b
c   d      
one h  0  7
    j  1  6
    k  2  5
two l  3  4
    m  4  3
    n  5  2
    o  6  1
********************
a    0
b    7
Name: h, dtype: int64


In [53]:
# To select every row whose inner-level label is 'h', swap the levels first
# so that 'd' becomes the outer level, then look up 'h' with .loc.
print(b.swaplevel().loc['h'])

     a  b
c        
one  0  7
