In [1]:
import numpy as np
import pandas as pd

# Лекция 4
## Мультииндекс

Для размерности данных больше 2 можно использовать иерархическую индексацию или мультииндекс. В этом случае в один индекс включается несколько уровней.

Без использования мультииндекса будет так:

In [2]:
index = [
    ('city_1', 2010),
    ('city_1', 2020),
    ('city_2', 2010),
    ('city_2', 2020),
    ('city_3', 2010),
    ('city_3', 2020),
]

population = [
    101,
    201,
    102,
    202,
    103,
    203
]

pop = pd.Series(population, index = index)
print(pop)
print(pop[  [i for i in pop.index if i[1] == 2020]  ])

(city_1, 2010)    101
(city_1, 2020)    201
(city_2, 2010)    102
(city_2, 2020)    202
(city_3, 2010)    103
(city_3, 2020)    203
dtype: int64
(city_1, 2020)    201
(city_2, 2020)    202
(city_3, 2020)    203
dtype: int64


Преобразуем в мультииндекс:

In [3]:
index = [
    ('city_1', 2010),
    ('city_1', 2020),
    ('city_2', 2010),
    ('city_2', 2020),
    ('city_3', 2010),
    ('city_3', 2020),
]

population = [
    101,
    201,
    102,
    202,
    103,
    203
]

pop = pd.Series(population, index = index)
print(pop)

index = pd.MultiIndex.from_tuples(index)
pop = pop.reindex(index)
print(pop)
print(pop[:, 2020])

(city_1, 2010)    101
(city_1, 2020)    201
(city_2, 2010)    102
(city_2, 2020)    202
(city_3, 2010)    103
(city_3, 2020)    203
dtype: int64
city_1  2010    101
        2020    201
city_2  2010    102
        2020    202
city_3  2010    103
        2020    203
dtype: int64
city_1    201
city_2    202
city_3    203
dtype: int64


Такой мультииндексный Series можно перевести в DataFrame и обратно с помощью .unstack и .stack:

In [4]:
index = [
    ('city_1', 2010),
    ('city_1', 2020),
    ('city_2', 2010),
    ('city_2', 2020),
    ('city_3', 2010),
    ('city_3', 2020),
]

population = [
    101,
    201,
    102,
    202,
    103,
    203
]

pop = pd.Series(population, index = index)
index = pd.MultiIndex.from_tuples(index)
pop = pop.reindex(index)

pop_df = pop.unstack()
print(pop_df)

print(pop_df.stack())

        2010  2020
city_1   101   201
city_2   102   202
city_3   103   203
city_1  2010    101
        2020    201
city_2  2010    102
        2020    202
city_3  2010    103
        2020    203
dtype: int64


Мультииндекс большей размерности:

In [27]:
index = [
    ('city_1', 2010, 1),
    ('city_1', 2010, 2),
    
    ('city_1', 2020, 1),
    ('city_1', 2020, 2),
    
    ('city_2', 2010, 1),
    ('city_2', 2010, 2),
    
    ('city_2', 2020, 1),
    ('city_2', 2020, 2),
    
    ('city_3', 2010, 1),
    ('city_3', 2010, 2),
    
    ('city_3', 2020, 1),
    ('city_3', 2020, 2)
]

population = [
    101,
    1010,
    201,
    2010,
    102,
    1020,
    202,
    2020,
    103,
    1030,
    203,
    2030
]

pop = pd.Series(population, index = index)
print(pop, '\n')

index = pd.MultiIndex.from_tuples(index)
pop = pop.reindex(index)
print(pop, '\n')
print(pop[:, 2010], '\n')
print(pop[:, :, 2], '\n')

(city_1, 2010, 1)     101
(city_1, 2010, 2)    1010
(city_1, 2020, 1)     201
(city_1, 2020, 2)    2010
(city_2, 2010, 1)     102
(city_2, 2010, 2)    1020
(city_2, 2020, 1)     202
(city_2, 2020, 2)    2020
(city_3, 2010, 1)     103
(city_3, 2010, 2)    1030
(city_3, 2020, 1)     203
(city_3, 2020, 2)    2030
dtype: int64 

city_1  2010  1     101
              2    1010
        2020  1     201
              2    2010
city_2  2010  1     102
              2    1020
        2020  1     202
              2    2020
city_3  2010  1     103
              2    1030
        2020  1     203
              2    2030
dtype: int64 

city_1  1     101
        2    1010
city_2  1     102
        2    1020
city_3  1     103
        2    1030
dtype: int64 

city_1  2010    1010
        2020    2010
city_2  2010    1020
        2020    2020
city_3  2010    1030
        2020    2030
dtype: int64 



Аналогичное превращение в DataFrame:

In [28]:
index = [
    ('city_1', 2010, 1),
    ('city_1', 2010, 2),
    
    ('city_1', 2020, 1),
    ('city_1', 2020, 2),
    
    ('city_2', 2010, 1),
    ('city_2', 2010, 2),
    
    ('city_2', 2020, 1),
    ('city_2', 2020, 2),
    
    ('city_3', 2010, 1),
    ('city_3', 2010, 2),
    
    ('city_3', 2020, 1),
    ('city_3', 2020, 2)
]

population = [
    101,
    1010,
    201,
    2010,
    102,
    1020,
    202,
    2020,
    103,
    1030,
    203,
    2030
]

index = pd.MultiIndex.from_tuples(index)
pop = pd.Series(population, index = index)

pop_df = pop.unstack()
print(pop_df, '\n')
print(pop_df.stack, '\n')

               1     2
city_1 2010  101  1010
       2020  201  2010
city_2 2010  102  1020
       2020  202  2020
city_3 2010  103  1030
       2020  203  2030 

<bound method DataFrame.stack of                1     2
city_1 2010  101  1010
       2020  201  2010
city_2 2010  102  1020
       2020  202  2020
city_3 2010  103  1030
       2020  203  2030> 



С помощью .loc и использования списка ключей, можно задавать различные комбинации ключей различных уровней. Если используется более одного ключа, то необходимо добавить ещё один вложенный список в аргумент .loc, даже если не происходит фильтрации по колонкам (см. последний пример)

In [32]:
index = [
    ('city_1', 2010),
    ('city_1', 2020),
    ('city_2', 2010),
    ('city_2', 2020),
    ('city_3', 2010),
    ('city_3', 2020),
]

population = [
    101,
    201,
    102,
    202,
    103,
    203
]

index = pd.MultiIndex.from_tuples(index)
pop = pd.Series(population, index = index)
pop_df = pd.DataFrame({
    'total' : pop,
    'something' : [
        10,
        11,
        12,
        13,
        14,
        15,
    ]
})

print(pop_df,'\n')

print(pop_df.loc['city_1', 'something'])
print(pop_df.loc[['city_1', 'city_3']],'\n')

             total  something
city_1 2010    101         10
       2020    201         11
city_2 2010    102         12
       2020    202         13
city_3 2010    103         14
       2020    203         15 

2010    10
2020    11
Name: something, dtype: int64
             total  something
city_1 2010    101         10
       2020    201         11
city_3 2010    103         14
       2020    203         15 



## Как создаются мультииндексы?
1. Список массивов, задающих значение на каждом уровне
2. Список кортежей, задающих значение индекса в каждой точке
3. Декартово произведенеие обычных индексов
4. Описание внутреннего представления: levels(список списков), codes(список списков меток)

In [33]:
# Список массивов (в т.ч. списков)
i1 = pd.MultiIndex.from_arrays([
    ['a', 'a', 'b', 'b'],
    [1, 2, 1, 2]
])

print(i1)

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )


In [34]:
# Список кортежей
i2 = pd.MultiIndex.from_tuples([
    ('a', 1),
    ('a', 1),
    ('b', 2),
    ('b', 2)
])

print(i2)

MultiIndex([('a', 1),
            ('a', 1),
            ('b', 2),
            ('b', 2)],
           )


In [10]:
# Декартово произведение (все возможные пары из вложенных массивов аргумента)
i3 = pd.MultiIndex.from_product([
    ['a', 'b'],
    [1, 2]
])

print(i3)

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )


In [11]:
# levels(список списков), codes(список списков меток)
i4 = pd.MultiIndex(
    levels=[
        ['a', 'b', 'c'],
        [1, 2]
    ],
    codes = [
        [0, 0, 1, 1, 2, 2], # a a b b c c
        [0, 1, 0, 1, 0, 1]  # 1 2 1 2 1 2
    ]
)

print(i4)

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2)],
           )


Уровням индексов можно задавать названия

In [12]:
data = {
    ('city_1', 2010) : 100,
    ('city_1', 2020) : 200,
    ('city_2', 2010) : 1001,
    ('city_2', 2020) : 2001
}

s = pd.Series(data)
print(s)

s.index.names = ['city', 'year']
print(s)

city_1  2010     100
        2020     200
city_2  2010    1001
        2020    2001
dtype: int64
city    year
city_1  2010     100
        2020     200
city_2  2010    1001
        2020    2001
dtype: int64


Названия уровней мультииндексов можно использовать также и в DataFrames. А ещё логику мультиндексов можно применить к columns, и уровням columns можно так же дать названия:

In [37]:
index = pd.MultiIndex.from_product([
    ['city_1', 'city_2'],
    [2010, 2020]
    ],
    names = ['city', 'year']
)

columns = pd.MultiIndex.from_product([
    ['person_1', 'person_2', 'person_3'],
    ['job_1', 'job_2']
    ],
    names = ['worker', 'job']
)

rng = np.random.default_rng(1)
data = rng.random((4,6))

data_df = pd.DataFrame(data, index=index, columns=columns)
print(data_df)

worker       person_1            person_2            person_3          
job             job_1     job_2     job_1     job_2     job_1     job_2
city   year                                                            
city_1 2010  0.511822  0.950464  0.144160  0.948649  0.311831  0.423326
       2020  0.827703  0.409199  0.549594  0.027559  0.753513  0.538143
city_2 2010  0.329732  0.788429  0.303195  0.453498  0.134042  0.403113
       2020  0.203455  0.262313  0.750365  0.280409  0.485191  0.980737


# Индексация и срезы (по мультииндексу)

In [14]:
data = {
    ('city_1', 2010) : 100,
    ('city_1', 2020) : 200,
    ('city_2', 2010) : 1001,
    ('city_2', 2020) : 2001,
    ('city_3', 2010) : 10001,
    ('city_3', 2020) : 20001
}

s = pd.Series(data)
s.index.names = ['city', 'year']

print(s['city_1', 2010])
print(s['city_1'])

100
year
2010    100
2020    200
dtype: int64


In [15]:
data = {
    ('city_1', 2010) : 100,
    ('city_1', 2020) : 200,
    ('city_2', 2010) : 1001,
    ('city_2', 2020) : 2001,
    ('city_3', 2010) : 10001,
    ('city_3', 2020) : 20001
}

s = pd.Series(data)
s.index.names = ['city', 'year']

print(s.loc['city_1':'city_2'])
print(s[:,2010])

city    year
city_1  2010     100
        2020     200
city_2  2010    1001
        2020    2001
dtype: int64
city
city_1      100
city_2     1001
city_3    10001
dtype: int64


In [16]:
data = {
    ('city_1', 2010) : 100,
    ('city_1', 2020) : 200,
    ('city_2', 2010) : 1001,
    ('city_2', 2020) : 2001,
    ('city_3', 2010) : 10001,
    ('city_3', 2020) : 20001
}

s = pd.Series(data)
s.index.names = ['city', 'year']

print(s[s > 2000])
print(s[['city_1','city_3']])

city    year
city_2  2020     2001
city_3  2010    10001
        2020    20001
dtype: int64
city    year
city_1  2010      100
        2020      200
city_3  2010    10001
        2020    20001
dtype: int64


In [17]:
index = pd.MultiIndex.from_product([
    ['city_1', 'city_2'],
    [2010, 2020]
    ],
    names = ['city', 'year']
)
print(index)

columns = pd.MultiIndex.from_product([
    ['person_1', 'person_2', 'person_3'],
    ['job_1', 'job_2']
    ],
    names = ['worker', 'job']
)
rng = np.random.default_rng(1)
data = rng.random((4,6))

data_df = pd.DataFrame(data, index=index, columns=columns)
print(data_df)

MultiIndex([('city_1', 2010),
            ('city_1', 2020),
            ('city_2', 2010),
            ('city_2', 2020)],
           names=['city', 'year'])
worker       person_1            person_2            person_3          
job             job_1     job_2     job_1     job_2     job_1     job_2
city   year                                                            
city_1 2010  0.511822  0.950464  0.144160  0.948649  0.311831  0.423326
       2020  0.827703  0.409199  0.549594  0.027559  0.753513  0.538143
city_2 2010  0.329732  0.788429  0.303195  0.453498  0.134042  0.403113
       2020  0.203455  0.262313  0.750365  0.280409  0.485191  0.980737


# Перегруппировка мультииндексов

In [18]:
rng = np.random.default_rng(1)

index = pd.MultiIndex.from_product([
    ['a', 'c', 'b'],
    [1,2]
])

data = pd.Series(rng.random(6), index=index)
data.index.names = ['char', 'int']

print(data)
#print(data['a':'b'])

data = data.sort_index()
print(data)
print(data['a':'b'])

char  int
a     1      0.511822
      2      0.950464
c     1      0.144160
      2      0.948649
b     1      0.311831
      2      0.423326
dtype: float64
char  int
a     1      0.511822
      2      0.950464
b     1      0.311831
      2      0.423326
c     1      0.144160
      2      0.948649
dtype: float64
char  int
a     1      0.511822
      2      0.950464
b     1      0.311831
      2      0.423326
dtype: float64


Можно перегруппировывать мультииндексы в желаемой последовательности:

In [19]:
index = [
    ('city_1', 2010, 1),
    ('city_1', 2010, 2),
    
    ('city_1', 2020, 1),
    ('city_1', 2020, 2),
    
    ('city_2', 2010, 1),
    ('city_2', 2010, 2),
    
    ('city_2', 2020, 1),
    ('city_2', 2020, 2),
    
    ('city_3', 2010, 1),
    ('city_3', 2010, 2),
    
    ('city_3', 2020, 1),
    ('city_3', 2020, 2)
]

population = [
    101,
    1010,
    201,
    2010,
    102,
    1020,
    202,
    2020,
    103,
    1030,
    203,
    2030
]

pop = pd.Series(population, index=index)

i = pd.MultiIndex.from_tuples(index)
pop = pop.reindex(i)

print(pop)
print(pop.unstack())
print(pop.unstack(level=0))
print(pop.unstack(level=1))
print(pop.unstack(level=2))

city_1  2010  1     101
              2    1010
        2020  1     201
              2    2010
city_2  2010  1     102
              2    1020
        2020  1     202
              2    2020
city_3  2010  1     103
              2    1030
        2020  1     203
              2    2030
dtype: int64
               1     2
city_1 2010  101  1010
       2020  201  2010
city_2 2010  102  1020
       2020  202  2020
city_3 2010  103  1030
       2020  203  2030
        city_1  city_2  city_3
2010 1     101     102     103
     2    1010    1020    1030
2020 1     201     202     203
     2    2010    2020    2030
          2010  2020
city_1 1   101   201
       2  1010  2010
city_2 1   102   202
       2  1020  2020
city_3 1   103   203
       2  1030  2030
               1     2
city_1 2010  101  1010
       2020  201  2010
city_2 2010  102  1020
       2020  202  2020
city_3 2010  103  1030
       2020  203  2030


# NumPy конкатенация

In [20]:
x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]

print(np.concatenate([x, y, z]))

[1 2 3 4 5 6 7 8 9]


In [21]:
x = [[1, 2, 3]]
y = [[4, 5, 6]]
z = [[7, 8, 9]]

print(np.concatenate([x, y, z]))
print(np.concatenate([x, y, z], axis=1))
print(np.concatenate([x, y, z], axis=0))

[[1 2 3]
 [4 5 6]
 [7 8 9]]
[[1 2 3 4 5 6 7 8 9]]
[[1 2 3]
 [4 5 6]
 [7 8 9]]


# Pandas - конкатенация

In [22]:
ser1 = pd.Series(['a', 'b', 'c'], index=[1,2,3])
ser2 = pd.Series(['d', 'e', 'f'], index=[4,5,6])

print(pd.concat([ser1,ser2]))

1    a
2    b
3    c
4    d
5    e
6    f
dtype: object


In [23]:
ser1 = pd.Series(['a', 'b', 'c'], index=[1,2,3])
ser2 = pd.Series(['d', 'e', 'f'], index=[1,2,6])

print(pd.concat([ser1,ser2]))

1    a
2    b
3    c
1    d
2    e
6    f
dtype: object


In [24]:
ser1 = pd.Series(['a', 'b', 'c'], index=[1,2,3])
ser2 = pd.Series(['d', 'e', 'f'], index=[1,2,6])

print(pd.concat([ser1,ser2], verify_integrity=False))
print(pd.concat([ser1,ser2], ignore_index=True))
print(pd.concat([ser1,ser2], keys=['x', 'y']))

1    a
2    b
3    c
1    d
2    e
6    f
dtype: object
0    a
1    b
2    c
3    d
4    e
5    f
dtype: object
x  1    a
   2    b
   3    c
y  1    d
   2    e
   6    f
dtype: object


In [25]:
ser1 = pd.Series(['a', 'b', 'c'], index=[1,2,3])
ser2 = pd.Series(['b', 'c', 'f'], index=[4,5,6])

print(pd.concat([ser1, ser2], join='outer'))
print(pd.concat([ser1, ser2], join='inner'))

1    a
2    b
3    c
4    b
5    c
6    f
dtype: object
1    a
2    b
3    c
4    b
5    c
6    f
dtype: object
