In [45]:
import numpy as np
import pandas as pd

# Different Choices for Indexing.
dates = pd.date_range('1/1/2023', periods= 8)

df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2023-01-01,1.187569,-0.564931,1.207602,0.037394
2023-01-02,-0.253694,0.25773,0.240397,-1.279029
2023-01-03,0.146077,1.128171,0.218116,-1.171722
2023-01-04,-0.924897,0.218776,1.182334,-1.9976
2023-01-05,0.605353,1.105675,-0.653183,-0.167644
2023-01-06,-1.22991,0.024397,1.454701,-1.047088
2023-01-07,0.142517,-0.81397,-1.005585,0.694203
2023-01-08,-0.132447,1.166136,-0.574012,-0.036711


In [46]:
df['A']

2023-01-01    1.187569
2023-01-02   -0.253694
2023-01-03    0.146077
2023-01-04   -0.924897
2023-01-05    0.605353
2023-01-06   -1.229910
2023-01-07    0.142517
2023-01-08   -0.132447
Freq: D, Name: A, dtype: float64

> * df.loc[...]는 row에 대한 indexing   
> * index가 `별도의 name`을 갖고 있다면, 이를 하나의 Series로 보고 indexing하여, row를 명시한다.   ㊙️㊙️  

In [47]:
df.loc[dates[3]]

A   -0.924897
B    0.218776
C    1.182334
D   -1.997600
Name: 2023-01-04 00:00:00, dtype: float64

In [49]:
# df[dates[5]]
# It Doesn't work. df[...]는 column에 대한 indexing.

In [3]:
# column 바꿔치기
df[['A', 'B']] = df[['B', 'A']]
df

Unnamed: 0,A,B,C,D
2023-01-01,2.73806,-0.582577,-0.052241,0.878912
2023-01-02,-0.067055,-0.174403,-0.209351,-0.456515
2023-01-03,-1.104771,-2.183292,0.752805,-0.404331
2023-01-04,-0.828653,0.352372,-0.562476,1.104729
2023-01-05,1.810223,-1.991367,0.39692,-1.091852
2023-01-06,0.217394,-1.382665,-1.289925,-0.611896
2023-01-07,1.269375,-0.20827,2.022008,0.423413
2023-01-08,0.947019,1.304217,0.878158,-1.007074


In [4]:
df[['A', 'B']]

Unnamed: 0,A,B
2023-01-01,2.73806,-0.582577
2023-01-02,-0.067055,-0.174403
2023-01-03,-1.104771,-2.183292
2023-01-04,-0.828653,0.352372
2023-01-05,1.810223,-1.991367
2023-01-06,0.217394,-1.382665
2023-01-07,1.269375,-0.20827
2023-01-08,0.947019,1.304217


In [5]:
# left는 column alignment, right는 value assignment(?)
df.loc[:, ['B', 'A']] = df[['A', 'B']]
df
# df.loc[:, ['B', 'A']]

Unnamed: 0,A,B,C,D
2023-01-01,2.73806,-0.582577,-0.052241,0.878912
2023-01-02,-0.067055,-0.174403,-0.209351,-0.456515
2023-01-03,-1.104771,-2.183292,0.752805,-0.404331
2023-01-04,-0.828653,0.352372,-0.562476,1.104729
2023-01-05,1.810223,-1.991367,0.39692,-1.091852
2023-01-06,0.217394,-1.382665,-1.289925,-0.611896
2023-01-07,1.269375,-0.20827,2.022008,0.423413
2023-01-08,0.947019,1.304217,0.878158,-1.007074


In [6]:
# to_numpy()는 DataFrame을 numpy array로 변환한다. i.e. index label이 의미가 없어진다.   ㊙️
df.loc[:, ['B', 'A']] = df[['A', 'B']].to_numpy()

In [7]:
df

Unnamed: 0,A,B,C,D
2023-01-01,-0.582577,2.73806,-0.052241,0.878912
2023-01-02,-0.174403,-0.067055,-0.209351,-0.456515
2023-01-03,-2.183292,-1.104771,0.752805,-0.404331
2023-01-04,0.352372,-0.828653,-0.562476,1.104729
2023-01-05,-1.991367,1.810223,0.39692,-1.091852
2023-01-06,-1.382665,0.217394,-1.289925,-0.611896
2023-01-07,-0.20827,1.269375,2.022008,0.423413
2023-01-08,1.304217,0.947019,0.878158,-1.007074


In [8]:
# Attribute access(.)
# You may access an index on a Series or column on a DataFrame directly as an attribute.

sa = pd.Series([1, 2, 3], index=list('abc'))
dfa = df.copy()


In [9]:
sa.b

2

In [10]:
dfa.A

2023-01-01   -0.582577
2023-01-02   -0.174403
2023-01-03   -2.183292
2023-01-04    0.352372
2023-01-05   -1.991367
2023-01-06   -1.382665
2023-01-07   -0.208270
2023-01-08    1.304217
Freq: D, Name: A, dtype: float64

In [11]:
sa.a = 5
sa

a    5
b    2
c    3
dtype: int64

In [12]:
list(range(len(dfa.index)))

[0, 1, 2, 3, 4, 5, 6, 7]

In [13]:
dfa.A = list(range(1, len(dfa.index)+1))
# Series에 list를 대입한다.   ㊙️


In [14]:
dfa

Unnamed: 0,A,B,C,D
2023-01-01,1,2.73806,-0.052241,0.878912
2023-01-02,2,-0.067055,-0.209351,-0.456515
2023-01-03,3,-1.104771,0.752805,-0.404331
2023-01-04,4,-0.828653,-0.562476,1.104729
2023-01-05,5,1.810223,0.39692,-1.091852
2023-01-06,6,0.217394,-1.289925,-0.611896
2023-01-07,7,1.269375,2.022008,0.423413
2023-01-08,8,0.947019,0.878158,-1.007074


In [15]:
# When Creating a new column, Don't use DataFrame.column
dfa["E"] = list(range(len(dfa.index)))
dfa

Unnamed: 0,A,B,C,D,E
2023-01-01,1,2.73806,-0.052241,0.878912,0
2023-01-02,2,-0.067055,-0.209351,-0.456515,1
2023-01-03,3,-1.104771,0.752805,-0.404331,2
2023-01-04,4,-0.828653,-0.562476,1.104729,3
2023-01-05,5,1.810223,0.39692,-1.091852,4
2023-01-06,6,0.217394,-1.289925,-0.611896,5
2023-01-07,7,1.269375,2.022008,0.423413,6
2023-01-08,8,0.947019,0.878158,-1.007074,7


In [16]:
# Identifiers 命名 규칙<https://docs.python.org/3/reference/lexical_analysis.html>
# 숫자가 제일 앞에 와서는 않된다.

# The attribute will not be available if it conflicts with an existing method name, 
# e.g. s.min is not allowed, but s['min'] is possible.
# method를 호출하는 경우도 s['method']로 가능하다.   ㊙️

##### [Warning: Valid Identifier](https://pandas.pydata.org/docs/user_guide/indexing.html#:~:text=You%20can%20use,element%20or%20column)
* You can use this access only if the index element is a valid Python identifier, e.g. s.1 is not allowed. See here for an explanation of valid identifiers.

* The attribute will not be available if it conflicts with an existing method name, e.g. s.min is not allowed, but s['min'] is possible.

* Similarly, the attribute will not be available if it conflicts with any of the following list: index, major_axis, minor_axis, items.

* In any of these cases, standard indexing will still work, e.g. s['1'], s['min'], and s['index'] will access the corresponding element or column. (?)🤔

In [17]:
x = pd.DataFrame({"x":[1, 2, 3], "y":[3, 4, 5]})
x.iloc[1] = {"x":9, "y":99}
x

Unnamed: 0,x,y
0,1,3
1,9,99
2,3,5


In [18]:
df = pd.DataFrame({'one': [1., 2., 3.]})
df.two = [4, 5, 6]

  df.two = [4, 5, 6]


In [None]:
# Slicing Ranges


In [75]:
# Selection by label(df.loc[...])
date = pd.date_range('20130101', periods=5, name='日付')
dfl = pd.DataFrame(np.random.randn(5, 4),
                   columns=list('ABCD'),
                   index=date)
dfl

Unnamed: 0_level_0,A,B,C,D
日付,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-01-01,0.896601,0.340204,-1.759166,0.467655
2013-01-02,1.007708,0.751419,1.699561,-1.120244
2013-01-03,-2.140343,-1.173291,0.036782,2.327997
2013-01-04,-0.153715,0.6945,-0.516761,-0.380956
2013-01-05,-0.442451,1.378753,1.18752,-0.425404


In [76]:
# dfl.loc[2:3]   # interger slicing을 사용할 수 없다.   😵‍💫
# When slicing, both the start bound AND the stop bound are included
# dfl.loc["2013-01-02":"2013-01-04"]   # 작동
# dfl.loc["20130102":"20130104"]   # 작동
dfl.loc[date[1:4], ["A","D"]]

# date라는 Series를 별도로 생성하고 이를 dfl.loc[...]에 인자로 넘긴다. 
# dfl를 정의하기 이전에 date라는 Series를 별도로 정의한다.   ㊙️


Unnamed: 0_level_0,A,D
日付,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-02,1.007708,-1.120244
2013-01-03,-2.140343,2.327997
2013-01-04,-0.153715,-0.380956


In [78]:
dfl.loc[[True, False, True, False, False]]
# Boolean array(e.g. True, False list)을 이용한다.
# Boolean array를 return하는 conditionals도 쓸 수 있다.

Unnamed: 0_level_0,A,B,C,D
日付,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-01-01,0.896601,0.340204,-1.759166,0.467655
2013-01-03,-2.140343,-1.173291,0.036782,2.327997


In [79]:
s1 = pd.Series(np.random.randn(6), index=list("abcdef"))
s1

a    1.615941
b    1.456952
c    0.506330
d   -0.946885
e    1.011785
f   -0.717335
dtype: float64

In [80]:
s1.loc["c":]

c    0.506330
d   -0.946885
e    1.011785
f   -0.717335
dtype: float64

In [151]:
df1 = pd.DataFrame(np.random.randn(6, 4),
                   index=list('abcdef'),
                   columns=list('ABCD'))
df1

Unnamed: 0,A,B,C,D
a,0.7601,0.99218,-0.90086,-0.563569
b,-0.52366,-0.911905,-1.170232,0.765346
c,-2.158469,-0.161224,0.124303,0.185376
d,-0.401261,0.42802,-0.220995,-0.668876
e,1.333502,0.4253,0.402161,1.457772
f,0.060663,-1.279536,0.495353,0.385579


In [83]:
df1.loc[['a', 'b', 'd'], 'B':'C']

Unnamed: 0,B,C
a,2.007654,1.071816
b,-1.78061,0.698415
d,0.083834,-0.377288


In [84]:
df1.loc['a']

A   -1.040964
B    2.007654
C    1.071816
D    0.745387
Name: a, dtype: float64

In [87]:
df1.loc['a'] > 0

A    False
B     True
C     True
D     True
Name: a, dtype: bool

In [96]:
# row조건을 걸어 column을 추출   🪷
df1.loc[:, df1.loc['b'] > 0]

Unnamed: 0,A,C
a,-1.040964,1.071816
b,2.443265,0.698415
c,-0.72043,-1.24418
d,0.714732,-0.377288
e,-0.025356,-0.061255
f,-1.227753,-0.177213


In [100]:
# column에 조건을 걸어 row를 추출   🪷
df1.loc[df1["A"] >0, :]

Unnamed: 0,A,B,C,D
b,2.443265,-1.78061,0.698415,-0.349839
d,0.714732,0.083834,-0.377288,-0.534243


In [130]:
df1.loc[["a", "b"], ["A", "B"]]

Unnamed: 0,A,B
a,-1.040964,2.007654
b,2.443265,-1.78061


In [133]:
# condition을 중복하여 건다. 각 조건마다(..)를 사용한다.㊙️
# 이를 해석해 보면, df.loc[...]안에 들어 가는 ...는 앞자리는 row, 뒷자리는 column에 대한 label 또는 boolean array이다.
# boolean array를 만들 수 있는 것은 1.conditionals(조건식), isin([...])에 들어가는 2.list
df1.loc[(df1["A"] >0) & (df1["B"] >0)]

Unnamed: 0,A,B,C,D
d,0.714732,0.083834,-0.377288,-0.534243


In [127]:
df1.loc[['a', 'c'], ['B', 'D']]

Unnamed: 0,B,D
a,2.007654,0.745387
c,0.146781,0.674421


In [159]:
# Boolean masking
mask1 = pd.array([True, False, True, False, pd.NA, False], dtype="boolean")
mask2 = pd.array([False, True, False, True])

In [163]:
# 😵‍💫
df1[mask1]

Unnamed: 0,A,B,C,D
a,0.7601,0.99218,-0.90086,-0.563569
c,-2.158469,-0.161224,0.124303,0.185376


In [161]:
df1.loc[mask1]

Unnamed: 0,A,B,C,D
a,0.7601,0.99218,-0.90086,-0.563569
c,-2.158469,-0.161224,0.124303,0.185376


In [160]:
df1.loc[mask1, mask2]

Unnamed: 0,B,D
a,0.99218,-0.563569
c,-0.161224,0.185376


* 왜 위 두개가 같은 결과를 낼까?🤔😵‍💫
* In my opinion, df1[mask]에서 boolean array가 대입되는 경우는 index에만 masking한다.매우 특이한 경우😵‍💫
* df1.lox[mask]에서 mask가 row자리에 있기 때문에 row를 masking하고 있고, column자리에 mask가 대입할 수 있다.


In [138]:
# Slicing with labels
s = pd.Series(list("abcde"), index=[0, 3, 2, 5, 4])
s

0    a
3    b
2    c
5    d
4    e
dtype: object

<span style="font-size:13px;">
<summary><h3>Valid Inputs㊙️</h3></summary>
<details>

1. A <i>`single label`<i> , e.g. 5 or 'a' (Note that 5 is interpreted as a label of the index. This use is not an integer position along the index.).
  
2. A <i>`list or array of labels`<i> ['a', 'b', 'c'].

3. A <i>`slice object`<i> with labels <i>`'a':'f'`<i> (Note that contrary to usual Python slices, both the start and the stop are included, when present in the index! See Slicing with labels.

4. A <i>`boolean array`<i>.

5. A <i>`callable`<i>, see Selection By Callable.

</details>
</span>

In [140]:
s.loc[3:5]
# label로 slicing하는 경우 숫자라도 문자로 인식한다.

3    b
2    c
5    d
dtype: object

In [142]:
# s.loc['3':'5']
# 逆切れに（賊反荷杖）Errorが生じる。   😵‍💫

In [143]:
s.sort_index()

0    a
2    c
3    b
4    e
5    d
dtype: object

In [144]:
s.sort_index().loc[1:6]
# 없는 문자까지 포함하여 충실히 selection by label을 이행한다.

2    c
3    b
4    e
5    d
dtype: object

In [None]:
# .loc[start:stop]에서 한개 이상의 index값이 없고, sorting되어 있지 않다면 an error will be raised.(sorting되어 있다면 OK란 뜻)
# s.loc[1:6]은 Error.(Sorting 되어 있지 않아 NG)

In [145]:
s.loc[3:5]
# End points are inclusive.

3    b
2    c
5    d
dtype: object

<span style="font-size: 13px;">
<h3>Selection by position:</h3>

> 0-based indexing
>   
> purely integer based indexing

<details><summary>Valid Inputs</summary>

   - **`An integer`** e.g. 5.

   - **`A list or array of integers`** `[4, 3, 0]`.
  
   - **`A slice object`** with ints 1:7.
  
   - **`A boolean array`**.
  
   - **`A callable`**, see Selection By Callable.
</detaila>
</span>

In [146]:
s1 = pd.Series(np.random.randn(5), index=list(range(0,10,2)))
s1

0    1.370371
2    0.329040
4   -0.928459
6    0.265704
8    0.162912
dtype: float64

In [148]:
# label slicing
s1.loc[:3]

0    1.370371
2    0.329040
dtype: float64

In [147]:
# Series에 대한 slicing
s1[:3]

0    1.370371
2    0.329040
4   -0.928459
dtype: float64

In [149]:
# Position slicing
s1.iloc[:3]

0    1.370371
2    0.329040
4   -0.928459
dtype: float64