In [2]:
import numpy as np
import pandas as pd

In [60]:
# Step 1 of building a DataFrame from a NumPy array: create the row labels.
# date_range() yields six consecutive calendar days starting 2013-01-01.
dates = pd.date_range(start="20130101", periods=6, freq="D")
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [61]:
# Build a 6x4 frame of standard-normal draws, labelled by `dates` (rows) and
# the letters A-D (columns).
# A seeded Generator is used so that a fresh "Restart & Run All" reproduces the
# same numbers; re-run the notebook to refresh any previously saved outputs.
rng = np.random.default_rng(seed=0)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.34793,0.08836,0.097485,-1.091907
2013-01-02,0.101007,1.124334,1.737955,1.154725
2013-01-03,-0.861077,1.882735,0.240683,-0.121963
2013-01-04,0.610751,0.718401,-0.827766,0.432279
2013-01-05,1.581682,1.333995,-0.620752,1.04608
2013-01-06,0.291242,-0.744474,-0.240386,-1.119355


In [3]:
# Build a DataFrame from a mapping whose values can each be turned into a column.
#   A, F : scalars, repeated on every row.
#   B    : four consecutive dates (a single pd.Timestamp("20130102") would
#          also work and be repeated like a scalar).
#   C    : a Series, which carries its own index (0..3 here). Giving it a
#          different index, e.g. pd.RangeIndex(5, 9, 1), would presumably
#          change the row labels of the resulting frame.
#   D    : a plain NumPy array, which has no index of its own.
#   E    : categorical data.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.date_range("20130102", periods=4),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train"] * 2),
        F="foo",
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-03,1.0,3,train,foo
2,1.0,2013-01-04,1.0,3,test,foo
3,1.0,2013-01-05,1.0,3,train,foo


In [5]:
# Unpack range(4) into a tuple to see the integer labels it produces.
index = (*range(4),)
index

(0, 1, 2, 3)

In [63]:
# Categorical data
# https://pandas.pydata.org/docs/user_guide/categorical.html

In [64]:
# Section: selection, getting, setting, boolean indexing.
# Warm-up: show the frame with both the row labels and the column labels
# sorted in descending order (rows first, then columns).
(
    df
    .sort_index(axis=0, ascending=False)
    .sort_index(axis=1, ascending=False)
)

Unnamed: 0,D,C,B,A
2013-01-06,-1.119355,-0.240386,-0.744474,0.291242
2013-01-05,1.04608,-0.620752,1.333995,1.581682
2013-01-04,0.432279,-0.827766,0.718401,0.610751
2013-01-03,-0.121963,0.240683,1.882735,-0.861077
2013-01-02,1.154725,1.737955,1.124334,0.101007
2013-01-01,-1.091907,0.097485,0.08836,-1.34793


In [65]:
# Key point: label-based row selection with .loc. Passing a single index label
# (the first entry of `dates`) returns that row as a Series keyed by column name.
df.loc[dates[0]]

A   -1.347930
B    0.088360
C    0.097485
D   -1.091907
Name: 2013-01-01 00:00:00, dtype: float64

In [66]:
# Getting a scalar value with .loc: give both a row label (the first date) and
# a column label ("A") to pull out a single cell.
df.loc[dates[0], "A"]

-1.3479297119149003

In [67]:
# Fast label-based access to a single cell with .at; equivalent to
# df.loc[dates[0], "A"] and returns the same value.
df.at[dates[0], "A"]

-1.3479297119149003

In [68]:
# Position-based scalar lookup with .iloc: second row, second column (0-based).
df.iloc[1, 1]

1.1243336470000842

In [69]:
# .iat is the position-based accessor for a single cell; (1, 1) addresses the
# second row and second column (0-based).
df.iat[1, 1]

1.1243336470000842

In [70]:
# Boolean indexing, step 1: an element-wise "greater than zero" test gives a
# frame of True/False with the same shape and labels as df.
df.gt(0)

Unnamed: 0,A,B,C,D
2013-01-01,False,True,True,False
2013-01-02,True,True,True,True
2013-01-03,False,True,True,False
2013-01-04,True,True,False,True
2013-01-05,True,True,False,True
2013-01-06,True,False,False,False


In [71]:
# Keep only the positive entries; every cell that fails the test becomes NaN.
df.where(df > 0)

Unnamed: 0,A,B,C,D
2013-01-01,,0.08836,0.097485,
2013-01-02,0.101007,1.124334,1.737955,1.154725
2013-01-03,,1.882735,0.240683,
2013-01-04,0.610751,0.718401,,0.432279
2013-01-05,1.581682,1.333995,,1.04608
2013-01-06,0.291242,,,


In [72]:
# Boolean Series: one True/False per row, telling whether column A is positive.
df["A"].gt(0)

2013-01-01    False
2013-01-02     True
2013-01-03    False
2013-01-04     True
2013-01-05     True
2013-01-06     True
Freq: D, Name: A, dtype: bool

In [73]:
# Row filter: keep the rows whose value in column A is positive.
df.loc[df["A"].gt(0)]

Unnamed: 0,A,B,C,D
2013-01-02,0.101007,1.124334,1.737955,1.154725
2013-01-04,0.610751,0.718401,-0.827766,0.432279
2013-01-05,1.581682,1.333995,-0.620752,1.04608
2013-01-06,0.291242,-0.744474,-0.240386,-1.119355


In [74]:
# Derive df1 from df with an extra string column E; assign() returns a new
# frame, so df itself is left untouched.
df1 = df.assign(E=["one", "one", "two", "three", "four", "three"])
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.34793,0.08836,0.097485,-1.091907,one
2013-01-02,0.101007,1.124334,1.737955,1.154725,one
2013-01-03,-0.861077,1.882735,0.240683,-0.121963,two
2013-01-04,0.610751,0.718401,-0.827766,0.432279,three
2013-01-05,1.581682,1.333995,-0.620752,1.04608,four
2013-01-06,0.291242,-0.744474,-0.240386,-1.119355,three


In [75]:
# Key point: isin() builds a boolean mask for membership in a list of values;
# here it keeps the rows whose E is either 'two' or 'four'.
df1.loc[df1["E"].isin(["two", "four"])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-0.861077,1.882735,0.240683,-0.121963,two
2013-01-05,1.581682,1.333995,-0.620752,1.04608,four


In [76]:
# Overwrite the whole D column with 1..n, where n = len(df1) is the number of rows.
# Notes:
#   - The column-wide write goes through .loc; .at[:, 'D'] does not work here
#     (.at is meant for single-cell access).
#   - A constant column is also possible: np.array([5] * len(df1.index)).
df1.loc[:, "D"] = list(range(1, len(df1) + 1))
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.34793,0.08836,0.097485,1.0,one
2013-01-02,0.101007,1.124334,1.737955,2.0,one
2013-01-03,-0.861077,1.882735,0.240683,3.0,two
2013-01-04,0.610751,0.718401,-0.827766,4.0,three
2013-01-05,1.581682,1.333995,-0.620752,5.0,four
2013-01-06,0.291242,-0.744474,-0.240386,6.0,three


In [99]:
# A "where" operation with setting: turn every positive entry into its negative.
# Key point: the last column (E, object dtype) is dropped first; leaving it in
# makes the `> 0` comparison raise an error.
# .copy() makes df3 independent of df1, so the masked assignment below can
# neither write through to df1 nor trigger a SettingWithCopyWarning.
df3 = df1.iloc[:, :-1].copy()

# Key point: wherever the mask is True, take the value from -df3 instead;
# prefixing the frame with a minus sign is all that is needed.
df3[df3 > 0] = -df3
df3

Unnamed: 0,A,B,C,D
2013-01-01,-1.34793,-0.08836,-0.097485,-1.0
2013-01-02,-0.101007,-1.124334,-1.737955,-2.0
2013-01-03,-0.861077,-1.882735,-0.240683,-3.0
2013-01-04,-0.610751,-0.718401,-0.827766,-4.0
2013-01-05,-1.581682,-1.333995,-0.620752,-5.0
2013-01-06,-0.291242,-0.744474,-0.240386,-6.0


In [None]:
# Missing Data