In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [2]:
# Example adapted from https://chrisalbon.com/python/pandas_map_values_to_values.html
# Note: in practice a DataFrame is rarely built by hand from a literal dict like this.
raw_data = dict(
    first_name=["Jason", "Molly", "Tina", "Jake", "Amy"],
    last_name=["Miller", "Jacobson", "Ali", "Milner", "Cooze"],
    age=[42, 52, 36, 24, 73],
    city=["San Francisco", "Baltimore", "Miami", "Douglas", "Boston"],
)
# list(raw_data) yields the dict keys in insertion order, i.e.
# ["first_name", "last_name", "age", "city"], which fixes the column order.
df = pd.DataFrame(data=raw_data, columns=list(raw_data))
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami
3,Jake,Milner,24,Douglas
4,Amy,Cooze,73,Boston


In [3]:
# Inspect the source dict: each key is a column name mapped to a list of values.
raw_data

{'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
 'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
 'age': [42, 52, 36, 24, 73],
 'city': ['San Francisco', 'Baltimore', 'Miami', 'Douglas', 'Boston']}

In [4]:
DataFrame(raw_data, columns=["age", "city"])  # select a subset of the columns

Unnamed: 0,age,city
0,42,San Francisco
1,52,Baltimore
2,36,Miami
3,24,Douglas
4,73,Boston


In [5]:
# Add a new column.
# "debt" is not a key of raw_data, so that column is filled with NaN (Not a Number).
DataFrame(raw_data, columns=["first_name", "last_name", "age", "city", "debt"])

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,
1,Molly,Jacobson,52,Baltimore,
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,
4,Amy,Cooze,73,Boston,


In [6]:
# Rebind df to a frame that also carries the (still empty) "debt" column.
df = DataFrame(raw_data, columns=["first_name", "last_name", "age", "city", "debt"])
df

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,
1,Molly,Jacobson,52,Baltimore,
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,
4,Amy,Cooze,73,Boston,


In [7]:
df.first_name  # select a column - returns a Series
# accessed as an attribute (property-style access)

0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object

In [8]:
df["first_name"]  # select a column - returns a Series
# accessed like a dict key

0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object

In [10]:
# A single selected column is a pandas Series.
type(df["first_name"])

pandas.core.series.Series

In [11]:
# Display the full frame for reference.
df

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,
1,Molly,Jacobson,52,Baltimore,
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,
4,Amy,Cooze,73,Boston,


In [12]:
df.loc[1]  # loc - selects by index label (here the row labelled 1)
# if the index labels are strings, a string label must be passed instead

first_name        Molly
last_name      Jacobson
age                  52
city          Baltimore
debt                NaN
Name: 1, dtype: object

In [13]:
# All rows, and a list of column labels: the result is a one-column DataFrame.
df.loc[:, ["last_name"]]

Unnamed: 0,last_name
0,Miller
1,Jacobson
2,Ali
3,Milner
4,Cooze


In [14]:
df["age"].iloc[1:]  # iloc - selects by integer position
# rows are addressed by their position (from position 1 onward here), not by their label

1    52
2    36
3    24
4    73
Name: age, dtype: int64

In [17]:
df.loc[:3]  # label-based slice: up to and including the row whose index label is 3

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,
1,Molly,Jacobson,52,Baltimore,
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,


In [18]:
df.iloc[:3]  # position-based slice: positions 0, 1, 2, i.e. range(0, 3)

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,
1,Molly,Jacobson,52,Baltimore,
2,Tina,Ali,36,Miami,


In [15]:
# Example from - https://stackoverflow.com/questions/31593201/pandas-iloc-vs-ix-vs-loc-explanation
# The index labels are not in sorted order, so label-based and position-based
# slicing return visibly different rows.
s = pd.Series(np.nan, index=[49, 48, 47, 46, 45, 1, 2, 3, 4, 5])
s.loc[:3]  # label-based: every row from the start up to and including the label 3

49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
dtype: float64

In [16]:
s.iloc[:3]  # position-based: the first three rows (positions 0, 1, 2); position 3 is excluded

49   NaN
48   NaN
47   NaN
dtype: float64

In [19]:
# boolean index: the element-wise comparison yields a boolean Series (one True/False per row)
df.age > 40

0     True
1     True
2    False
3    False
4     True
Name: age, dtype: bool

In [20]:
# Store the boolean mask (age > 40) in the "debt" column.
# Bracket assignment is used rather than `df.debt = ...`: attribute-style assignment
# only updates a column that already exists; on a frame without that column pandas
# sets a plain Python attribute (with a UserWarning) and no column is created.
df["debt"] = df["age"] > 40
df

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,True
1,Molly,Jacobson,52,Baltimore,True
2,Tina,Ali,36,Miami,False
3,Jake,Milner,24,Douglas,False
4,Amy,Cooze,73,Boston,True


In [21]:
# Label -> value mapping; only the labels 0, 1 and 3 are present.
values = Series({0: "M", 1: "F", 3: "F"})
values

0    M
1    F
3    F
dtype: object

In [22]:
# Display df as it stands before another column is added.
df

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,True
1,Molly,Jacobson,52,Baltimore,True
2,Tina,Ali,36,Miami,False
3,Jake,Milner,24,Douglas,False
4,Amy,Cooze,73,Boston,True


In [23]:
# Assigning a Series aligns it to df on index labels.
df["sex"] = values
df  # index labels that are missing from values are filled with NaN

Unnamed: 0,first_name,last_name,age,city,debt,sex
0,Jason,Miller,42,San Francisco,True,M
1,Molly,Jacobson,52,Baltimore,True,F
2,Tina,Ali,36,Miami,False,
3,Jake,Milner,24,Douglas,False,F
4,Amy,Cooze,73,Boston,True,


In [24]:
df.head(3).T  # transpose of the first three rows: columns become rows

Unnamed: 0,0,1,2
first_name,Jason,Molly,Tina
last_name,Miller,Jacobson,Ali
age,42,52,36
city,San Francisco,Baltimore,Miami
debt,True,True,False
sex,M,F,


In [25]:
df.values  # the frame's data as a NumPy array (the index and column labels are dropped)

array([['Jason', 'Miller', 42, 'San Francisco', True, 'M'],
       ['Molly', 'Jacobson', 52, 'Baltimore', True, 'F'],
       ['Tina', 'Ali', 36, 'Miami', False, nan],
       ['Jake', 'Milner', 24, 'Douglas', False, 'F'],
       ['Amy', 'Cooze', 73, 'Boston', True, nan]], dtype=object)

In [26]:
# Confirm that .values hands back a numpy.ndarray.
type(df.values)

numpy.ndarray

In [27]:
# The row labels of df.
df.index

RangeIndex(start=0, stop=5, step=1)

In [28]:
df.to_csv()  # convert to CSV; with no path argument the CSV text is returned as a string

',first_name,last_name,age,city,debt,sex\n0,Jason,Miller,42,San Francisco,True,M\n1,Molly,Jacobson,52,Baltimore,True,F\n2,Tina,Ali,36,Miami,False,\n3,Jake,Milner,24,Douglas,False,F\n4,Amy,Cooze,73,Boston,True,\n'

In [29]:
# Display the current state of df before demonstrating column removal.
df

Unnamed: 0,first_name,last_name,age,city,debt,sex
0,Jason,Miller,42,San Francisco,True,M
1,Molly,Jacobson,52,Baltimore,True,F
2,Tina,Ali,36,Miami,False,
3,Jake,Milner,24,Douglas,False,F
4,Amy,Cooze,73,Boston,True,


In [32]:
# Drop a column.
# drop() returns a new frame; df itself is left unchanged.
df.drop("debt", axis=1)  # axis=1 means the label refers to a column

Unnamed: 0,first_name,last_name,age,city,sex
0,Jason,Miller,42,San Francisco,M
1,Molly,Jacobson,52,Baltimore,F
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,F
4,Amy,Cooze,73,Boston,


In [33]:
# Remove the "debt" column from df itself (in place, unlike drop(), which returns a copy).
# Re-running this cell without rebuilding df raises KeyError, because the column is already gone.
del df["debt"]

In [34]:
# df no longer contains the "debt" column.
df

Unnamed: 0,first_name,last_name,age,city,sex
0,Jason,Miller,42,San Francisco,M
1,Molly,Jacobson,52,Baltimore,F
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,F
4,Amy,Cooze,73,Boston,


In [35]:
# Example from "Python for Data Analysis"

# Building a frame from a dict of dicts:
#   outer keys -> column labels
#   inner keys -> index (row) labels
pop = {
    "Nevada": {2001: 2.4, 2002: 2.9},
    "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
}

# "Nevada" has no entry for 2000, so that cell is NaN in the resulting frame.
DataFrame(data=pop)

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5
