In [32]:
import numpy as np
import pandas as pd
import seaborn as sns

In [5]:
# pandas is built on top of NumPy: a Series wraps a NumPy array of values.
# Here the values are five evenly spaced points from 0 to 1.
data = pd.Series(data=np.linspace(start=0, stop=1, num=5))
data

0    0.00
1    0.25
2    0.50
3    0.75
4    1.00
dtype: float64

In [6]:
# The values held by the Series, exposed as an array.
data.values

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [8]:
# Check the concrete type of the object returned by .values.
type(data.values)

numpy.ndarray

In [9]:
# The index (row labels) of the Series.
data.index

RangeIndex(start=0, stop=5, step=1)

In [12]:
# Indexing, NumPy-style: with the default integer index, label 1 is also position 1.
data[1]

0.25

In [13]:
# Slicing: an integer slice selects by position and excludes the end.
data[2:4]

2    0.50
3    0.75
dtype: float64

In [14]:
# Masking: keep only the values strictly between 0.1 and 0.6.
data.loc[(0.1 < data) & (data < 0.6)]

1    0.25
2    0.50
dtype: float64

In [16]:
# Dict-like access: keys() yields the index labels of the Series.
list(data.keys())

[0, 1, 2, 3, 4]

In [18]:
# Dict-like access: items() yields (index, value) tuples.
list(data.items())

[(0, 0.0), (1, 0.25), (2, 0.5), (3, 0.75), (4, 1.0)]

In [19]:
# Replace the integer index with the string labels "a" through "e".
# Note: this mutates `data` in place; the cells after this one rely on the new labels.
data.index = list("abcde")

In [22]:
# Display the Series with its new string index.
data

a    0.00
b    0.25
c    0.50
d    0.75
e    1.00
dtype: float64

In [24]:
# Using the loc and iloc indexers on a Series.
# loc is the explicit indexer: it selects by index label.
data.loc["a"]

0.0

In [25]:
# A label slice with loc includes the end label ("c").
data.loc["a":"c"]

a    0.00
b    0.25
c    0.50
dtype: float64

In [26]:
# A list of labels selects exactly those entries.
data.loc[["a","b"]]

a    0.00
b    0.25
dtype: float64

In [28]:
# iloc is the implicit indexer: it selects by integer position.
data.iloc[0]

0.0

In [29]:
# A positional slice with iloc excludes the end position (4).
data.iloc[2:4]

c    0.50
d    0.75
dtype: float64

In [30]:
# A list of positions selects exactly those entries.
data.iloc[[0,2]]

a    0.0
c    0.5
dtype: float64

In [46]:
# Seed NumPy's global generator so the random table is reproducible,
# then build a 3x4 frame of integers in [0, 10) with columns col1..col4.
np.random.seed(0)
df = pd.DataFrame(
    np.random.randint(0, 10, size=(3, 4)),
    columns=[f"col{i}" for i in range(1, 5)],
)
df

Unnamed: 0,col1,col2,col3,col4
0,5,0,3,3
1,7,9,3,5
2,2,4,7,6


In [47]:
# Select a single column by name; the result is a Series.
df["col1"]

0    5
1    7
2    2
Name: col1, dtype: int32

In [48]:
# The row index of the DataFrame.
df.index

RangeIndex(start=0, stop=3, step=1)

In [49]:
# Attribute-style column access, equivalent to df["col2"].
df.col2

0    0
1    9
2    4
Name: col2, dtype: int32

In [50]:
# Select the row labelled 0; the result is a Series indexed by column name.
df.loc[0]

col1    5
col2    0
col3    3
col4    3
Name: 0, dtype: int32

In [52]:
# Row 0, columns col2 through col3 (label slice, end label included).
df.loc[0,"col2":"col3"]

col2    0
col3    3
Name: 0, dtype: int32

In [53]:
# Keep the rows where col2 is greater than 2 and col3 is less than 5.
df[(df["col2"] > 2) & (df["col3"] < 5)]

Unnamed: 0,col1,col2,col3,col4
1,7,9,3,5


In [62]:
# Add a column holding the row-wise total.
# Any existing "total" column is excluded from the sum, so re-running this
# cell does not fold the previous total into the new one.
df["total"] = df.drop(columns="total", errors="ignore").sum(axis=1)
df

Unnamed: 0,col1,col2,col3,total
0,5,0,3,16
1,7,9,3,38
2,2,4,7,26


In [63]:
# Removing rows and columns: drop the "total" column again.
# `columns=` already names the axis, so the redundant `axis=1` is omitted;
# errors="ignore" keeps the cell re-runnable once the column is gone.
df = df.drop(columns=["total"], errors="ignore")
df

Unnamed: 0,col1,col2,col3
0,5,0,3
1,7,9,3
2,2,4,7


In [64]:
# A small frame with one missing value in column 0 and one in column 1;
# column 2 is complete.
df = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [np.nan, 8, 9],
        [10, np.nan, 12],
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,2.0,3
1,4.0,5.0,6
2,,8.0,9
3,10.0,,12


In [67]:
# Drop missing data: remove every column that contains at least one NaN.
df.dropna(axis="columns")

Unnamed: 0,2
0,3
1,6
2,9
3,12


In [77]:
# Joining DataFrame objects: two small tables that share employee names
# under different column names ("name" vs. "emp_name").
df1 = pd.DataFrame(
    [("이순신", "연구개발"), ("강감찬", "영업"), ("을지문덕", "연구개발"), ("김유신", "인사")],
    columns=["name", "dept"],
)
df2 = pd.DataFrame(
    [("강감찬", "S"), ("을지문덕", "D"), ("이순신", "A"), ("이순신", "S")],
    columns=["emp_name", "project"],
)


In [79]:
# Inner join: keep only the employees that appear in both tables.
df1.merge(df2, how="inner", left_on="name", right_on="emp_name")

Unnamed: 0,name,dept,emp_name,project
0,이순신,연구개발,이순신,A
1,이순신,연구개발,이순신,S
2,강감찬,영업,강감찬,S
3,을지문덕,연구개발,을지문덕,D


In [82]:
# Outer join: keep every employee from either table, filling gaps with NaN.
df1.merge(df2, how="outer", left_on="name", right_on="emp_name")

Unnamed: 0,name,dept,emp_name,project
0,이순신,연구개발,이순신,A
1,이순신,연구개발,이순신,S
2,강감찬,영업,강감찬,S
3,을지문덕,연구개발,을지문덕,D
4,김유신,인사,,


In [83]:
# Load the "titanic" example dataset through seaborn and preview the first rows.
# `sns` is imported in the notebook's first cell together with pandas and numpy.
# NOTE(review): load_dataset may need network access on first use - confirm for offline runs.
titanic = sns.load_dataset("titanic")
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [85]:
# Sort the rows: fare in descending order, ties broken by sex in ascending order.
titanic.sort_values(by=["fare","sex"], ascending=[False, True]).head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
258,1,1,female,35.0,0,0,512.3292,C,First,woman,False,,Cherbourg,yes,True
679,1,1,male,36.0,0,1,512.3292,C,First,man,True,B,Cherbourg,yes,False
737,1,1,male,35.0,0,0,512.3292,C,First,man,True,B,Cherbourg,yes,True
88,1,1,female,23.0,3,2,263.0,S,First,woman,False,C,Southampton,yes,False
341,1,1,female,24.0,3,2,263.0,S,First,woman,False,C,Southampton,yes,False


In [86]:
# Group-by aggregation: mean of the survived column for each sex.
titanic.groupby("sex")[["survived"]].mean()

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [89]:
# Per-sex mean of survived, expressed as the deviation from the mean of those group means.
titanic.groupby("sex")[["survived"]].mean().apply(lambda rate: rate - rate.mean())

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,0.276565
male,-0.276565


In [90]:
# unstack turns the multi-indexed Series into a DataFrame:
# the "class" level of the index becomes the columns.
titanic.groupby(["sex", "class"])["survived"].mean().unstack(level="class")

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [91]:
# Pivot table for multi-dimensional analysis: mean of survived by sex (rows) and class (columns).
titanic.pivot_table(values="survived", index="sex", columns="class", aggfunc="mean")

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447
