In [1]:
import pandas as pd
import numpy as np

In [347]:
data = [['1', '1.11', 'one'],['2','', 'two'],['누락','3.33','three']]

df = pd.DataFrame(data, columns=['c1','c2','c3'])
type(df[["c1", "c2"]])


pandas.core.frame.DataFrame

In [26]:
df.to_csv('sample1.csv', index=False, encoding='UTF-8')

In [27]:
df_read = pd.read_csv('sample1.csv')
df_read

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,,two
2,누락,3.33,three


In [28]:
df.to_csv('sample2.csv', index=False, header=False)

In [30]:
pd.read_csv('sample2.csv', names=['c1','c2','c3'])

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,,two
2,누락,3.33,three


In [31]:
%%writefile sample3.txt
c1        c2        c3        c4
0.179181 -1.538472  1.347553  0.43381
1.024209  0.087307 -1.281997  0.49265
0.417899 -2.002308  0.255245 -1.10515

Writing sample3.txt


In [34]:
pd.read_table('sample3.txt', sep='\s+')

Unnamed: 0,c1,c2,c3,c4
0,0.179181,-1.538472,1.347553,0.43381
1,1.024209,0.087307,-1.281997,0.49265
2,0.417899,-2.002308,0.255245,-1.10515


In [35]:
%%writefile sample4.txt
파일 제목: sample4.txt
데이터 포맷의 설명:
c1, c2, c3
1, 1.11, one
2, 2.22, two
3, 3.33, three

Writing sample4.txt


In [37]:
pd.read_csv('sample4.txt', skiprows=[0,1]) # skiprows -> range가능

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


In [38]:
df_na_val = pd.read_csv('sample1.csv', na_values=['누락'])
df_na_val

Unnamed: 0,c1,c2,c3
0,1.0,1.11,one
1,2.0,,two
2,,3.33,three


In [40]:
df_na_val.to_csv('sample6.csv', na_rep='누락')

In [40]:
df_na_val.to_csv('sample6.csv', na_rep='누락')

In [44]:
titanic = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/train.csv')
titanic.head(10)

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
5,0,male,2.0,3,1,21.075,Third,unknown,Southampton,n
6,1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n
7,1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n
8,1,female,4.0,1,1,16.7,Third,G,Southampton,n
9,0,male,20.0,0,0,8.05,Third,unknown,Southampton,y


In [53]:
data = np.arange(10,22).reshape(3,4)

df=pd.DataFrame(data,['a','b','c'],['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [55]:
# 만약 loc 인덱서를 사용하면서 인덱스를 콤마 없이 하나만 넣으면 row을 선택합니다.

# 인덱스 데이터가 'a'인 행을 고르면 해당하는 row가 Series로 반환됩니다. Series라서 상하로 길게 표현되기는 했지만 row를 가져오고 있습니다

df.loc['a']

A    10
B    11
C    12
D    13
Name: a, dtype: int64

In [56]:
df.loc['b':'c']

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [57]:
df['b':'c']

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [59]:
df.loc[['a','c']]

Unnamed: 0,A,B,C,D
a,10,11,12,13
c,18,19,20,21


In [60]:
# Boolean Seriese로 row를 기준으로 인덱싱할 수 있습니다.
df.A > 15

a    False
b    False
c     True
Name: A, dtype: bool

In [311]:
df.loc[df.A>15]

AttributeError: 'DataFrame' object has no attribute 'A'

In [63]:
# callable한 함수를 만들어서 인덱싱하는데 사용할 수 있습니다. 
def select_rows(df, num):
    return df.A > num

select_rows(df, 10)

a    False
b     True
c     True
Name: A, dtype: bool

In [64]:
df.loc[select_rows(df,10)]

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [None]:
# loc 인덱서는 label 인덱싱이나 column label 리스트 인덱싱을 불가능합니다.

In [65]:
df2 = pd.DataFrame(np.arange(10,26).reshape(4,4), columns = ['A','B','C','D'])
df2

Unnamed: 0,A,B,C,D
0,10,11,12,13
1,14,15,16,17
2,18,19,20,21
3,22,23,24,25


In [68]:
# loc 사용시 슬라이싱 할때 뒤에 숫자를 포함
df2.loc[1:2]

Unnamed: 0,A,B,C,D
1,14,15,16,17
2,18,19,20,21


In [71]:
# iloc은 loc과 다르게 슬라이싱 할때 뒤에 숫자를 포함하지 않는다
df.iloc[1:2]

Unnamed: 0,A,B,C,D
b,14,15,16,17


In [72]:
# 인덱싱 값을 row와 column 모두 받으려면 df.loc[row인덱스 , column인덱스]와 같은 형태로 사용합니다.
# row 인덱스 label값이 a, column 인덱스 label 값이 A인 위치의 값을 구하는 것은 다음과 같습니다.

df.loc['a','A']

10

In [73]:
# 앞서 본 콤마로 구분된 인덱싱 값으로 label 데이터의 슬라이싱 또는 리스트도 사용할 수도 있습니다.
df.loc['b':, 'A']

b    14
c    18
Name: A, dtype: int64

In [74]:
df.loc['a',:]

A    10
B    11
C    12
D    13
Name: a, dtype: int64

In [76]:
df.loc[['a','b'],['B','D']] # 순서는 [row , column]

Unnamed: 0,B,D
a,11,13
b,15,17


In [79]:
df.loc[df.A >10 ,['C','D']]

Unnamed: 0,C,D
b,16,17
c,20,21


In [78]:
df.iloc[0,1]

11

In [80]:
df.iloc[:2,2]

a    12
b    16
Name: C, dtype: int64

In [81]:
df.iloc[0,-2:]

C    12
D    13
Name: a, dtype: int64

In [82]:
df.iloc[2:3, 1:3]

Unnamed: 0,B,C
c,19,20


In [83]:
# loc 인덱서와 마찬가지로 인덱스가 하나만 들어가면 행을 선택합니다.

df.iloc[-1]

A    18
B    19
C    20
D    21
Name: c, dtype: int64

In [84]:
df.iloc[-1] = df.iloc[-1]*2
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,36,38,40,42


In [85]:
s = pd.Series(range(10))
s[3] = np.nan
s

0    0.0
1    1.0
2    2.0
3    NaN
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [88]:
# count 메서드는 NaN은 세지 않는다
s.count()

9

In [89]:
# len 함수는 NaN을 센다.
len(s) 

10

In [102]:
np.random.seed(2)
df = pd.DataFrame(np.random.randint(5, size=(4,4)) ,dtype = 'float')
df.iloc[2,3] = np.nan
print(df)
df.count()

     0    1    2    3
0  0.0  0.0  3.0  2.0
1  3.0  0.0  2.0  1.0
2  3.0  2.0  4.0  NaN
3  4.0  3.0  4.0  2.0


0    4
1    4
2    4
3    3
dtype: int64

In [94]:
import seaborn as sns
titanic = sns.load_dataset('titanic')
titanic.head(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [103]:
titanic.count()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

In [107]:
np.random.seed(1)
s2 = pd.Series(np.random.randint(6, size=100))
s2.tail()

95    4
96    5
97    2
98    4
99    3
dtype: int64

In [110]:
# value_counts() 메서드로 각각의 값이 나온 횟수를 셀 수 있습니다.
s2.value_counts()

1    22
0    18
4    17
5    16
3    14
2    13
dtype: int64

In [111]:
# DataFrame 에는 value_counts 메서드가 없으므로 각 column마다 별도로 적용해야합니다.

np.random.seed(2)
df = pd.DataFrame(np.random.randint(5, size=(4,4)), dtype = float)
df.iloc[2,3] = np.nan
df

Unnamed: 0,0,1,2,3
0,0.0,0.0,3.0,2.0
1,3.0,0.0,2.0,1.0
2,3.0,2.0,4.0,
3,4.0,3.0,4.0,2.0


In [112]:
df[0].value_counts()

3.0    2
0.0    1
4.0    1
Name: 0, dtype: int64

In [119]:
# 데이트를 index 순으로 정렬하려면 sort_index()를 , value기준으로 정렬하려면 sort_values() 메서드를 사용합니다.

print(s2.value_counts().sort_index())
print(s2.value_counts().sort_values())

0    18
1    22
2    13
3    14
4    17
5    16
dtype: int64
2    13
3    14
5    16
4    17
0    18
1    22
dtype: int64


In [120]:
s = pd.Series(range(10))
s[3] = np.nan
s

0    0.0
1    1.0
2    2.0
3    NaN
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [122]:
# NaN 값은 가장 나중으로 위치하게 된다.
s.sort_values()

0    0.0
1    1.0
2    2.0
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
3    NaN
dtype: float64

In [124]:
# ascending = False 를 사용하면 내림차순으로 변경 가능하다.
s.sort_values(ascending=False)

9    9.0
8    8.0
7    7.0
6    6.0
5    5.0
4    4.0
2    2.0
1    1.0
0    0.0
3    NaN
dtype: float64

In [126]:
# by 키워드 인수를 활용하여 DataFrame의 정렬의 기준이 되는 column을 지정해 주어야 한다.
df.sort_values(by=1)

Unnamed: 0,0,1,2,3
0,0.0,0.0,3.0,2.0
1,3.0,0.0,2.0,1.0
2,3.0,2.0,4.0,
3,4.0,3.0,4.0,2.0


In [128]:
# 1번째 coulumn 기준으로 먼저 정렬하고 2번째 column을 기준으로 정렬한다.
df.sort_values(by=[1,2])

Unnamed: 0,0,1,2,3
1,3.0,0.0,2.0,1.0
0,0.0,0.0,3.0,2.0
2,3.0,2.0,4.0,
3,4.0,3.0,4.0,2.0


In [153]:
titanic['sex'].value_counts()

male      577
female    314
Name: sex, dtype: int64

In [175]:
titanic['age'].value_counts(ascending=False)

24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: age, Length: 88, dtype: int64

In [165]:
titanic['class'].value_counts()

Third     491
First     216
Second    184
Name: class, dtype: int64

In [161]:
titanic['alive'].value_counts()

no     549
yes    342
Name: alive, dtype: int64

In [197]:
titanic.value_counts(['sex','alive','class']).head(10)

sex     alive  class 
male    no     Third     300
female  yes    First      91
male    no     Second     91
               First      77
female  no     Third      72
        yes    Third      72
               Second     70
male    yes    Third      47
               First      45
               Second     17
dtype: int64

In [198]:
np.random.seed(1)
df2 = pd.DataFrame(np.random.randint(10, size=(4,4)))
df2

Unnamed: 0,0,1,2,3
0,5,8,9,5
1,0,0,1,7
2,6,9,2,4
3,5,2,4,2


In [199]:
df2.sum(axis=1)

0    27
1     8
2    21
3    13
dtype: int64

In [200]:
df2['RowSum']= df2.sum(axis=1)
df2

Unnamed: 0,0,1,2,3,RowSum
0,5,8,9,5,27
1,0,0,1,7,8
2,6,9,2,4,21
3,5,2,4,2,13


In [202]:
df2.loc['ColTotal2']= df2.sum()
df2

Unnamed: 0,0,1,2,3,RowSum
0,5,8,9,5,27
1,0,0,1,7,8
2,6,9,2,4,21
3,5,2,4,2,13
ColTotal2,16,19,16,18,69


In [208]:
df2.loc['Colmean', :] = df2.mean()
df2

Unnamed: 0,0,1,2,3,RowSum
0,5.0,8.0,9.0,5.0,27.0
1,0.0,0.0,1.0,7.0,8.0
2,6.0,9.0,2.0,4.0,21.0
3,5.0,2.0,4.0,2.0,13.0
ColTotal2,16.0,19.0,16.0,18.0,69.0
ColTotal,6.4,7.6,6.4,7.2,27.6
Colmean,6.4,7.6,6.4,7.2,27.6


In [216]:
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [256]:
round(titanic['age'].mean(),1)

29.7

In [293]:
round(titanic.loc[titanic.sex == 'female',['age']].mean(),1)

age    27.9
dtype: float64

In [336]:
round(titanic['age'].loc[titanic.sex == 'female'].mean(),1)

27.9

In [345]:
round(titanic[(titanic['sex']=='female')]['age'].mean(),1)

27.9

In [338]:
round(titanic[(titanic['pclass']==1) & (titanic['sex'] == 'female')]['age'].mean(),1)

34.6

In [334]:
round(titanic['age'].loc[titanic['sex']=='female'].loc[titanic['pclass']==1].mean(),1)

34.6