In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Titanic sample data that ships with the pandas documentation repository.
path = 'https://github.com/pandas-dev/pandas/raw/main/doc/data/'
filename = 'titanic.csv'

# Base URL and file name are kept separate and joined only for the read.
df = pd.read_csv(f"{path}{filename}")

In [2]:
# Inspect the dtype inferred for every column; text columns show up as `object`.
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

* Dataframe Series의 타입 바꾸기

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.astype.html#pandas.DataFrame.astype

In [3]:
# astype() with a {column: dtype} mapping converts only the listed columns.
# The result is not assigned back, so `df` itself keeps its original dtypes.
df.astype({'Name': 'string'}).dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            string
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

- Dataframe에서 문자열 함수 사용하기

https://pandas.pydata.org/docs/reference/api/pandas.Series.str.html#

In [4]:
# Upper-case every passenger name through the vectorised .str accessor,
# then look at the first row to confirm the change.
df['Name'] = df['Name'].str.upper()
df.loc[0, 'Name']

'BRAUND, MR. OWEN HARRIS'

In [5]:
# With no separator given, str.split() splits on whitespace, so each name
# becomes a list of tokens. Note that the result is a Series, not a DataFrame.
df_new = df.Name.str.split()
df_new

0                           [BRAUND,, MR., OWEN, HARRIS]
1      [CUMINGS,, MRS., JOHN, BRADLEY, (FLORENCE, BRI...
2                             [HEIKKINEN,, MISS., LAINA]
3      [FUTRELLE,, MRS., JACQUES, HEATH, (LILY, MAY, ...
4                          [ALLEN,, MR., WILLIAM, HENRY]
                             ...                        
886                            [MONTVILA,, REV., JUOZAS]
887                    [GRAHAM,, MISS., MARGARET, EDITH]
888       [JOHNSTON,, MISS., CATHERINE, HELEN, "CARRIE"]
889                           [BEHR,, MR., KARL, HOWELL]
890                              [DOOLEY,, MR., PATRICK]
Name: Name, Length: 891, dtype: object

- unique value만 출력하기


In [6]:
# Peek at the first five values of the Sex column.
df.Sex.head()

0      male
1    female
2    female
3    female
4      male
Name: Sex, dtype: object

In [7]:
# unique() returns the distinct values of the column as a NumPy array.
df.Sex.unique()

array(['male', 'female'], dtype=object)

- Rolling sum, average

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rolling.html#pandas.DataFrame.rolling

In [8]:
# Small single-column frame used to demonstrate rolling windows.
df_raw = pd.DataFrame([3, 5, 10, 20])
df_raw

Unnamed: 0,0
0,3
1,5
2,10
3,20


In [9]:
# rolling(3) builds a window object covering each row and the two rows before
# it. Despite the variable name, nothing is aggregated until a method such as
# sum() or mean() is called on it.
df_runningavg = df_raw.rolling(3)
# The first two rows are NaN because their windows hold fewer than 3 values.
df_runningavg.sum()

Unnamed: 0,0
0,
1,
2,18.0
3,35.0


In [10]:
# Same 3-row window as above, averaged instead of summed.
df_runningavg.mean()

Unnamed: 0,0
0,
1,
2,6.0
3,11.666667


- Explode

https://pandas.pydata.org/docs/reference/api/pandas.Series.explode.html#pandas.Series.explode

In [11]:
# A Series mixing lists, a plain string and an empty list, to show how
# explode() treats each kind of element.
s = pd.Series([[1, 2, 3], 'foo', [], [3, 4]])
s

0    [1, 2, 3]
1          foo
2           []
3       [3, 4]
dtype: object

In [12]:
# Each list element gets its own row and the original index label is repeated;
# the plain string is kept as-is and the empty list becomes NaN.
s.explode()

0      1
0      2
0      3
1    foo
2    NaN
3      3
3      4
dtype: object

In [13]:
# Each cell holds one comma-separated string (not a list) of artist names.
df_exp = pd.DataFrame(["Jung Kook, Latto", "Taylor Swift", "BTS, IU"], columns=['artist'])
df_exp

Unnamed: 0,artist
0,"Jung Kook, Latto"
1,Taylor Swift
2,"BTS, IU"


In [14]:
# explode() only expands list-like cells. These cells are plain strings, so
# the frame comes back unchanged; the strings must be split into lists first.
df_exp.explode('artist')

Unnamed: 0,artist
0,"Jung Kook, Latto"
1,Taylor Swift
2,"BTS, IU"


In [15]:
# Split each string on ', ' so that every cell becomes a list of artist names.
df_exp['artist'].str.split(', ')

0    [Jung Kook, Latto]
1        [Taylor Swift]
2             [BTS, IU]
Name: artist, dtype: object

In [16]:
# Once the cells are lists, explode() yields one row per artist and repeats
# the original index label for artists that shared a row.
df_exp['artist'].str.split(', ').explode()

0       Jung Kook
0           Latto
1    Taylor Swift
2             BTS
2              IU
Name: artist, dtype: object

- Filling Na

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html

In [17]:
# A one-column frame whose last row is missing (NaN), used for the fillna demo.
df_country = pd.DataFrame({'country': ['KR', 'US', np.nan]})
df_country

Unnamed: 0,country
0,KR
1,US
2,


In [18]:
# Replace missing values with a placeholder label. fillna() returns a new
# frame, which is bound back to the same name explicitly instead of using
# inplace=True, so the data flow is visible and the call can be chained.
df_country = df_country.fillna("Unknown")
df_country

Unnamed: 0,country
0,KR
1,US
2,Unknown


- .any()

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.any.html

In [19]:
# any() reduces each column to a single bool: True when at least one element
# in that column is truthy.
df_country.any()

country    True
dtype: bool

In [20]:
# Compare every cell with "JP" first; any() then reports, per column, whether
# at least one cell matched.
(df_country == "JP").any()

country    False
dtype: bool

- 복수의 조건으로 불리언 인덱싱

In [23]:
# Python's `or` needs a single True/False from its operands, but comparing a
# DataFrame yields an element-wise boolean frame, so this expression raises
# ValueError. The error is caught and printed so that the demonstration does
# not interrupt "Restart & Run All".
try:
    df_country[df_country == "KR" or df_country == "US"]
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [24]:
# Combine element-wise conditions with | (OR) instead of the `or` keyword.
# NOTE: the mask here is a boolean DataFrame, so non-matching cells are shown
# as NaN in the result rather than their rows being dropped.
df_country[(df_country == "KR") | (df_country == "US")]
# For an AND condition use &
# Each condition must be wrapped in parentheses (| and & bind tighter than ==)

Unnamed: 0,country
0,KR
1,US
2,


- 개수 세기

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.value_counts.html#pandas.DataFrame.value_counts

In [25]:
# Toy animal table (one row per animal) reused by the counting, apply and
# groupby examples.
df_counts = pd.DataFrame(
    data={
        'num_legs': [2, 4, 4, 6],
        'num_wings': [2, 0, 0, 0],
    },
    index=['falcon', 'dog', 'cat', 'ant'],
)
df_counts

Unnamed: 0,num_legs,num_wings
falcon,2,2
dog,4,0
cat,4,0
ant,6,0


In [26]:
# value_counts() tallies how often each distinct value occurs, most frequent first.
df_counts.num_legs.value_counts()

4    2
2    1
6    1
Name: num_legs, dtype: int64

In [27]:
# Compute the tally once and reuse it: the index holds the distinct values,
# and .values holds how many times each one occurred.
legs_value_counts = df_counts.num_legs.value_counts()
print(legs_value_counts.index)
print(legs_value_counts.values)

Int64Index([4, 2, 6], dtype='int64')
[2 1 1]


- apply()

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html#pandas.DataFrame.apply

https://pandas.pydata.org/docs/reference/api/pandas.Series.apply.html#pandas.Series.apply

In [28]:
# Show the frame again before derived columns are added to it.
df_counts

Unnamed: 0,num_legs,num_wings
falcon,2,2
dog,4,0
cat,4,0
ant,6,0


In [29]:
def my_func(x):
    """Return ``x`` raised to the power of two."""
    squared = x ** 2
    return squared

# Series.apply() calls my_func once per element; the results are stored as a
# new column on the same frame.
df_counts["squared_legs"] = df_counts["num_legs"].apply(my_func)
df_counts

Unnamed: 0,num_legs,num_wings,squared_legs
falcon,2,2,4
dog,4,0,16
cat,4,0,16
ant,6,0,36


In [30]:
# Same idea with an inline lambda instead of a named function: square each
# num_wings value and store the result as a new column.
df_counts["squared_wings"] = df_counts.num_wings.apply(lambda x: x**2)
df_counts

Unnamed: 0,num_legs,num_wings,squared_legs,squared_wings
falcon,2,2,4,4
dog,4,0,16,0
cat,4,0,16,0
ant,6,0,36,0


- groupby

https://pandas.pydata.org/docs/reference/api/pandas.Series.groupby.html#pandas.Series.groupby

In [31]:
# groupby() on its own returns a DataFrameGroupBy object; nothing is
# aggregated until a method such as sum() or mean() is called on it.
df_counts.groupby("num_legs")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fd9087bdf70>

In [32]:
# Sum the remaining columns within each num_legs group; the grouping column
# becomes the index of the result.
df_counts.groupby("num_legs").sum()

Unnamed: 0_level_0,num_wings,squared_legs,squared_wings
num_legs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,2,4,4
4,0,32,0
6,0,36,0


In [33]:
# Same grouping, but averaging each column within a group instead of summing.
df_counts.groupby("num_legs").mean()

Unnamed: 0_level_0,num_wings,squared_legs,squared_wings
num_legs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,2.0,4.0,4.0
4,0.0,16.0,0.0
6,0.0,36.0,0.0


In [34]:
# Group by a computed boolean Series (more than 3 legs or not) rather than by
# a column name. Because the key is not a column of the frame, num_legs itself
# is also summed in the result.
df_counts.groupby(df_counts.num_legs > 3).sum()

Unnamed: 0_level_0,num_legs,num_wings,squared_legs,squared_wings
num_legs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,2,2,4,4
True,14,0,68,0
