In [1]:
import numpy as np
import pandas as pd

### CSV 파일 읽기

In [2]:
# Load the sample CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='test.csv')
df.head(n=5)

Unnamed: 0,name,date,num1,num2
0,String_0513,2019:09:26 09:26:43,66,1
1,String_0514,2019:09:26 09:26:51,36,78
2,String_0515,2019:09:26 09:27:04,68,79
3,String_0516,2019:09:26 09:27:11,35,7
4,String_0517,2019:09:26 09:27:19,81,38


### `str` 메서드 사용
- 파이썬 string에 존재하는 메서드 대부분을 사용할 수 있다

In [3]:
# Character count of every value in the 'name' column.
df.loc[:, 'name'].str.len()

0      11
1      11
2      11
3      11
4      11
       ..
138    11
139    11
140    11
141    11
142    11
Name: name, Length: 143, dtype: int64

In [4]:
# Lower-case every value in the 'name' column (returns a new Series; df is untouched).
(
    df['name']
    .str
    .lower()
)

0      string_0513
1      string_0514
2      string_0515
3      string_0516
4      string_0517
          ...     
138    string_0651
139    string_0652
140    string_0653
141    string_0654
142    string_0655
Name: name, Length: 143, dtype: object

In [5]:
# Upper-case every value in the 'name' column (returns a new Series; df is untouched).
(
    df['name']
    .str
    .upper()
)

0      STRING_0513
1      STRING_0514
2      STRING_0515
3      STRING_0516
4      STRING_0517
          ...     
138    STRING_0651
139    STRING_0652
140    STRING_0653
141    STRING_0654
142    STRING_0655
Name: name, Length: 143, dtype: object

In [6]:
# Split each name on the underscore; every element becomes a list of parts.
df['name'].str.split(pat='_')

0      [String, 0513]
1      [String, 0514]
2      [String, 0515]
3      [String, 0516]
4      [String, 0517]
            ...      
138    [String, 0651]
139    [String, 0652]
140    [String, 0653]
141    [String, 0654]
142    [String, 0655]
Name: name, Length: 143, dtype: object

### 정규 표현식 사용
str 속성을 통해 다양한 문자열 처리를 할 수 있다.
- match() : 각각의 원소에 `re.match()`를 호출함

In [7]:
# Boolean mask: True where the name, matched from its start, contains "_05"
# followed by two digits.
df['name'].str.match(pat=r'.*_05\d{2}')

0       True
1       True
2       True
3       True
4       True
       ...  
138    False
139    False
140    False
141    False
142    False
Name: name, Length: 143, dtype: bool

- extract() : 각각의 원소에 `re.search()`를 호출한다. 리턴값은 첫 번째 매치에서 캡처된 그룹들이며, 매치되지 않은 원소는 NaN이 된다

In [8]:
# Pull the two capture groups into columns: the text before "_05" and the
# two digits after it. Rows that do not match come back as missing values.
df['name'].str.extract(pat=r'(.*)_05(\d{2})')

Unnamed: 0,0,1
0,String,13
1,String,14
2,String,15
3,String,16
4,String,17
...,...,...
138,,
139,,
140,,
141,,


- findall() : 각각의 원소에 `re.findall()`을 호출한다

In [9]:
# List every two-digit run found in each name.
# The pattern is a raw string so the backslash reaches the regex engine as-is;
# a plain '\d' is an invalid escape sequence that newer Python versions warn about.
df['name'].str.findall(r'\d{2}')

0      [05, 13]
1      [05, 14]
2      [05, 15]
3      [05, 16]
4      [05, 17]
         ...   
138    [06, 51]
139    [06, 52]
140    [06, 53]
141    [06, 54]
142    [06, 55]
Name: name, Length: 143, dtype: object

- replace() : 매칭되는 패턴을 다른 문자열로 바꾼다 (패턴을 정규 표현식으로 처리하려면 `regex=True`를 지정한다. pandas 2.0부터 기본값은 `regex=False`이다)

In [10]:
# Replace every occurrence of the pattern 'String' with 'Revise'.
# regex is passed explicitly because its default changed from True to False in
# pandas 2.0; spelling it out keeps this a regex replacement on every version.
df['name'].str.replace('String', 'Revise', regex=True)

0      Revise_0513
1      Revise_0514
2      Revise_0515
3      Revise_0516
4      Revise_0517
          ...     
138    Revise_0651
139    Revise_0652
140    Revise_0653
141    Revise_0654
142    Revise_0655
Name: name, Length: 143, dtype: object

### 날짜 데이터 처리하기
- 날짜 타입 데이터로 변경하기

In [11]:
# Parse the colon-separated 'YYYY:MM:DD HH:MM:SS' strings into datetime64 values.
# Note: despite its name, date_df is a Series (the converted 'date' column),
# not a DataFrame.
date_df = pd.to_datetime(arg=df['date'], format='%Y:%m:%d %H:%M:%S')
date_df

0     2019-09-26 09:26:43
1     2019-09-26 09:26:51
2     2019-09-26 09:27:04
3     2019-09-26 09:27:11
4     2019-09-26 09:27:19
              ...        
138   2019-09-26 11:27:07
139   2019-09-26 11:28:00
140   2019-09-26 11:28:09
141   2019-09-26 11:28:17
142   2019-09-26 11:28:23
Name: date, Length: 143, dtype: datetime64[ns]

- 날짜 데이터를 다른 문자열 포맷으로 변경하기

In [12]:
# Render the datetime values back to strings using a dash-separated date format.
(
    date_df
    .dt
    .strftime('%Y-%m-%d %H:%M:%S')
)

0      2019-09-26 09:26:43
1      2019-09-26 09:26:51
2      2019-09-26 09:27:04
3      2019-09-26 09:27:11
4      2019-09-26 09:27:19
              ...         
138    2019-09-26 11:27:07
139    2019-09-26 11:28:00
140    2019-09-26 11:28:09
141    2019-09-26 11:28:17
142    2019-09-26 11:28:23
Name: date, Length: 143, dtype: object