#### 정규표현식(Regular expression)
- 특정한 패턴과 일치하는 문자열을 검색, 치환, 제거하는 기능 지원
- re 모듈

- 메타 문자 : . ^ $ * + ? {} [] \ | ()
- 정규식을 이용한 문자열 검색
    - match() : 문자열의 처음부터 정규식과 매치되는지 검사
    - search() : 문자열 전체를 검색하여 정규식과 매치되는지 조사
    - findall() : 정규식과 매치되는 모든 문자열을 리스트로 리턴
    - finditer() : 정규식과 매치되는 모든 문자열을 iterator 객체로 리턴


```
- * : 반복, 0 ~ 무한대
- ca*t : ct, cat, caaaaaat

- + : 반복, 1 ~ 무한대
- ca+t : ct(X), cat, caaaaaat

- {m} : m - 원하는 반복 횟수
- ca{2}t : cat(X), caat

- {m,n} : m - 최소 반복 횟수, n - 최대 반복 횟수
- ca{2,5}t : cat(X), caat, caaaaat

- ? : 0~1
- ab?c : abc, ac
```

In [18]:
# Regular expressions are an important everyday tool in Python.
# [abc] : character class - matches one character that is "a", "b" or "c".

import re

# Compile the pattern once (JavaScript would write the literal /[abc]/).
p = re.compile("[abc]")
# Subject text the pattern is tested against.
string = "a"

# match() only succeeds when the pattern fits at the very start of the text.
m = p.match(string)
print(m)

<re.Match object; span=(0, 1), match='a'>


In [19]:
# . : matches any single character except a newline ("\n").

p = re.compile("D.A")
m = p.match("DAA")
# One value per line: the match object, its start index, its end index,
# the matched text, and the (start, end) span.
print(m, m.start(), m.end(), m.group(), m.span(), sep="\n")

<re.Match object; span=(0, 3), match='DAA'>
0
3
DAA
(0, 3)


In [20]:
# Quantifier demos: every (pattern, text) pair is compiled, matched from the
# start of the text, and the resulting match object (or None) is printed.
#   ?     : zero or one occurrence of the preceding character
#   *     : zero or more occurrences
#   +     : one or more occurrences
#   {2,5} : between two and five occurrences
cases = [
    ("ab?c", "abc"),
    ("ab?c", "ac"),
    ("ab*c", "ac"),
    ("ab*c", "abbbbbbbbbbc"),
    ("ab+c", "ac"),                # no "b" at all, so "+" fails
    ("ab+c", "abbbbbbbbbbbbc"),
    ("ab{2,5}c", "abc"),           # a single "b" is below the minimum of two
    ("ab{2,5}c", "abbbbbc"),
]

for pattern, text in cases:
    p = re.compile(pattern)
    m = p.match(text)
    print(m)


<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(0, 2), match='ac'>
<re.Match object; span=(0, 2), match='ac'>
<re.Match object; span=(0, 12), match='abbbbbbbbbbc'>
None
<re.Match object; span=(0, 14), match='abbbbbbbbbbbbc'>
None
<re.Match object; span=(0, 7), match='abbbbbc'>


In [21]:
# [a-zA-Z] : any alphabetic letter
# [0-9]    : any digit
# [bcd]    : one of b, c or d
# [^bcd]   : any character except b, c and d

text = "abc1234"
# The last pattern is a negated class, so it cannot match the leading "a".
for pattern in ("[a-gA-G]", "[a-gA-G0-9]", "[^a-gA-G0-9]"):
    p = re.compile(pattern)
    m = p.match(text)
    print(m)





<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(0, 1), match='a'>
None


- 자주 사용하는 문자 클래스
    - \d == [0-9], \D == [^0-9]
    - \w == [a-zA-Z0-9_], \W == [^a-zA-Z0-9_]
    - \s == [ \t\n\r\f\v], \S == [^ \t\n\r\f\v] (공백 문자)

In [22]:
# \w inside a character class followed by "+" consumes a run of word
# characters; here that is the entire sample text.
text = "abc1234"
p = re.compile(r"[\w]+")
m = p.match(text)
print(m)

<re.Match object; span=(0, 7), match='abc1234'>


In [23]:
# [.] or \. : a literal full stop. Inside [] (or when escaped) "." is no longer
# the match-any-character wildcard.

p = re.compile(r"a[.]b")
# "acb" has no literal dot and fails; "a.b" matches.
for candidate in ("acb", "a.b"):
    m = p.match(candidate)
    print(m)

None
<re.Match object; span=(0, 3), match='a.b'>


In [37]:

# match  : tests the pattern only at the beginning of the text
# search : scans the whole text for the first place the pattern fits

p = re.compile("[a-z]+")
text = "3 python"

# The text starts with a digit, so match() gives None while search() finds
# "python" further along.
for finder in (p.match, p.search):
    m = finder(text)
    print(m)


None
<re.Match object; span=(2, 8), match='python'>


In [38]:
# findall() returns every match as a list of strings; finditer() returns an
# iterator that produces a match object per hit.
# `p` is whatever pattern the preceding cell compiled.
sentence = "Game of Life is too short"

result = p.findall(sentence)
print(result)

result = p.finditer(sentence)
print(result)
for r in result:
    # r[0] is the whole matched text, the same value as r.group().
    print(r[0])

['ame', 'of', 'ife', 'is', 'too', 'short']
<callable_iterator object at 0x00000237389B0640>
ame
of
ife
is
too
short


In [26]:
# The module-level re.match() takes the pattern and the text in a single call,
# so no separate re.compile() step is needed for a one-off pattern.
m = re.match(r"[a-z]+","abcdef")
print(m)

<re.Match object; span=(0, 6), match='abcdef'>


In [39]:
# Split on a separator described by a pattern: a space, exactly two capital
# letters, then another space (" VS " in the sample text).
p = re.compile(" [A-Z]{2} ")

string = "python VS java"
splited = p.split(string)
print(splited)

['python', 'java']


In [28]:
# When the separator is a fixed string, plain str.split() gives the same
# result without a regular expression.
string = "python VS java"
string.split(" VS ")

['python', 'java']

In [29]:
string = "801210-1011233"
# Replace "-" with "*" and show the result.

# Equivalent call without a regular expression:
# string.replace("-", "*")

re.sub("-", "*", string)

'801210*1011233'

In [30]:
# data_kr.xlsx
# Mask the 7-digit back part of every resident registration number with
# asterisks and print the result, one row per line.

from openpyxl import load_workbook
wb = load_workbook("./data_kr.xlsx")
ws = wb.active

# Any run of seven consecutive digits is replaced.
p = re.compile("[0-9]{7}")

try:
    for row in ws.rows:
        value = row[1].value
        # sub() only accepts text; an empty or non-text cell is printed
        # unchanged instead of raising TypeError.
        print(p.sub("*******", value) if isinstance(value, str) else value)
finally:
    # Release the workbook even when a row fails.
    wb.close()
    

주민등록번호
800215-*******
821030-*******
841230-*******
790903-*******
800125-*******
820612-*******


In [31]:
# Remove the markup and keep only the text between the tags.
# Open question from the exercise: how should the pattern be written so that
# it copes with many different tags?
string = "<b>아이폰</b>"

# "<", then as few characters as possible (non-greedy +?), then ">".
p = re.compile("<.+?>")
p.sub("", string)

'아이폰'

In [40]:
string = 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)'

# "Mr" immediately followed by a literal dot; "Mrs." does not satisfy it.
p = re.compile(r"Mr\.")
# p.search(string)
p.findall(string)

# An empty list is falsy, so it can be tested directly in a condition.
list1 = []

print("True" if list1 else "False")

False


In [33]:
# Read train.xlsx and print every passenger name that contains "Mr.".

wb = load_workbook("./train.xlsx")
ws = wb.active

p = re.compile(r"Mr\.")

try:
    # min_row=2 starts below the header row.
    for row in ws.iter_rows(min_row=2):
        name = row[3].value

        # search() is enough for a yes/no test. Blank or non-text cells are
        # skipped instead of raising TypeError inside the regex call.
        matched = p.search(name) if isinstance(name, str) else None
        if matched:
            print(name)
finally:
    # Close the workbook even if a row could not be processed.
    wb.close()

Braund, Mr. Owen Harris
Allen, Mr. William Henry
Moran, Mr. James
McCarthy, Mr. Timothy J
Saundercock, Mr. William Henry
Andersson, Mr. Anders Johan
Williams, Mr. Charles Eugene
Fynney, Mr. Joseph J
Beesley, Mr. Lawrence
Sloper, Mr. William Thompson
Emir, Mr. Farred Chehab
Fortune, Mr. Charles Alexander
Todoroff, Mr. Lalio
Wheadon, Mr. Edward H
Meyer, Mr. Edgar Joseph
Holverson, Mr. Alexander Oskar
Mamee, Mr. Hanna
Cann, Mr. Ernest Charles
Kraeff, Mr. Theodor
Rogers, Mr. William John
Lennon, Mr. Denis
Samaan, Mr. Youssef
Nosworthy, Mr. Richard Cater
Ostby, Mr. Engelhart Cornelius
Woolner, Mr. Hugh
Novel, Mr. Mansouer
Sirayanian, Mr. Orsen
Harris, Mr. Henry Birkhardt
Stewart, Mr. Albert A
Crease, Mr. Ernest James
Kink, Mr. Vincenz
Jenkin, Mr. Stephen Curnow
Hood, Mr. Ambrose Jr
Chronopoulos, Mr. Apostolos
Bing, Mr. Lee
Moen, Mr. Sigurd Hansen
Staneff, Mr. Ivan
Moutal, Mr. Rahamin Haim
Waelens, Mr. Achille
Sheerlinck, Mr. Jan Baptist
Carrau, Mr. Francisco M
Ford, Mr. William Neal
Slocovs

In [34]:
from openpyxl import Workbook, load_workbook
import re
# Classify each passenger by the title in the name: Mr., Miss., Mrs., or
# anything else (including names with no title at all).
# Copy the rows to the sheets "남성", "미혼여성", "기혼여성" and "기타" and save
# the result as train_gender.xlsx.
wb = load_workbook("./train.xlsx")
ws = wb.active

# Build the output workbook with one sheet per group.
wb1 = Workbook()
ws1 = wb1.active
ws1.title = "남성"
ws2 = wb1.create_sheet(title="미혼여성")
ws3 = wb1.create_sheet(title="기혼여성")
ws4 = wb1.create_sheet(title="기타")
# First run of letters that is immediately followed by a dot.
p = re.compile(r"[a-zA-Z]+\.")

# Title text -> destination sheet; every other title falls back to ws4.
sheet_by_title = {"Mr.": ws1, "Miss.": ws2, "Mrs.": ws3}

for row in ws.iter_rows():
    values = [col.value for col in row]

    if row[0].row == 1:
        # The header row is copied to every output sheet.
        titles = values
        for sheet in (ws1, ws2, ws3, ws4):
            sheet.append(titles)
        continue

    name = row[3].value
    # search() returns None when the name holds no title (or the cell is not
    # text). Such rows go to the fallback sheet instead of raising
    # AttributeError on `.group()`.
    found = p.search(name) if isinstance(name, str) else None
    matched = found.group() if found else None
    sheet_by_title.get(matched, ws4).append(values)

wb1.save("./train_gender.xlsx")

wb1.close()
wb.close()

In [35]:
# List comprehension
# Build the integers 1 to 100.
# The equivalent explicit loop would be:
# list1 = []
# for i in range(1,101):
#     list1.append(i)

list1 = [i for i in range(1,101)]
list1

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100]

In [36]:
# Report sheet
# Columns: category, survivors, deaths, survival rate
# survivors     => rows whose survived column equals 1
# deaths        => every other row (survived == 0)
# survival rate => survivors / (survivors + deaths) * 100

from openpyxl import Workbook, load_workbook
import re
# Classify each passenger by the title in the name: Mr., Miss., Mrs., or
# anything else (including names with no title at all).
# Copy the rows to one sheet per group, add the report sheet and save
# everything as train_gender.xlsx.
wb = load_workbook("./train.xlsx")
ws = wb.active

# Build the output workbook: one sheet per group plus the report sheet.
wb1 = Workbook()
ws1 = wb1.active
ws1.title = "남성"
ws2 = wb1.create_sheet(title="미혼여성")
ws3 = wb1.create_sheet(title="기혼여성")
ws4 = wb1.create_sheet(title="기타")
ws5 = wb1.create_sheet(title="보고서")
# First run of letters that is immediately followed by a dot.
p = re.compile(r"[a-zA-Z]+\.")

group_sheets = (ws1, ws2, ws3, ws4)
# Title text -> destination sheet; every other title falls back to ws4.
sheet_by_title = {"Mr.": ws1, "Miss.": ws2, "Mrs.": ws3}
# Sheet title -> [survivor count, death count], kept in report order.
counts = {sheet.title: [0, 0] for sheet in group_sheets}

for row in ws.iter_rows():
    values = [col.value for col in row]

    if row[0].row == 1:
        # The header row is copied to every group sheet.
        titles = values
        for sheet in group_sheets:
            sheet.append(titles)
        continue

    name = row[3].value
    # search() returns None when the name holds no title (or the cell is not
    # text). Such rows go to the fallback sheet instead of raising
    # AttributeError on `.group()`.
    found = p.search(name) if isinstance(name, str) else None
    matched = found.group() if found else None
    target = sheet_by_title.get(matched, ws4)
    target.append(values)
    # Slot 0 counts survivors (survived == 1); slot 1 counts everyone else.
    counts[target.title][0 if row[1].value == 1 else 1] += 1

# Write the report sheet.
ws5.append(["분류", "생존자수", "사망자수", "생존률"])
rates = []
for label, (survived, unsurvived) in counts.items():
    total = survived + unsurvived
    # A group without any rows has no defined rate; report 0.00% rather than
    # dividing by zero.
    rate = "%.2f%%" % (survived / total * 100 if total else 0.0)
    ws5.append([label, survived, unsurvived, rate])
    rates.append(rate)

# Keep the original top-level names available to anything that reads them
# after this cell.
man_survived, man_unsurvived = counts["남성"]
single_survived, single_unsurvived = counts["미혼여성"]
married_survived, married_unsurvived = counts["기혼여성"]
other_survived, other_unsurvived = counts["기타"]
rate1, rate2, rate3, rate4 = rates

wb1.save("./train_gender.xlsx")

wb1.close()
wb.close()