## Codebook을 만들기 위한 함수 제작용 주피터 파일. 
### 본 파일은 'CodebookModule.py'

In [None]:
# input: list with string elements
# output: numpy array

### NLP 복습 - by nltk

참고<br>
https://datascienceschool.net/03%20machine%20learning/03.01.01%20NLTK%20%EC%9E%90%EC%97%B0%EC%96%B4%20%EC%B2%98%EB%A6%AC%20%ED%8C%A8%ED%82%A4%EC%A7%80.html

#### Import & Load Data

In [1]:
# Natural Language Toolkit
!pip install nltk



In [2]:
import nltk
# 연구용 말뭉치 자료 다운로드, 그 중 책 자료. 
# quiet=True: 다운로드 과정에서 출력되는 로그 메세지 숨기기
nltk.download("book", quiet=True)
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [3]:
# corpus(말뭉치) 중 gutenberg(저작권이 말소된 문학작품)의 file IDs
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [4]:
# 그 중 Jane Austen의 Emma를 보면 다음과 같은 string의 형태이다. 
emma_raw = nltk.corpus.gutenberg.raw("austen-emma.txt")
print(emma_raw[:1302])

[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.

She was the youngest of the two daughters of a most affectionate,
indulgent father; and had, in consequence of her sister's marriage,
been mistress of his house from a very early period.  Her mother
had died too long ago for her to have more than an indistinct
remembrance of her caresses; and her place had been supplied
by an excellent woman as governess, who had fallen little short
of a mother in affection.

Sixteen years had Miss Taylor been in Mr. Woodhouse's family,
less as a governess than a friend, very fond of both daughters,
but particularly of Emma.  Between _them_ it was more the intimacy
of sisters.  Even before Miss Taylor had ceased to hold the nominal
office of governess, the mildness o

#### Tokenize

In [5]:
# sentence 별로 나누기
from nltk.tokenize import sent_tokenize
print(sent_tokenize(emma_raw[:1000])[3])

Sixteen years had Miss Taylor been in Mr. Woodhouse's family,
less as a governess than a friend, very fond of both daughters,
but particularly of Emma.


In [6]:
# word 단위로 나누기
from nltk.tokenize import word_tokenize
word_tokenize(emma_raw[50:100])

['Emma',
 'Woodhouse',
 ',',
 'handsome',
 ',',
 'clever',
 ',',
 'and',
 'rich',
 ',',
 'with',
 'a']

In [7]:
# RegexpTokenizer: 정규표현식(Regular expression)을 이용해 tokenize
from nltk.tokenize import RegexpTokenizer
# \w(alphabet, numbers) 패턴을 사용해 tokenize
retokenize = RegexpTokenizer("[\w]+")
emma_ret = retokenize.tokenize(emma_raw)

#### Morpheme(형태소) 분석

In [8]:
# Stemming: 변화된 단어의 접미사, 어미 제거 -> 형태소의 기본형 복구를 도움(정확히 기본형이 되는건 아님)
from nltk.stem import PorterStemmer, LancasterStemmer

# PorterStemmer, LancasterStemmer라는 두 가지 Stemming 알고리즘이 있음. 
# PorterStemmer: 접두사, 접미사를 제거해 더 일반적인 단어의 어간 생성. 단어의 원형에 더 가까우나 시간이 조금 더 오래 걸림
st1 = PorterStemmer()
# LancasterStemmer: 접두사, 접미사를 더 많이 제거해 짧은 어간을 생성. 시간은 조금 덜 걸림. 
st2 = LancasterStemmer()

words = ["fly", "flies", "flying", "flew", "flown"]

# words에 있는 각각의 words를 st1(Porter algorithm), st2(Lancaster algorithm)으로 stemming
print("Porter Stemmer   :", [st1.stem(w) for w in words])
print("Lancaster Stemmer    :", [st2.stem(w) for w in words])

emma_morph = [st1.stem(w) for w in emma_ret]

Porter Stemmer   : ['fli', 'fli', 'fli', 'flew', 'flown']
Lancaster Stemmer    : ['fly', 'fli', 'fly', 'flew', 'flown']


In [9]:
# Lemmatizing: 같은 의미를 가지는 여러 단어를 사전형으로 통일하는 작업. 품사(POS, Part of speech)를 지정하는 경우 더 정확한 원형을 찾을 수 있다. 
# WordNetLemmatizer: WordNet 데이터베이스를 기반으로 Lemmatize
from nltk.stem import WordNetLemmatizer

lm = WordNetLemmatizer()

# words의 원소 w들 중 pos="v"(동사)인 단어에 대해 동사 원형을 추출한다. 
[lm.lemmatize(w, pos="v") for w in words]

emma_tokens = [lm.lemmatize(w) for w in emma_morph]

#### 품사 부착
NLTK에서는 'Penn Treebank Tagset'을 사용해 품사를 부착한다. <br>
- NNP: 단수 고유명사
- VB: 동사
- VBP: 동사 현재형
- TO: to 전치사
- NN: 명사(단수형 혹은 집합형)
- DT: 관형사
...

In [10]:
# pos_tag: (단어 토큰, 품사)의 튜플 출력
from nltk.tag import pos_tag
sentence = "Emma refused to permit us to obtain the refuse permit"
tagged_list = pos_tag(word_tokenize(sentence))
tagged_list

[('Emma', 'NNP'),
 ('refused', 'VBD'),
 ('to', 'TO'),
 ('permit', 'VB'),
 ('us', 'PRP'),
 ('to', 'TO'),
 ('obtain', 'VB'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]

In [11]:
# untag: 품사가 tagging된 튜플 제거
from nltk.tag import untag
untag(tagged_list)

['Emma',
 'refused',
 'to',
 'permit',
 'us',
 'to',
 'obtain',
 'the',
 'refuse',
 'permit']

In [12]:
# 같은 단어더라도 품사가 다르면 다른 토큰으로 처리해야 할 때(ex. Scikit-Learn)
def tokenizer(doc):
    # 튜플의 원소 사이에 "/" 를 넣어 합친다. 
    return ["/".join(p) for p in tagged_list]

tokenizer(sentence)

['Emma/NNP',
 'refused/VBD',
 'to/TO',
 'permit/VB',
 'us/PRP',
 'to/TO',
 'obtain/VB',
 'the/DT',
 'refuse/NN',
 'permit/NN']

#### Data Loading

In [13]:
token_data = emma_tokens
token_data

['emma',
 'by',
 'jane',
 'austen',
 '1816',
 'volum',
 'i',
 'chapter',
 'i',
 'emma',
 'woodhous',
 'handsom',
 'clever',
 'and',
 'rich',
 'with',
 'a',
 'comfort',
 'home',
 'and',
 'happi',
 'disposit',
 'seem',
 'to',
 'unit',
 'some',
 'of',
 'the',
 'best',
 'bless',
 'of',
 'exist',
 'and',
 'had',
 'live',
 'nearli',
 'twenti',
 'one',
 'year',
 'in',
 'the',
 'world',
 'with',
 'veri',
 'littl',
 'to',
 'distress',
 'or',
 'vex',
 'her',
 'she',
 'wa',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughter',
 'of',
 'a',
 'most',
 'affection',
 'indulg',
 'father',
 'and',
 'had',
 'in',
 'consequ',
 'of',
 'her',
 'sister',
 's',
 'marriag',
 'been',
 'mistress',
 'of',
 'hi',
 'hous',
 'from',
 'a',
 'veri',
 'earli',
 'period',
 'her',
 'mother',
 'had',
 'die',
 'too',
 'long',
 'ago',
 'for',
 'her',
 'to',
 'have',
 'more',
 'than',
 'an',
 'indistinct',
 'remembr',
 'of',
 'her',
 'caress',
 'and',
 'her',
 'place',
 'had',
 'been',
 'suppli',
 'by',
 'an',
 'excel',


### 1. Horizontal Sliding

In [47]:
hor = {'hor1' : 'qwertyuiop', 'hor2' : 'asdfghjkl', 'hor3' : 'zxcvbnm'}
coded_data = []

In [87]:
ht = 'qwertyuiop'
ct = ''
endpoint = 7
startpoint = 3
tl = list(range(endpoint, startpoint-1, -1 if endpoint > startpoint else 1))
ct += ht[i for i in tl]


SyntaxError: invalid syntax (3789968663.py, line 6)

In [54]:
import pandas as pd
a = [1,2,3]
b = [4,5,6]
ab = {a[i] : b[i] for i in range(len(a))}
ab

{1: 4, 2: 5, 3: 6}

In [111]:
token_data = ['arithmeticerror'] # artyuiuythmertyuicerrtyuioiuytr

for token in token_data:
    coded_temp = ''
    for a_index in range(len(token)):
        if a_index + 1 == len(token):
            coded_data.append(coded_temp + token[a_index])
            break
        # 1. 연속된 두 단어가 같은 horizontal line 상에 있는지 확인
        if (token[a_index] in hor['hor1']) & (token[a_index + 1] in hor['hor1']):
            # startpoint, endpoint에 각각의 인덱스 저장
            startpoint = hor['hor1'].index(token[a_index])
            endpoint = hor['hor1'].index(token[a_index + 1])
            if startpoint == endpoint:
                coded_temp += hor['hor1'][startpoint]
                continue
            for i in range(startpoint, endpoint, 1 if endpoint > startpoint else -1):
                coded_temp += hor['hor1'][i]
        elif (token[a_index] in hor['hor2']) & (token[a_index + 1] in hor['hor2']):
            # startpoint, endpoint에 각각의 인덱스 저장
            startpoint = hor['hor2'].index(token[a_index])
            endpoint = hor['hor2'].index(token[a_index + 1])
            if startpoint == endpoint:
                coded_temp += hor['hor1'][startpoint]
                continue
            for i in range(startpoint, endpoint, 1 if endpoint > startpoint else -1):
                coded_temp += hor['hor2'][i]
        elif (token[a_index] in hor['hor3']) & (token[a_index + 1] in hor['hor3']):
            # startpoint, endpoint에 각각의 인덱스 저장
            startpoint = hor['hor3'].index(token[a_index])
            endpoint = hor['hor3'].index(token[a_index + 1])
            if startpoint == endpoint:
                coded_temp += hor['hor1'][startpoint]
                continue
            for i in range(startpoint, endpoint, 1 if endpoint > startpoint else -1):
                coded_temp += hor['hor3'][i]
        else:
            coded_temp += token[a_index]

# 2-1. 없으면 그대로 저장

# 2-2. 있으면 사이에 sliding 추가
    

In [122]:
coded_data

['aiuyttyuthmtriuyicroiuytrtyui',
 'aiuyttyuthmtriuyicroiuytrtyui',
 'aiuyttyuthmtriuyicroiuytrtyui',
 'artyuiuythmertyuicertyuioiuyt',
 'artyuiuythmertyuicertyuioiuyt',
 'artyuiuythmertyuicertyuioiuyt',
 'artyuiuythmertyuicertyuioiuytr',
 'artyuiuythmertyuicertyuioiuytr',
 'artyuiuythmertyuicerrtyuioiuytr']

In [113]:
print(coded_data[8] == 'artyuiuythmertyuicerrtyuioiuytr')
print(coded_data[8])
print('artyuiuythmertyuicerrtyuioiuytr')

True
artyuiuythmertyuicerrtyuioiuytr
artyuiuythmertyuicerrtyuioiuytr


In [22]:
# 함수화 하기
def Codebook_HorizontalSliding(vocab:list):
    hor = {'hor1' : 'qwertyuiop', 'hor2' : 'asdfghjkl', 'hor3' : 'zxcvbnm'}
    coded_data = []
    for token in vocab:
        coded_temp = ''
        for a_index in range(len(token)):
            # out of range 처리
            if a_index + 1 == len(token):
                coded_data.append(coded_temp + token[a_index])
                break
            # 1. 연속된 두 단어가 같은 horizontal line 상에 있는지 확인
            if (token[a_index] in hor['hor1']) & (token[a_index + 1] in hor['hor1']):
                # startpoint, endpoint에 각각의 인덱스 저장
                startpoint = hor['hor1'].index(token[a_index])
                endpoint = hor['hor1'].index(token[a_index + 1])
                # 예외처리: startpoint, endpoint가 같은 경우
                if startpoint == endpoint:
                    coded_temp += token[a_index]
                    continue
                for i in range(startpoint, endpoint, 1 if endpoint > startpoint else -1):
                    coded_temp += hor['hor1'][i]
            elif (token[a_index] in hor['hor2']) & (token[a_index + 1] in hor['hor2']):
                # startpoint, endpoint에 각각의 인덱스 저장
                startpoint = hor['hor2'].index(token[a_index])
                endpoint = hor['hor2'].index(token[a_index + 1])
                # 예외처리: startpoint, endpoint가 같은 경우
                if startpoint == endpoint:
                    coded_temp += token[a_index]
                    continue
                for i in range(startpoint, endpoint, 1 if endpoint > startpoint else -1):
                    coded_temp += hor['hor2'][i]
            elif (token[a_index] in hor['hor3']) & (token[a_index + 1] in hor['hor3']):
                # startpoint, endpoint에 각각의 인덱스 저장
                startpoint = hor['hor3'].index(token[a_index])
                endpoint = hor['hor3'].index(token[a_index + 1])
                # 예외처리: startpoint, endpoint가 같은 경우
                if startpoint == endpoint:
                    coded_temp += token[a_index]
                    continue
                for i in range(startpoint, endpoint, 1 if endpoint > startpoint else -1):
                    coded_temp += hor['hor3'][i]
            else:
                coded_temp += token[a_index]
    codebook = {vocab[i] : coded_data[i] for i in range(len(vocab))}
    return codebook

In [23]:
Codebook_HorizontalSliding(emma_tokens)

{'emma': 'emma',
 'by': 'by',
 'jane': 'jhgfdsane',
 'austen': 'austren',
 '1816': '1816',
 'volum': 'volum',
 'i': 'i',
 'chapter': 'chgfdsapoiuytrer',
 'woodhous': 'wertyuioodfghoius',
 'handsom': 'hgfdsandsom',
 'clever': 'clever',
 'and': 'and',
 'rich': 'rtyuich',
 'with': 'wertyuiuyth',
 'a': 'a',
 'comfort': 'comfoiuytrt',
 'home': 'home',
 'happi': 'hgfdsappoi',
 'disposit': 'disposiuyt',
 'seem': 'seem',
 'to': 'tyuio',
 'unit': 'uniuyt',
 'some': 'some',
 'of': 'of',
 'the': 'the',
 'best': 'best',
 'bless': 'bless',
 'exist': 'exist',
 'had': 'hgfdsasd',
 'live': 'live',
 'nearli': 'nearli',
 'twenti': 'trewentyui',
 'one': 'one',
 'year': 'ytrear',
 'in': 'in',
 'world': 'wertyuioiuytrlkjhgfd',
 'veri': 'vertyui',
 'littl': 'liuyttl',
 'distress': 'distress',
 'or': 'oiuytr',
 'vex': 'vex',
 'her': 'her',
 'she': 'sdfghe',
 'wa': 'wa',
 'youngest': 'yuioiungest',
 'two': 'trewertyuio',
 'daughter': 'dsaughtrer',
 'most': 'most',
 'affection': 'asdffectyuion',
 'indulg': 'in

### 2. 모음대체 - QWERTY...

### 3. 모음대체 - ASDF...

### 4. 모음대체 - User가 입력한 숫자

### 5. 모음대체 - User가 입력한 키워드

### 6. 모음대체 - User가 입력한 숫자, 키워드(Upper)