In [1]:
# import packages
from num2words import num2words
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [17]:
# sample text
text = """
The standard 30-year fixed-rate mortgage averaged 6.94% in the week ending May 23, down from last week’s average of 7.02%, according to Freddie Mac data released Thursday. That’s the lowest level since early April and below the key 7% threshold.
After moving sideways throughout March, mortgage rates began to climb in late April as economic data showed that inflation’s cooldown stalled earlier in the year. Mortgage rates track the benchmark 10-year US Treasury yield, which moves in anticipation of the Federal Reserve’s decisions on interest rates. Stubbornly high inflation this year has dashed hopes that the Fed could cut interest rates in the spring or in the summer. But there’s finally been some good news on that front: The Consumer Price Index for April, released last week, showed that inflation didn’t pick up. Bond yields have mostly retreated this month.

“Spring homebuyers received an unexpected windfall this week, as mortgage rates fell below the seven percent threshold for the first time in over a month,” Sam Khater, Freddie Mac’s chief economist, said in a release.

Some Fed officials said earlier this week that they likely won’t raise interest rates again and a few have said they expect to cut rates this year. That bodes well for lower mortgage rates.

But for now, the housing market’s recovery is stagnant. Sales of previously owned homes, which make up the vast majority of the housing market, fell in April for the second month in a row, the National Association of Realtors reported Wednesday. That’s a stark contrast from earlier in the year when sales soared. 
"""
text = str(text)

In [3]:
# download from nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/encore/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/encore/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/encore/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

- 전처리 단계
1. 처리 안된 단위계들  
1-1. % -> ' percent'  
1-2. [숫자][알파뱃] or [알파뱃][숫자] -> [숫자] [알파뱃] or [알파뱃] [숫자]  
2. 숫자 처리: num2words 사용 숫자 -> 영어 변환  
2-1. [0-9]+,[0-9]+ -> [int] (ex: 24,000 -> 24000 -> [int])  
2-2. [0-9]+:[0-9]+ -> [int] [int] (ex: 10:13 -> 10 13 -> [int] [int])  
2-3. [0-9]+/[0-9]+ -> [int] [int] (ex: 10/13 -> 10 13 -> [int] [int])  
4. tokenizer로 토큰화 (from nltk.stem import WordNetLemmatizer)  
5. 's -> is   
6. 불용어 제거 (nltk.stopwords)  
7. 토큰이 특수기호로만 되여있는 case 제거  
ex) '.' '\'\'' '\' \''  
8. token 길이 1~2 이하인 것 제거 (이 case는 corpus 상황보고 결정)  

In [35]:
# data processing 
class processing:
    def __init__(self, text):
        self.title = str(text.upper())
        
    def replace_percent(self):
        self.text= re.sub(r'%', ' percent', text)
        return self.text
    
    def split_unit(self):
        def replacer(match):
            if match.group(1) and match.group(2):
                # number+alph
                return f"{match.group(1)} {match.group(2)}"
            elif match.group(3) and match.group(4):
                # alph+number
                return f"{match.group(3)} {match.group(4)}"
            return match.group(0)
        
    def delete_special(self):
        pattern = r'[^a-zA-Z0-9\s]'
        self.text = re.sub(pattern, '', self.text) 
        return self.text
    
    def number_to_word(self):
        find_num = re.findall(r'\d+', self.text)
        for num in find_num:
            self.text = re.sub(num, num2words(num), self.text)

    def process_all(self):
        self.replace_percent()
        self.split_unit()
        self.delete_special()
        self.number_to_word()
        return self.text

In [36]:
# class call
process = processing(text)
processed_text = process.process_all()

In [37]:
print(processed_text)


The standard thirtyyear fixedrate mortgage averaged six hundred and ninety-four percent in the week ending May twenty-three down from last weeks average of seven hundred and two percent according to Freddie Mac data released Thursday Thats the lowest level since early April and below the key seven percent threshold
After moving sideways throughout March mortgage rates began to climb in late April as economic data showed that inflations cooldown stalled earlier in the year Mortgage rates track the benchmark tenyear US Treasury yield which moves in anticipation of the Federal Reserves decisions on interest rates Stubbornly high inflation this year has dashed hopes that the Fed could cut interest rates in the spring or in the summer But theres finally been some good news on that front The Consumer Price Index for April released last week showed that inflation didnt pick up Bond yields have mostly retreated this month

Spring homebuyers received an unexpected windfall this week as mortgag

In [38]:
# tokenization
tokens = word_tokenize(processed_text)

In [39]:
tokens

['The',
 'standard',
 'thirtyyear',
 'fixedrate',
 'mortgage',
 'averaged',
 'six',
 'hundred',
 'and',
 'ninety-four',
 'percent',
 'in',
 'the',
 'week',
 'ending',
 'May',
 'twenty-three',
 'down',
 'from',
 'last',
 'weeks',
 'average',
 'of',
 'seven',
 'hundred',
 'and',
 'two',
 'percent',
 'according',
 'to',
 'Freddie',
 'Mac',
 'data',
 'released',
 'Thursday',
 'Thats',
 'the',
 'lowest',
 'level',
 'since',
 'early',
 'April',
 'and',
 'below',
 'the',
 'key',
 'seven',
 'percent',
 'threshold',
 'After',
 'moving',
 'sideways',
 'throughout',
 'March',
 'mortgage',
 'rates',
 'began',
 'to',
 'climb',
 'in',
 'late',
 'April',
 'as',
 'economic',
 'data',
 'showed',
 'that',
 'inflations',
 'cooldown',
 'stalled',
 'earlier',
 'in',
 'the',
 'year',
 'Mortgage',
 'rates',
 'track',
 'the',
 'benchmark',
 'tenyear',
 'US',
 'Treasury',
 'yield',
 'which',
 'moves',
 'in',
 'anticipation',
 'of',
 'the',
 'Federal',
 'Reserves',
 'decisions',
 'on',
 'interest',
 'rates',
 '

In [41]:
# init stop word
stop_words = set(nltk.corpus.stopwords.words("english"))

In [42]:
# execpt stop word
tokensWSW = [word for word in tokens if word not in stop_words]

In [43]:
tokensWSW

['The',
 'standard',
 'thirtyyear',
 'fixedrate',
 'mortgage',
 'averaged',
 'six',
 'hundred',
 'ninety-four',
 'percent',
 'week',
 'ending',
 'May',
 'twenty-three',
 'last',
 'weeks',
 'average',
 'seven',
 'hundred',
 'two',
 'percent',
 'according',
 'Freddie',
 'Mac',
 'data',
 'released',
 'Thursday',
 'Thats',
 'lowest',
 'level',
 'since',
 'early',
 'April',
 'key',
 'seven',
 'percent',
 'threshold',
 'After',
 'moving',
 'sideways',
 'throughout',
 'March',
 'mortgage',
 'rates',
 'began',
 'climb',
 'late',
 'April',
 'economic',
 'data',
 'showed',
 'inflations',
 'cooldown',
 'stalled',
 'earlier',
 'year',
 'Mortgage',
 'rates',
 'track',
 'benchmark',
 'tenyear',
 'US',
 'Treasury',
 'yield',
 'moves',
 'anticipation',
 'Federal',
 'Reserves',
 'decisions',
 'interest',
 'rates',
 'Stubbornly',
 'high',
 'inflation',
 'year',
 'dashed',
 'hopes',
 'Fed',
 'could',
 'cut',
 'interest',
 'rates',
 'spring',
 'summer',
 'But',
 'theres',
 'finally',
 'good',
 'news',
 'f

In [44]:
# convert words to base form 
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized_list = []
for word in tokensWSW:
    lemmatized_list.append(wordnet_lemmatizer.lemmatize(word))

In [45]:
lemmatized_list

['The',
 'standard',
 'thirtyyear',
 'fixedrate',
 'mortgage',
 'averaged',
 'six',
 'hundred',
 'ninety-four',
 'percent',
 'week',
 'ending',
 'May',
 'twenty-three',
 'last',
 'week',
 'average',
 'seven',
 'hundred',
 'two',
 'percent',
 'according',
 'Freddie',
 'Mac',
 'data',
 'released',
 'Thursday',
 'Thats',
 'lowest',
 'level',
 'since',
 'early',
 'April',
 'key',
 'seven',
 'percent',
 'threshold',
 'After',
 'moving',
 'sideways',
 'throughout',
 'March',
 'mortgage',
 'rate',
 'began',
 'climb',
 'late',
 'April',
 'economic',
 'data',
 'showed',
 'inflation',
 'cooldown',
 'stalled',
 'earlier',
 'year',
 'Mortgage',
 'rate',
 'track',
 'benchmark',
 'tenyear',
 'US',
 'Treasury',
 'yield',
 'move',
 'anticipation',
 'Federal',
 'Reserves',
 'decision',
 'interest',
 'rate',
 'Stubbornly',
 'high',
 'inflation',
 'year',
 'dashed',
 'hope',
 'Fed',
 'could',
 'cut',
 'interest',
 'rate',
 'spring',
 'summer',
 'But',
 'there',
 'finally',
 'good',
 'news',
 'front',
 'T