In [82]:
# import packages
import json
import re
import urllib.parse

import nltk
import pandas as pd
import sqlalchemy
import tqdm
from babel.numbers import get_currency_name
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from num2words import num2words
from sqlalchemy import text

## DB 연결

In [64]:
# Load the DB credentials from the JSON file one level above the notebook,
# then URL-quote the password so special characters are safe inside a DB URL.
with open("../conn_db.json", 'r') as conn_file:
    db_conn = json.loads(conn_file.read())
db_password = urllib.parse.quote_plus(db_conn['PASS'])


In [65]:
# Create the SQLAlchemy engine from the MySQL connection settings.
# The user name is URL-quoted just like the password, because reserved
# characters (e.g. '@', ':', '/') in either field would corrupt the URL.
db_user = urllib.parse.quote_plus(db_conn['USER'])
db_url = f"mysql://{db_user}:{db_password}@{db_conn['HOST']}:{db_conn['PORT']}/{db_conn['DB']}"
engine = sqlalchemy.create_engine(db_url)

In [181]:
# Read every row of the `english_news_lake` table into a DataFrame
news_article = pd.read_sql_query("select * from english_news_lake", con=engine)

## 데이터 전처리

In [76]:
news_article.head(5)

Unnamed: 0,dataSource,title,context
0,BBC,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,BBC,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,BBC,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,BBC,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,BBC,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


In [182]:
news_article.shape

(59686, 3)

In [183]:
news_article.isnull().sum()

dataSource      0
title           0
context       174
dtype: int64

In [184]:
# Drop every row that contains at least one null value
news_article = news_article.dropna()

In [186]:
news_article.shape

(59512, 3)

In [3]:
# Download the NLTK resources used below.
# 'punkt_tab' is what recent NLTK releases load for word_tokenize; older
# releases use 'punkt'. Requesting both keeps the notebook runnable on either
# (an unknown package only logs an error, it does not raise).
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /home/encore/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/encore/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/encore/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

- 전처리 단계
1. 처리 안된 단위계들  
1-1. % -> ' percent'  
1-2. [숫자][알파벳] or [알파벳][숫자] -> [숫자] [알파벳] or [알파벳] [숫자]  
2. 숫자 처리: num2words 사용 숫자 -> 영어 변환  
2-1. [0-9]+,[0-9]+ -> [int] (ex: 24,000 -> 24000 -> [int])  
2-2. [0-9]+:[0-9]+ -> [int] [int] (ex: 10:13 -> 10 13 -> [int] [int])  
2-3. [0-9]+/[0-9]+ -> [int] [int] (ex: 10/13 -> 10 13 -> [int] [int])  
4. tokenizer로 토큰화 (from nltk.tokenize import word_tokenize)  
5. 's -> is   
6. 불용어 제거 (nltk.stopwords)  
7. 토큰이 특수기호로만 되어 있는 case 제거  > 특수기호 모두 제거
ex) '.' '\'\'' '\' \''  
8. token 길이 1~2 이하인 것 제거 (이 case는 corpus 상황보고 결정)  

- 추가
1. 원화 기호 영어로 수정
2. \n \t 삭제

In [187]:
# data processing 
class processing:
    """Chainable text normaliser for a single news title or article body.

    The input is lower-cased on construction. Every step method rewrites
    ``self.text`` in place and returns ``self`` so steps can be chained;
    ``process_all`` runs the whole pipeline and returns the final string.

    Parameters
    ----------
    text : str
        Raw text to normalise. Must be a string (callers filter out nulls).
    """

    # Currency symbols and the ISO 4217 code whose English name replaces them.
    _CURRENCY_CODES = {
        '$': 'USD',
        '£': 'GBP',
        '€': 'EUR',
        '¥': 'JPY',
        '₩': 'KRW',
    }

    def __init__(self, text):
        self.text = text.lower()

    def replace_percent(self):
        """Replace every '%' with the word ' percent'."""
        self.text = self.text.replace('%', ' percent')
        return self

    def replace_currency_symbols(self):
        """Replace currency symbols with their lower-cased English names.

        The name is padded with spaces so it never fuses with the amount
        that follows the symbol (``$5`` -> `` us dollar 5``).
        """
        for symbol, code in self._CURRENCY_CODES.items():
            # Only look the name up when the symbol actually occurs.
            if symbol in self.text:
                name = get_currency_name(code, locale='en').lower()
                self.text = self.text.replace(symbol, f' {name} ')
        return self

    def split_unit(self):
        """Insert a space at every digit/letter boundary (``10kg`` -> ``10 kg``)."""
        # Zero-width match between a digit and a letter, in either order.
        self.text = re.sub(
            r'(?<=\d)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=\d)', ' ', self.text
        )
        return self

    def remove_thousands_separators(self):
        """Drop commas used as thousands separators (``24,000`` -> ``24000``).

        A comma is removed only when it sits between a digit and a group of
        exactly three digits, so ordinary punctuation is left untouched.
        """
        self.text = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', self.text)
        return self

    def number_to_word(self):
        """Spell out every run of digits in English using num2words."""
        self.text = re.sub(r'\d+', lambda x: num2words(int(x.group())), self.text)
        return self

    def remove_tab_enter(self):
        """Replace each tab and newline with a single space."""
        self.text = re.sub(r'\t|\n', ' ', self.text)
        return self

    def delete_special(self):
        """Replace every character that is not a letter, digit or whitespace with a space."""
        pattern = r'[^a-zA-Z0-9\s]'
        self.text = re.sub(pattern, ' ', self.text)
        return self

    def process_all(self):
        """Run the full pipeline and return the normalised text.

        Every step runs unconditionally (each one is a no-op when it has
        nothing to replace). Symbols are expanded before ``delete_special``
        strips them, and thousands separators are removed before numbers
        are spelled out so that ``24,000`` is read as one number.
        """
        self.replace_percent()
        self.replace_currency_symbols()
        self.remove_tab_enter()
        self.split_unit()
        self.remove_thousands_separators()
        self.number_to_word()
        self.delete_special()
        return self.text

In [188]:
# Run the cleaning pipeline over the title and body columns.
# Null cells are passed through unchanged; results land in `processed_<column>`.
def _clean_cell(raw):
    return processing(raw).process_all() if pd.notnull(raw) else raw

for source_col in ('title', 'context'):
    news_article[f'processed_{source_col}'] = news_article[source_col].apply(_clean_cell)


In [189]:
news_article

Unnamed: 0,dataSource,title,context,processed_title,processed_context
0,BBC,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,ad sales boost time warner profit,quarterly profits at us media giant timewarne...
1,BBC,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,dollar gains on greenspan speech,the dollar has hit its highest level against ...
2,BBC,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,yukos unit buyer faces loan claim,the owners of embattled russian oil giant yuk...
3,BBC,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...,high fuel prices hit ba s profits,british airways has blamed high fuel prices f...
4,BBC,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...,pernod takeover talk lifts domecq,shares in uk drinks and food firm allied dome...
...,...,...,...,...,...
59681,spaceNews,Kendall lays out Pentagon thinking on future s...,"\nFrank Kendall, the Pentagon’s top acquisitio...",kendall lays out pentagon thinking on future s...,frank kendall the pentagon s top acquisition...
59682,spaceNews,A larger share of NOAA’s declining space budge...,Updated Feb. 10 at 10:18 p.m. Eastern The U.S....,a larger share of noaa s declining space budge...,updated feb ten at ten eighteen p m eastern ...
59683,spaceNews,Think Tank Turns Its Attention To Mars As 2016...,WASHINGTON — As NASA develops a long-term stra...,think tank turns its attention to mars as two ...,washington as nasa develops a long term stra...
59684,spaceNews,House Bill Leaves Last Three JPSS Satellites i...,WASHINGTON — A spending bill the House passed ...,house bill leaves last three jpss satellites i...,washington a spending bill the house passed ...


## 토큰화 및 불용어 처리

In [190]:
# Split each cleaned title / article body into a list of word tokens
title_token = news_article['processed_title'].map(word_tokenize)
context_token = news_article['processed_context'].map(word_tokenize)

In [191]:
title_token

0                 [ad, sales, boost, time, warner, profit]
1                   [dollar, gains, on, greenspan, speech]
2                 [yukos, unit, buyer, faces, loan, claim]
3                [high, fuel, prices, hit, ba, s, profits]
4                  [pernod, takeover, talk, lifts, domecq]
                               ...                        
59681    [kendall, lays, out, pentagon, thinking, on, f...
59682    [a, larger, share, of, noaa, s, declining, spa...
59683    [think, tank, turns, its, attention, to, mars,...
59684    [house, bill, leaves, last, three, jpss, satel...
59685    [championing, a, climate, change, for, commerc...
Name: processed_title, Length: 59512, dtype: object

In [None]:
context_token

In [41]:
# Build the English stop-word collection once, as a set, for fast membership tests
stop_words = set(nltk.corpus.stopwords.words("english"))

In [42]:
# Flatten the per-title token lists into one corpus-level list, then drop
# stop words. NOTE(review): title tokens are assumed to be the intended
# input here — swap in `context_token` if article bodies were meant.
tokens = [word for title_words in title_token for word in title_words]
tokensWSW = [word for word in tokens if word not in stop_words]

In [None]:
tokensWSW

In [44]:
# Reduce every remaining token to its base form with the WordNet lemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized_list = [wordnet_lemmatizer.lemmatize(token) for token in tokensWSW]

In [None]:
lemmatized_list