### 기본 세팅

In [1]:
import re
from konlpy.tag import Okt
from hanspell import spell_checker
from soynlp.normalizer import *

import pandas as pd

In [2]:
# Stock vocabulary: each inner list starts with the canonical stock name,
# followed by the synonyms/nicknames of that stock (if any).
vocab = [
    ['삼성전자', '삼전'],
    ['SK하이닉스', '하닉'],
    ['카카오뱅크', '카뱅'],
    ['두산중공업', '두중'],
    ['HMM', '흠'],
    ['SK바이오사이언스', 'sk바이오', 'sk바사'],
    ['한국전력공사', '한국전력', '한전'],
    ['카카오'],
    ['대한항공', '댄공']
]

# Date of the data to merge. The merge cells below interpolate `date` into
# every file path, so uncomment this line and fill it in before running them.
#date = ''

### 전처리 클래스

In [3]:
class Denoiser:
    """Rule-based cleaner for the title, body and comment text columns.

    Each ``remove_*`` method deletes one kind of noise, each ``transform_*``
    method rewrites the text, and :meth:`denoise` chains all of them in a
    fixed order. The class keeps no per-instance state.
    """

    # Shared Okt tagger, created on first use by ``transform_normalize`` so
    # that it is not rebuilt for every single string that gets normalized.
    _okt = None

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def remove_space(self, text):
        """Collapse every whitespace run to one space and trim both ends."""
        # After the substitution the only whitespace left is ' ', so a plain
        # strip() covers the leading/trailing cleanup (tabs, newlines and
        # carriage returns are already gone).
        return re.sub(r'\s+', ' ', text).strip()

    def remove_quote(self, text):
        """Delete straight and curly single/double quotes and backticks."""
        return re.sub(r'[\'`‘’"“”]', '', text)

    def remove_between_round_brackets(self, text):
        """Delete ``(...)`` groups together with their content."""
        return re.sub(r'\([^)]*\)', '', text)

    def remove_between_curly_brackets(self, text):
        """Delete ``{...}`` groups together with their content."""
        return re.sub(r'\{[^}]*\}', '', text)

    def remove_between_square_brackets(self, text):
        """Delete ``[...]`` groups together with their content."""
        return re.sub(r'\[[^]]*\]', '', text)

    def remove_between_angle_brackets(self, text):
        """Delete ``<...>`` groups together with their content."""
        return re.sub(r'\<[^>]*\>', '', text)

    def remove_url(self, text):
        """Delete ``http(s)://`` URLs and bare ``name.tld``-style tokens."""
        text = re.sub(r'http[s]?://(?:[\t\n\r\f\v]|[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
        # NOTE: this second pattern has no scheme requirement, so it also
        # matches other dotted tokens such as "3.14".
        text = re.sub(r'[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{2,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', '', text)
        return text

    def remove_email(self, text):
        """Delete e-mail addresses wherever they occur in the text."""
        # The hyphen is escaped so that "+-_" is three literal characters
        # rather than a range, and there is no trailing "$" so addresses in
        # the middle of the text are removed as well.
        return re.sub(r'[a-zA-Z0-9+\-_.]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9\-.]+', '', text)

    def remove_date(self, text):
        """Delete dotted dates of the form ``digits.digits.digits``."""
        return re.sub(r'\d+[.]\d+[.]\d+', '', text)

    def remove_number_text(self, text):
        """Delete numbers directly followed by ``%`` or ``층``."""
        text = re.sub(r'[0-9]+%', '', text)
        text = re.sub(r'[0-9]+층', '', text)
        return text

    def remove_symbol(self, text):
        """Delete every character that is neither a word char nor whitespace."""
        return re.sub(r'[^\w\s]', '', text)

    def remove_consonants_vowels(self, text):
        """Delete standalone Hangul jamo (consonants ㄱ-ㅎ, vowels ㅏ-ㅣ)."""
        return re.sub(r'[ㄱ-ㅎㅏ-ㅣ]+', '', text)

    def remove_cleanbot_comment(self, text):
        """Return '' when the text mentions '클린봇' or '가려진 댓글'."""
        if ('클린봇' in text) or ('가려진 댓글' in text):
            return ''
        return text

    def remove_stock_code(self, text):
        """Delete every run of six consecutive digits (stock codes)."""
        return re.sub(r'[0-9]{6}', '', text)

    def remove_dcinside_app(self, text):
        """Delete the 'dc App' / 'dc official App' signatures."""
        text = re.sub(r'dc App', '', text)
        text = re.sub(r'dc official App', '', text)
        return text

    def remove_japanese_chinese(self, text):
        """Delete hiragana, katakana, iteration marks and CJK ideographs."""
        # Three independent alternatives: previously katakana was removed
        # only when immediately followed by one of 々〆〤, and those marks
        # were never removed on their own.
        text = re.sub(r'[ぁ-ゔ]+|[ァ-ヴー]+|[々〆〤]+', '', text)
        text = re.sub(r'[一-龥]+', '', text)
        return text

    def transform_lower(self, text):
        """Lower-case the text."""
        return text.lower()

    def transform_normalize(self, text):
        """Normalize the text with KoNLPy's ``Okt().normalize``."""
        # Build the tagger once and reuse it for every call.
        if Denoiser._okt is None:
            Denoiser._okt = Okt()
        return Denoiser._okt.normalize(text)

    def transform_repeat_text(self, text):
        """Collapse repeated characters via ``repeat_normalize(num_repeats=1)``."""
        return repeat_normalize(text, num_repeats=1)

    def denoise(self, text):
        """Run the full cleaning pipeline on one value.

        Args:
            text: Any value; it is converted with ``str()`` before cleaning.
                ``None`` and float NaN (a missing cell) are treated as empty
                text.

        Returns:
            The cleaned string; ``''`` for missing or empty input.
        """
        # A missing cell would otherwise be stringified to the literal
        # 'None' / 'nan' and survive the pipeline as if it were real text.
        if text is None or (isinstance(text, float) and text != text):
            return ''

        text = str(text)
        if not text:
            return text

        # Order matters: bracketed content, URLs, e-mails and dates are
        # removed while their punctuation is still present, i.e. before
        # remove_symbol strips it.
        pipeline = (
            self.remove_space,
            self.remove_quote,
            self.remove_between_round_brackets,
            self.remove_between_curly_brackets,
            self.remove_between_square_brackets,
            self.remove_between_angle_brackets,
            self.remove_url,
            self.remove_email,
            self.remove_date,
            self.remove_number_text,
            self.remove_symbol,
            self.remove_consonants_vowels,
            self.remove_cleanbot_comment,
            self.remove_stock_code,
            self.remove_dcinside_app,
            self.remove_japanese_chinese,
            self.transform_lower,
            self.transform_normalize,
            self.transform_repeat_text,
        )
        for step in pipeline:
            text = step(text)
        return text

### 날짜별 종목 파일 통합

In [4]:
# Daum News: for every stock, merge the per-synonym info/comment CSVs, clean
# the text columns and save one combined info/comment CSV pair.
# `date` must be defined before this cell runs; it prefixes every path.
denoiser = Denoiser()  # keeps no per-instance state, so one serves every column
for synonyms in vocab:
    # The first entry is the canonical stock name used for the output files.
    origin_keyword = synonyms[0]
    data_dir = f'./{date}_data/{date}_daumnews'

    final_info = final_comment = None
    for keyword in synonyms:
        # Load the files stored under this search term.
        info = pd.read_csv(f'{data_dir}/{date}_daumnews_{keyword}_info.csv', index_col=0)
        comment = pd.read_csv(f'{data_dir}/{date}_daumnews_{keyword}_comment.csv', index_col=0)

        # The canonical name's frames seed the result; synonyms are appended.
        if final_info is None:
            final_info, final_comment = info, comment
        else:
            final_info = pd.concat([final_info, info], ignore_index=True)
            final_comment = pd.concat([final_comment, comment], ignore_index=True)

    # Clean the text columns.
    final_info['제목'] = final_info['제목'].apply(denoiser.denoise)
    final_comment['제목'] = final_comment['제목'].apply(denoiser.denoise)
    final_comment['댓글'] = final_comment['댓글'].apply(denoiser.denoise)

    # Save the merged files.
    final_info.to_csv(f'{data_dir}/daumnews_{date}_{origin_keyword}_final_info.csv')
    final_comment.to_csv(f'{data_dir}/daumnews_{date}_{origin_keyword}_final_comment.csv')

    # Still publish the frames under the dynamic global names this cell has
    # always created, in case other cells look them up.
    globals()[f'daumnews_{date}_{origin_keyword}_final_info'] = final_info
    globals()[f'daumnews_{date}_{origin_keyword}_final_comment'] = final_comment

In [5]:
# DCInside: for every stock, merge the per-synonym info/comment CSVs, clean
# the text columns and save one combined info/comment CSV pair.
# `date` must be defined before this cell runs; it prefixes every path.
denoiser = Denoiser()  # keeps no per-instance state, so one serves every column
for synonyms in vocab:
    # The first entry is the canonical stock name used for the output files.
    origin_keyword = synonyms[0]
    data_dir = f'./{date}_data/{date}_dcinside'

    final_info = final_comment = None
    for keyword in synonyms:
        # Load the files stored under this search term.
        info = pd.read_csv(f'{data_dir}/{date}_dcinside_{keyword}_info.csv', index_col=0)
        comment = pd.read_csv(f'{data_dir}/{date}_dcinside_{keyword}_comment.csv', index_col=0)

        # The canonical name's frames seed the result; synonyms are appended.
        if final_info is None:
            final_info, final_comment = info, comment
        else:
            final_info = pd.concat([final_info, info], ignore_index=True)
            final_comment = pd.concat([final_comment, comment], ignore_index=True)

    # Clean the text columns (posts also carry a body column here).
    final_info['제목'] = final_info['제목'].apply(denoiser.denoise)
    final_info['본문'] = final_info['본문'].apply(denoiser.denoise)
    final_comment['제목'] = final_comment['제목'].apply(denoiser.denoise)
    final_comment['댓글'] = final_comment['댓글'].apply(denoiser.denoise)

    # Save the merged files.
    final_info.to_csv(f'{data_dir}/dcinside_{date}_{origin_keyword}_final_info.csv')
    final_comment.to_csv(f'{data_dir}/dcinside_{date}_{origin_keyword}_final_comment.csv')

    # Still publish the frames under the dynamic global names this cell has
    # always created, in case other cells look them up.
    globals()[f'dcinside_{date}_{origin_keyword}_final_info'] = final_info
    globals()[f'dcinside_{date}_{origin_keyword}_final_comment'] = final_comment

In [6]:
# Jongto (stock discussion board): for every stock, merge the per-synonym
# info/comment CSVs, clean the text columns and save one combined pair.
# `date` must be defined before this cell runs; it prefixes every path.
denoiser = Denoiser()  # keeps no per-instance state, so one serves every column
for synonyms in vocab:
    # The first entry is the canonical stock name used for the output files.
    origin_keyword = synonyms[0]
    data_dir = f'./{date}_data/{date}_jongto'

    final_info = final_comment = None
    for keyword in synonyms:
        # Input file names carry both the canonical name and the search term
        # (for the first entry the two are the same).
        stem = f'{data_dir}/{date}_jongto_{origin_keyword}_{keyword}'
        info = pd.read_csv(f'{stem}_info.csv', index_col=0)
        comment = pd.read_csv(f'{stem}_comment.csv', index_col=0)

        # The canonical name's frames seed the result; synonyms are appended.
        if final_info is None:
            final_info, final_comment = info, comment
        else:
            final_info = pd.concat([final_info, info], ignore_index=True)
            final_comment = pd.concat([final_comment, comment], ignore_index=True)

    # Clean the text columns.
    final_info['제목'] = final_info['제목'].apply(denoiser.denoise)
    final_info['본문'] = final_info['본문'].apply(denoiser.denoise)
    final_comment['제목'] = final_comment['제목'].apply(denoiser.denoise)
    final_comment['댓글'] = final_comment['댓글'].apply(denoiser.denoise)

    # Save the merged files.
    final_info.to_csv(f'{data_dir}/jongto_{date}_{origin_keyword}_final_info.csv')
    final_comment.to_csv(f'{data_dir}/jongto_{date}_{origin_keyword}_final_comment.csv')

    # Still publish the frames under the dynamic global names this cell has
    # always created, in case other cells look them up.
    globals()[f'jongto_{date}_{origin_keyword}_final_info'] = final_info
    globals()[f'jongto_{date}_{origin_keyword}_final_comment'] = final_comment

In [7]:
# Naver News: for every stock, merge the per-synonym info/comment CSVs, clean
# the text columns and save one combined info/comment CSV pair.
# `date` must be defined before this cell runs; it prefixes every path.
denoiser = Denoiser()  # keeps no per-instance state, so one serves every column
for synonyms in vocab:
    # The first entry is the canonical stock name used for the output files.
    origin_keyword = synonyms[0]
    data_dir = f'./{date}_data/{date}_navernews'

    final_info = final_comment = None
    for keyword in synonyms:
        # Load the files stored under this search term.
        info = pd.read_csv(f'{data_dir}/{date}_navernews_{keyword}_info.csv', index_col=0)
        comment = pd.read_csv(f'{data_dir}/{date}_navernews_{keyword}_comment.csv', index_col=0)

        # The canonical name's frames seed the result; synonyms are appended.
        if final_info is None:
            final_info, final_comment = info, comment
        else:
            final_info = pd.concat([final_info, info], ignore_index=True)
            final_comment = pd.concat([final_comment, comment], ignore_index=True)

    # Clean the text columns.
    final_info['제목'] = final_info['제목'].apply(denoiser.denoise)
    final_comment['제목'] = final_comment['제목'].apply(denoiser.denoise)
    final_comment['댓글'] = final_comment['댓글'].apply(denoiser.denoise)

    # Save the merged files.
    final_info.to_csv(f'{data_dir}/navernews_{date}_{origin_keyword}_final_info.csv')
    final_comment.to_csv(f'{data_dir}/navernews_{date}_{origin_keyword}_final_comment.csv')

    # Still publish the frames under the dynamic global names this cell has
    # always created, in case other cells look them up.
    globals()[f'navernews_{date}_{origin_keyword}_final_info'] = final_info
    globals()[f'navernews_{date}_{origin_keyword}_final_comment'] = final_comment

In [8]:
# YouTube: only the canonical stock name is used (no synonym files), so each
# stock's info/comment CSV is loaded, cleaned and saved as the final pair.
# `date` must be defined before this cell runs; it prefixes every path.
denoiser = Denoiser()  # keeps no per-instance state, so one serves every column
for synonyms in vocab:
    # The YouTube files for 한국전력공사 are read and written under the
    # shorter name 한국전력.
    keyword = '한국전력' if synonyms[0] == '한국전력공사' else synonyms[0]
    data_dir = f'./{date}_data/{date}_youtube'

    # Load the files for this stock.
    final_info = pd.read_csv(f'{data_dir}/{date}_youtube_{keyword}_info.csv', index_col=0)
    final_comment = pd.read_csv(f'{data_dir}/{date}_youtube_{keyword}_comment.csv', index_col=0)

    # Clean the text columns.
    final_info['제목'] = final_info['제목'].apply(denoiser.denoise)
    final_comment['댓글'] = final_comment['댓글'].apply(denoiser.denoise)

    # Save the final files.
    final_info.to_csv(f'{data_dir}/youtube_{date}_{keyword}_final_info.csv')
    final_comment.to_csv(f'{data_dir}/youtube_{date}_{keyword}_final_comment.csv')

    # Still publish the frames under the dynamic global names this cell has
    # always created, in case other cells look them up.
    globals()[f'youtube_{date}_{keyword}_final_info'] = final_info
    globals()[f'youtube_{date}_{keyword}_final_comment'] = final_comment