# Library

In [1]:
# Ignore the warnings
import warnings
# warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# System related and data input controls
import os

# Auto reload of library
%reload_ext autoreload
%autoreload 2

from preprocessing_text_KK import *

def get_data_from_path(folder_location, folder_name=False, concat_axis='row'):
    """Load every Excel/CSV file found under a folder tree into one DataFrame.

    Walks ``folder_location`` recursively with ``os.walk``. Each ``.xlsx`` file
    is read with ``pd.read_excel`` and each ``.csv`` file with ``pd.read_csv``
    (extension match is case-insensitive). Files with any other extension are
    skipped.

    Parameters
    ----------
    folder_location : str
        Root folder to scan; sub-folders are included.
    folder_name : bool, default False
        When True, add a ``Folder_Name`` column holding the name of the folder
        that directly contains the source file.
    concat_axis : {'row', 'col'}, default 'row'
        'row' stacks the loaded files vertically (axis=0), 'col' places them
        side by side (axis=1). Any other value yields an empty DataFrame.

    Returns
    -------
    pandas.DataFrame
        The concatenation of all loaded files in visiting order, or an empty
        DataFrame when nothing was loaded.
    """
    print('Getting data from', len(os.listdir(folder_location)), 'folders...')
    frames = []
    for path, _subdirs, files in os.walk(folder_location):
        for file in tqdm(files):
            path_file = os.path.join(path, file)
            extension = os.path.splitext(file)[1].lower()

            # Load the file; anything that is not a supported table format is
            # skipped so it can neither crash the loop nor re-append the
            # previously loaded file.
            if extension == '.xlsx':
                df_sub = pd.read_excel(path_file)
            elif extension == '.csv':
                df_sub = pd.read_csv(path_file)
            else:
                continue

            # Optionally tag each row with the folder the file was found in
            if folder_name:
                df_sub['Folder_Name'] = os.path.basename(path)

            frames.append(df_sub)

    # Concatenate once at the end instead of growing a DataFrame inside the loop
    axis = {'row': 0, 'col': 1}.get(concat_axis)
    if not frames or axis is None:
        return pd.DataFrame()
    return pd.concat(frames, axis=axis)

2023-11-21 23:19:39,981	INFO worker.py:1642 -- Started a local Ray instance.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KK\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KK\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# BigKinds

## Hyperparameters

In [4]:
# Keyword folders (Folder_Name values) excluded from the final BigKinds dataset
DELETE_KEYWORD = ['100세', '거주환경']
# Top-level categories to keep: the text before '>' in the '통합 분류1' column
CATEGORY_BK = ['경제', '사회', '문화', '국제']
# Full 'top>sub' category labels kept by the final filter
CATEGORY_BK_Sub = ['경제>경제일반', '경제>국제경제', '경제>취업_창업',
                   '사회>노동_복지', '사회>사건_사고', '사회>사회일반', '사회>여성', '사회>장애인', '사회>의료_건강',
                   '문화>미술_건축', '문화>요리_여행', '문화>출판',
                   '국제>중국', '국제>유럽_EU', '국제>일본', '국제>미국_북미', '국제>중동_아프리카',
                   '국제>아시아', '국제>중남미', '국제>국제일반', '국제>러시아']
# Date column; the year / year-month / era grouping columns are derived from it
COLNAME_CATEGORY = '일자'
# Title column, i.e. the text that is preprocessed and mined
COLNAME_MINING = '제목'
### Output folder: keep it outside the repository so the data is not uploaded to GitHub.
# Set the NEWS_SAVE_LOCATION environment variable to use another machine's folder
# (e.g. r'C:\Users\user\Desktop\Data' on the "inu" PC) instead of editing this cell.
# The default below is the Desktop\Data folder of the "home" PC.
SAVE_LOCATION = os.environ.get('NEWS_SAVE_LOCATION', r'C:\Users\KK\Desktop\Data')    # home

## Raw Data

In [3]:
# Load every BigKinds export; Folder_Name records the folder each row came from
df_news = get_data_from_path(os.path.join(os.getcwd(), 'Data', 'BigKinds'), folder_name=True)

# Preprocessing
## Drop rows sharing the same news id, press and title
df_news = df_news.drop_duplicates(subset=['뉴스 식별자', '언론사', COLNAME_MINING], ignore_index=True)
## Drop columns that are not used afterwards
colname_delete = ['뉴스 식별자', '인물', '위치', '기관', '기고자', '통합 분류2', '통합 분류3',
                  '사건/사고 분류1', '사건/사고 분류2', '사건/사고 분류3',
                  '키워드', '특성추출(가중치순 상위 50개)', 'URL', '분석제외 여부']
df_news = df_news[[col for col in df_news.columns if col not in colname_delete]]
## Category filter: keep rows whose top-level category (text before '>') is in CATEGORY_BK.
## The isinstance guard skips missing categories (NaN is a float and has no .split),
## so such rows are filtered out instead of raising AttributeError.
category_filter = [each for each in df_news['통합 분류1'].unique()
                   if isinstance(each, str) and each.split('>')[0] in CATEGORY_BK]
df_news = df_news[df_news['통합 분류1'].isin(category_filter)].reset_index(drop=True)
df_news['Category'] = df_news['통합 분류1'].apply(lambda x: x.split('>')[0])
## Keep the raw title in '<title>_Origin', then clean the title text
df_news[COLNAME_MINING+'_Origin'] = df_news[COLNAME_MINING].copy()
df_news[COLNAME_MINING] = df_news[COLNAME_MINING].progress_apply(lambda x: text_preprocessor(x, del_number=False, del_bracket_content=False))
## Drop rows whose cleaned title is missing or empty
df_news = df_news[df_news[COLNAME_MINING].notnull()]
df_news = df_news[df_news[COLNAME_MINING].str.len() != 0].reset_index(drop=True)

# Date features (the date column is parsed once and reused)
news_date = pd.to_datetime(df_news[COLNAME_CATEGORY].astype(str))
## Year
df_news[COLNAME_CATEGORY+'_Year'] = news_date.dt.year
## Year-month
df_news[COLNAME_CATEGORY+'_YearMonth'] = news_date.dt.strftime('%Y-%m')
## Era: 2013-2017 vs. everything else (any other year is labelled '2018 ~ 2023')
era_early_years = ['2013', '2014', '2015', '2016', '2017']
df_news[COLNAME_CATEGORY+'_Era'] = df_news[COLNAME_CATEGORY].apply(
    lambda x: '2013 ~ 2017' if str(x)[:4] in era_early_years else '2018 ~ 2023')

# Age-group feature
def _age_group(title):
    """Return the lowest decade (20..90) whose ' <decade>대' token occurs in the title, else 0."""
    for decade in range(20, 100, 10):
        # The pattern keeps the leading space of the original (' 20대', ' 30대', ...)
        if re.search(' %d대' % decade, title) is not None:
            return decade
    return 0

df_news['Age'] = df_news[COLNAME_MINING].apply(_age_group)

# Sentiment labels
df_news_sentiment = get_data_from_path(os.path.join(os.getcwd(), 'Data', 'Sentiment'), folder_name=False)
## Sort by the saved row id, then drop the two leading columns: the old index added by
## reset_index and the column after it (presumably 'Unnamed: 0' - TODO confirm file layout).
## Exactly one column must remain, otherwise the rename below raises.
df_news_sentiment = df_news_sentiment.sort_values(by='Unnamed: 0').reset_index().iloc[:,2:]
df_news_sentiment.columns = ['Sentiment']
## Label 2 becomes 'Positive'; every other value becomes 'Negative'
df_news_sentiment['Sentiment'] = df_news_sentiment.Sentiment.apply(lambda x: 'Positive' if x==2 else 'Negative')
df_news_sentiment['Positive'] = df_news_sentiment.Sentiment.map({'Positive': 1, 'Negative': 0})
df_news_sentiment['Negative'] = df_news_sentiment.Sentiment.map({'Positive': 0, 'Negative': -1})
## The labels are attached by row position, so a different row count means misaligned labels
if len(df_news_sentiment) != len(df_news):
    print('WARNING: sentiment rows (%d) != news rows (%d); labels are joined by position and will be misaligned'
          % (len(df_news_sentiment), len(df_news)))
df_news = pd.concat([df_news, df_news_sentiment], axis=1)
## Final de-duplication: first per (press, title), then per title alone
df_news = df_news.drop_duplicates(subset=['언론사', COLNAME_MINING], ignore_index=True)
df_news = df_news.drop_duplicates(subset=[COLNAME_MINING], ignore_index=True)

# Save
df_news.to_csv(os.path.join(SAVE_LOCATION, 'df_news_bigkinds.csv'), index=False, encoding='utf-8-sig')

Getting data from 39 folders...


0it [00:00, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.12s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.97it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.04s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:20<00:00,  1.87s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.18s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:09<00:00,  3.17s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.19it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:08<00:00,  2.68s/it]
100%|████████████████

Getting data from 51 folders...


100%|█████████████████████████████████████████████████████████████████████████████████| 51/51 [00:00<00:00, 290.81it/s]


In [5]:
# Reload the saved BigKinds table and apply the final keyword / sub-category filters
df_news = pd.read_csv(os.path.join(SAVE_LOCATION, 'df_news_bigkinds.csv'))
print('Category: ', df_news.Category.value_counts())
# Keep every keyword folder that is not listed in DELETE_KEYWORD
keyword_filter = [name for name in df_news['Folder_Name'].unique() if name not in DELETE_KEYWORD]
df_news = df_news[df_news['Folder_Name'].isin(keyword_filter)].reset_index(drop=True)
# Keep only the sub-categories listed in CATEGORY_BK_Sub
category_filter = [label for label in df_news['통합 분류1'].unique() if label in CATEGORY_BK_Sub]
df_news = df_news[df_news['통합 분류1'].isin(category_filter)].reset_index(drop=True)
print('Category: ', df_news.Category.value_counts())

Category:  Category
사회    143555
경제     46072
국제     35535
문화     21051
Name: count, dtype: int64
Category:  Category
사회    143555
경제     46072
국제     35535
문화     21051
Name: count, dtype: int64


## Word Frequency

In [3]:
# # 연도데이터 기준 전처리
# wf_year_soy, waf_year_soy, \
# wf_year_tf, waf_year_tf, \
# wf_year_kb, waf_year_kb = preprocessing_wordfreq(df_news, colname_target=COLNAME_MINING, colname_category=COLNAME_CATEGORY+'_Year',
#                                                  num_showkeyword=10, save_local=True, save_name='wordfreq_year')

# # 연도그룹데이터 기준 전처리
# wf_era_soy, waf_era_soy, \
# wf_era_tf, waf_era_tf, \
# wf_era_kb, waf_era_kb = preprocessing_wordfreq(df_news, colname_target=COLNAME_MINING, colname_category=COLNAME_CATEGORY+'_Era',
#                                                num_showkeyword=10, save_local=True, save_name='wordfreq_era')

# # 연도감성데이터 기준 전처리
# wf_senti_soy, waf_senti_soy, \
# wf_senti_tf, waf_senti_tf, \
# wf_senti_kb, waf_senti_kb = preprocessing_wordfreq(df_news, colname_target=COLNAME_MINING, colname_category='Sentiment', 
#                                                    num_showkeyword=10, save_local=True, save_name='wordfreq_senti')

# # 나이데이터 기준 전처리
# wf_age_soy, waf_age_soy, \
# wf_age_tf, waf_age_tf, \
# wf_age_kb, waf_age_kb = preprocessing_wordfreq(df_news[df_news.Age != 0].reset_index().iloc[:,1:], 
#                                                colname_target=COLNAME_MINING, colname_category='Age',
#                                                num_showkeyword=10, save_local=True, save_name='wordfreq_age')

# # 나이+감성데이터 기준 전처리
# ## 부정
# SENTIMENT = 'Negative'
# wf_age_soy, waf_age_soy, \
# wf_age_tf, waf_age_tf, \
# wf_age_kb, waf_age_kb = preprocessing_wordfreq(df_news[(df_news.Age != 0) & (df_news.Sentiment == SENTIMENT)].reset_index().iloc[:,1:], 
#                                                colname_target=COLNAME_MINING, colname_category='Age',
#                                                num_showkeyword=10, save_local=False, save_name='wordfreq_age')
# wf_age_soy['sentiment'], waf_age_soy['sentiment'] = SENTIMENT, SENTIMENT
# wf_age_tf['sentiment'], waf_age_tf['sentiment'] = SENTIMENT, SENTIMENT
# wf_age_kb['sentiment'], waf_age_kb['sentiment'] = SENTIMENT, SENTIMENT
# ## 긍정
# SENTIMENT = 'Positive'
# wf_temp_soy, waf_temp_soy, \
# wf_temp_tf, waf_temp_tf, \
# wf_temp_kb, waf_temp_kb = preprocessing_wordfreq(df_news[(df_news.Age != 0) & (df_news.Sentiment == SENTIMENT)].reset_index().iloc[:,1:], 
#                                                colname_target=COLNAME_MINING, colname_category='Age',
#                                                num_showkeyword=10, save_local=False, save_name='wordfreq_temp')
# wf_temp_soy['sentiment'], waf_temp_soy['sentiment'] = SENTIMENT, SENTIMENT
# wf_temp_tf['sentiment'], waf_temp_tf['sentiment'] = SENTIMENT, SENTIMENT
# wf_temp_kb['sentiment'], waf_temp_kb['sentiment'] = SENTIMENT, SENTIMENT
# ## 정리
# wf_age_soy = pd.concat([wf_age_soy, wf_temp_soy], axis=0)
# waf_age_soy = pd.concat([waf_age_soy, waf_temp_soy], axis=0)
# wf_age_tf = pd.concat([wf_age_tf, wf_temp_tf], axis=0)
# waf_age_tf = pd.concat([waf_age_tf, waf_temp_tf], axis=0)
# wf_age_kb = pd.concat([wf_age_kb, wf_temp_kb], axis=0)
# waf_age_kb = pd.concat([waf_age_kb, waf_temp_kb], axis=0)
# ## 저장
# wf_age_soy.to_csv(os.path.join(os.getcwd(), 'Data', 'WordFreq', 'wordfreq_agesenti_soynlp.csv'), index=False, encoding='utf-8-sig')
# waf_age_soy.to_csv(os.path.join(os.getcwd(), 'Data', 'WordFreq', 'wordfreq_agesenti_soynlpadj.csv'), index=False, encoding='utf-8-sig')
# wf_age_tf.to_csv(os.path.join(os.getcwd(), 'Data', 'WordFreq', 'wordfreq_agesenti_tfidf.csv'), index=False, encoding='utf-8-sig')
# waf_age_tf.to_csv(os.path.join(os.getcwd(), 'Data', 'WordFreq', 'wordfreq_agesenti_tfidfadj.csv'), index=False, encoding='utf-8-sig')
# wf_age_kb.to_csv(os.path.join(os.getcwd(), 'Data', 'WordFreq', 'wordfreq_agesenti_keybert.csv'), index=False, encoding='utf-8-sig')
# waf_age_kb.to_csv(os.path.join(os.getcwd(), 'Data', 'WordFreq', 'wordfreq_agesenti_keybertadj.csv'), index=False, encoding='utf-8-sig')

In [1]:
# Load the pre-computed word-frequency tables from Data/WordFreq.
# Requires `os` and `pd`, i.e. the Library cell must be run first on a fresh kernel.
def _load_wordfreq(group, methods=('soynlp', 'soynlpadj', 'tfidf', 'tfidfadj', 'keybert', 'keybertadj')):
    """Read Data/WordFreq/wordfreq_<group>_<method>.csv for each method, in the given order."""
    folder = os.path.join(os.getcwd(), 'Data', 'WordFreq')
    return [pd.read_csv(os.path.join(folder, 'wordfreq_' + group + '_' + method + '.csv'))
            for method in methods]

# Grouped by year
(wf_year_soynlp, waf_year_soynlp,
 wf_year_tfidf, waf_year_tfidf,
 wf_year_keybert, waf_year_keybert) = _load_wordfreq('year')

# Grouped by era
(wf_era_soynlp, waf_era_soynlp,
 wf_era_tfidf, waf_era_tfidf,
 wf_era_keybert, waf_era_keybert) = _load_wordfreq('era')

# Grouped by sentiment; duplicates are removed by keeping the highest-scoring row per word
(wf_senti_soynlp, waf_senti_soynlp,
 wf_senti_tfidf, waf_senti_tfidf,
 wf_senti_keybert, waf_senti_keybert) = [
    table.sort_values(by='score', ascending=False).drop_duplicates(['word'], keep='first')
    for table in _load_wordfreq('senti')]

# Grouped by age; only the soynlp / tfidf tables are loaded (the keybert ones are not read here)
(wf_age_soynlp, waf_age_soynlp,
 wf_age_tfidf, waf_age_tfidf) = _load_wordfreq('age', methods=('soynlp', 'soynlpadj', 'tfidf', 'tfidfadj'))

# Grouped by age and sentiment
(wf_agesenti_soynlp, waf_agesenti_soynlp,
 wf_agesenti_tfidf, waf_agesenti_tfidf,
 wf_agesenti_keybert, waf_agesenti_keybert) = _load_wordfreq('agesenti')

NameError: name 'os' is not defined

## Word Correlation

In [19]:
# # 관련성 전처리
# wf_yearcorr_soynlp = preprocessing_wordfreq_to_corr(wf_year_soynlp, df_news, colname_target=COLNAME_MINING, colname_category=COLNAME_CATEGORY+'_Year', num_showkeyword=100, save_name='wordcorr_year_soynlp.csv')
# waf_yearcorr_soynlp = preprocessing_wordfreq_to_corr(waf_year_soynlp, df_news, colname_target=COLNAME_MINING, colname_category=COLNAME_CATEGORY+'_Year', num_showkeyword=100, save_name='wordcorr_year_soynlpadj.csv')
# wf_yearcorr_tfidf = preprocessing_wordfreq_to_corr(wf_year_tfidf, df_news, colname_target=COLNAME_MINING, colname_category=COLNAME_CATEGORY+'_Year', num_showkeyword=100, save_name='wordcorr_year_tfidf.csv')
# waf_yearcorr_tfidf = preprocessing_wordfreq_to_corr(waf_year_tfidf, df_news, colname_target=COLNAME_MINING, colname_category=COLNAME_CATEGORY+'_Year', num_showkeyword=100, save_name='wordcorr_year_tfidfadj.csv')
# wf_yearcorr_keybert = preprocessing_wordfreq_to_corr(wf_year_keybert, df_news, colname_target=COLNAME_MINING, colname_category=COLNAME_CATEGORY+'_Year', num_showkeyword=100, save_name='wordcorr_year_keybert.csv')
# waf_yearcorr_keybert = preprocessing_wordfreq_to_corr(waf_year_keybert, df_news, colname_target=COLNAME_MINING, colname_category=COLNAME_CATEGORY+'_Year', num_showkeyword=100, save_name='wordcorr_year_keybertadj.csv')

# wf_eracorr_soynlp = preprocessing_wordfreq_to_corr(wf_era_soynlp, df_news, colname_target=COLNAME_MINING, colname_category=COLNAME_CATEGORY+'_Era', num_showkeyword=100, save_name='wordcorr_era_soynlp.csv')
# waf_eracorr_soynlp = preprocessing_wordfreq_to_corr(waf_era_soynlp, df_news, colname_target=COLNAME_MINING, colname_category=COLNAME_CATEGORY+'_Era', num_showkeyword=100, save_name='wordcorr_era_soynlpadj.csv')
# wf_eracorr_tfidf = preprocessing_wordfreq_to_corr(wf_era_tfidf, df_news, colname_target=COLNAME_MINING, colname_category=COLNAME_CATEGORY+'_Era', num_showkeyword=100, save_name='wordcorr_era_tfidf.csv')
# waf_eracorr_tfidf = preprocessing_wordfreq_to_corr(waf_era_tfidf, df_news, colname_target=COLNAME_MINING, colname_category=COLNAME_CATEGORY+'_Era', num_showkeyword=100, save_name='wordcorr_era_tfidfadj.csv')
# wf_eracorr_keybert = preprocessing_wordfreq_to_corr(wf_era_keybert.sort_values(by='score', ascending=False).iloc[:10000,:], 
#                                                     df_news, colname_target=COLNAME_MINING, colname_category=COLNAME_CATEGORY+'_Era', num_showkeyword=100, save_name='wordcorr_era_keybert.csv')
# waf_eracorr_keybert = preprocessing_wordfreq_to_corr(waf_era_keybert.sort_values(by='score', ascending=False).iloc[:10000,:], 
#                                                      df_news, colname_target=COLNAME_MINING, colname_category=COLNAME_CATEGORY+'_Era', num_showkeyword=100, save_name='wordcorr_era_keybertadj.csv')

# wf_senticorr_soynlp = preprocessing_wordfreq_to_corr(wf_senti_soynlp, df_news, colname_target=COLNAME_MINING, colname_category='Sentiment', num_showkeyword=100, save_name='wordcorr_senti_soynlp.csv')
# waf_senticorr_soynlp = preprocessing_wordfreq_to_corr(waf_senti_soynlp, df_news, colname_target=COLNAME_MINING, colname_category='Sentiment', num_showkeyword=100, save_name='wordcorr_senti_soynlpadj.csv')
# wf_senticorr_tfidf = preprocessing_wordfreq_to_corr(wf_senti_tfidf, df_news, colname_target=COLNAME_MINING, colname_category='Sentiment', num_showkeyword=100, save_name='wordcorr_senti_tfidf.csv')
# waf_senticorr_tfidf = preprocessing_wordfreq_to_corr(waf_senti_tfidf, df_news, colname_target=COLNAME_MINING, colname_category='Sentiment', num_showkeyword=100, save_name='wordcorr_senti_tfidfadj.csv')
# wf_senticorr_keybert = preprocessing_wordfreq_to_corr(wf_senti_keybert.sort_values(by='score', ascending=False).iloc[:10000,:],
#                                                       df_news, colname_target=COLNAME_MINING, colname_category='Sentiment', num_showkeyword=100, save_name='wordcorr_senti_keybert.csv')
# waf_senticorr_keybert = preprocessing_wordfreq_to_corr(waf_senti_keybert.sort_values(by='score', ascending=False).iloc[:10000,:], 
#                                                        df_news, colname_target=COLNAME_MINING, colname_category='Sentiment', num_showkeyword=100, save_name='wordcorr_senti_keybertadj.csv')

# wf_agecorr_soynlp = preprocessing_wordfreq_to_corr(wf_age_soynlp, df_news, colname_target=COLNAME_MINING, colname_category='Age', num_showkeyword=100, save_name='wordcorr_age_soynlp.csv')
# waf_agecorr_soynlp = preprocessing_wordfreq_to_corr(waf_age_soynlp, df_news, colname_target=COLNAME_MINING, colname_category='Age', num_showkeyword=100, save_name='wordcorr_age_soynlpadj.csv')
# wf_agecorr_tfidf = preprocessing_wordfreq_to_corr(wf_age_tfidf, df_news, colname_target=COLNAME_MINING, colname_category='Age', num_showkeyword=100, save_name='wordcorr_age_tfidf.csv')
# waf_agecorr_tfidf = preprocessing_wordfreq_to_corr(waf_age_tfidf, df_news, colname_target=COLNAME_MINING, colname_category='Age', num_showkeyword=100, save_name='wordcorr_age_tfidfadj.csv')

In [23]:
# Load the pre-computed word-correlation tables from Data/WordCorr.
# Requires `os` and `pd`, i.e. the Library cell must be run first on a fresh kernel.
def _load_wordcorr(group, methods=('soynlp', 'soynlpadj', 'tfidf', 'tfidfadj', 'keybert', 'keybertadj')):
    """Read Data/WordCorr/wordcorr_<group>_<method>.csv for each method, in the given order."""
    folder = os.path.join(os.getcwd(), 'Data', 'WordCorr')
    return [pd.read_csv(os.path.join(folder, 'wordcorr_' + group + '_' + method + '.csv'))
            for method in methods]

# Grouped by year
(wf_yearcorr_soynlp, waf_yearcorr_soynlp,
 wf_yearcorr_tfidf, waf_yearcorr_tfidf,
 wf_yearcorr_keybert, waf_yearcorr_keybert) = _load_wordcorr('year')

# Grouped by era
(wf_eracorr_soynlp, waf_eracorr_soynlp,
 wf_eracorr_tfidf, waf_eracorr_tfidf,
 wf_eracorr_keybert, waf_eracorr_keybert) = _load_wordcorr('era')

# Grouped by sentiment
(wf_senticorr_soynlp, waf_senticorr_soynlp,
 wf_senticorr_tfidf, waf_senticorr_tfidf,
 wf_senticorr_keybert, waf_senticorr_keybert) = _load_wordcorr('senti')

# Grouped by age; only the soynlp / tfidf tables are loaded (the keybert ones are not read here)
(wf_agecorr_soynlp, waf_agecorr_soynlp,
 wf_agecorr_tfidf, waf_agecorr_tfidf) = _load_wordcorr('age', methods=('soynlp', 'soynlpadj', 'tfidf', 'tfidfadj'))

# Naver Crawling

## Hyperparameters

In [None]:
# Naver news sections (Category values) kept by the category filter
CATEGORY_CR = ['세계', '경제', '생활/문화', '오피니언', '사회', 'IT/과학']
### Output folder: keep it outside the repository so the data is not uploaded to GitHub.
# Set the NEWS_SAVE_LOCATION environment variable to use another machine's folder
# (e.g. r'C:\Users\user\Desktop\Data' on the "inu" PC) instead of editing this cell.
# The default below is the Desktop\Data folder of the "home" PC.
SAVE_LOCATION = os.environ.get('NEWS_SAVE_LOCATION', r'C:\Users\KK\Desktop\Data')    # home

## Raw Data

In [28]:
# Load every crawled Naver news file; Folder_Name records the folder each row came from
df_news = get_data_from_path(os.path.join(os.getcwd(), 'Data', 'NaverNews'), folder_name=True)

# Filtering
## One row per (press, title) pair
df_news = df_news.drop_duplicates(subset=['Press', 'Title'], ignore_index=True)
## Remove the columns that are not needed
colname_delete = ['Content', 'URL_Origin']
df_news = df_news[[name for name in df_news.columns if name not in colname_delete]]
## Keep only the sections listed in CATEGORY_CR
category_filter = [section for section in df_news['Category'].unique() if section in CATEGORY_CR]
df_news = df_news[df_news['Category'].isin(category_filter)].reset_index(drop=True)
## Cut the press name at the '언론사 선정' marker so the same press is not counted twice
df_news['Press'] = df_news['Press'].progress_apply(lambda name: str(name).split('언론사 선정')[0])

# Date features
df_news['Date'] = pd.to_datetime(df_news['Date'])
date_parsed = pd.to_datetime(df_news['Date'].astype(str))
## Year
df_news['Date_Year'] = date_parsed.dt.year
## Year-month
df_news['Date_YearMonth'] = date_parsed.dt.strftime('%Y-%m')
## Period: 2013-2017 vs. everything else
early_years = ['2013', '2014', '2015', '2016', '2017']
df_news['Date_Period'] = df_news['Date_Year'].apply(
    lambda year: '2013 ~ 2017' if str(year)[:4] in early_years else '2018 ~ 2023')
df_news = df_news[['Folder_Name', 'Date', 'Date_Year', 'Date_YearMonth', 'Date_Period',
                   'Press', 'Category', 'Title', 'Comment', 'URL_Naver']]

# Text preprocessing
df_news['Title'] = df_news['Title'].progress_apply(
    lambda title: text_preprocessor(title, del_number=False, del_bracket_content=False))
## Drop rows whose cleaned title is missing or empty
df_news = df_news[df_news['Title'].notnull()]
df_news = df_news[df_news['Title'].str.len() != 0].reset_index(drop=True)
## Keep comments of length >= 5, then keep articles with at least 5 such comments
df_news['Comment'] = df_news['Comment'].progress_apply(
    lambda raw: [text for text in literal_eval(raw) if len(text) >= 5])
df_news = df_news[df_news['Comment'].progress_apply(lambda comments: len(comments) >= 5)]

# Press filter
## Remove presses whose mean comment count is below 5, and every press outside the
## 100 presses with the most articles
df_news['Comment_Len'] = df_news['Comment'].apply(len)
df_temp = df_news.groupby('Press')['Comment_Len'].mean()
del_press = df_temp[df_temp < 5].index.tolist()
del_press = del_press + df_news['Press'].value_counts().index[100:].tolist()
df_news = df_news[~df_news['Press'].isin(del_press)].reset_index(drop=True)
df_news = df_news.drop(columns='Comment_Len')

# One row per comment
df_news_explode = df_news.copy().explode('Comment')

# Save
df_news.to_csv(os.path.join(SAVE_LOCATION, 'df_news_crawling.csv'), index=False, encoding='utf-8-sig')
df_news_explode.to_csv(os.path.join(SAVE_LOCATION, 'df_news_explode_crawling.csv'), index=False, encoding='utf-8-sig')

In [6]:
## Reload the saved article-level table and show its category counts
path_news_nv = os.path.join(SAVE_LOCATION, 'df_news_crawling.csv')
df_news_nv = pd.read_csv(path_news_nv)
print('Category: ', df_news_nv['Category'].value_counts())
## Reload the saved comment-level (exploded) table
path_newse_nv = os.path.join(SAVE_LOCATION, 'df_news_explode_crawling.csv')
df_newse_nv = pd.read_csv(path_newse_nv)

Category:  Category
사회       10472
경제        5055
생활/문화     3360
세계        1837
오피니언      1007
IT/과학      659
Name: count, dtype: int64


In [7]:
# Temp

In [9]:
# Keep only the comment rows published by the presses listed below
press_keep = ['매일경제', '헤럴드경제', 'KBS', '파이낸셜뉴스', '한국경제', '중앙일보', '한국일보', '서울경제',
              '세계일보', '아시아경제', '전자신문', 'OBS', '서울신문', '머니투데이', 'YTN', '아주경제',
              '조선일보', '문화일보', '내일신문', '디지털타임스', '경향신문', '국민일보', '동아일보', 'MBC',
              '한겨레', 'SBS', '오마이뉴스']
temp = df_newse_nv[df_newse_nv['Press'].isin(press_keep)].reset_index(drop=True)

In [10]:
# Export the shared subset: the selected columns of the rows whose Date compares >= '2020'
shared_cols = ['Date', 'Press', 'Folder_Name', 'Category', 'Title', 'Comment']
shared_path = os.path.join(SAVE_LOCATION, 'df_news_explode_crawling_shared.csv')
temp.loc[temp.Date >= '2020', shared_cols].reset_index(drop=True).to_csv(shared_path, index=False, encoding='utf-8-sig')