# Library

In [9]:
# Ignore the warnings
import warnings
# warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

from preprocessing_text_KK import *

def get_data_from_path(folder_location, folder_name=False, concat_axis='row'):
    """Load every Excel/CSV file found under ``folder_location`` into one DataFrame.

    The directory tree is walked recursively with ``os.walk``. Files whose
    name ends in ``xlsx`` are read with ``pd.read_excel`` and files ending in
    ``csv`` with ``pd.read_csv`` (the check is case-insensitive). Any other
    file is skipped instead of re-using the previously loaded frame.

    Args:
        folder_location: Root directory to walk.
        folder_name: When truthy, a ``Folder_Name`` column holding the base
            name of the directory containing the file is added to each frame.
        concat_axis: ``'row'`` stacks the frames vertically (axis=0),
            ``'col'`` places them side by side (axis=1). Any other value
            yields an empty DataFrame.

    Returns:
        The concatenated DataFrame, or an empty DataFrame when nothing was
        loaded.
    """
    print('Getting data from', len(os.listdir(folder_location)), 'folders...')

    # Collect the frames and concatenate once at the end; concatenating inside
    # the loop copies the accumulated data on every iteration.
    frames = []
    for (path, _dirs, files) in os.walk(folder_location):
        for file in tqdm(files):
            path_file = os.path.join(path, file)
            lowered = path_file.lower()

            ## Load the data
            if lowered.endswith('xlsx'):
                df_sub = pd.read_excel(path_file)
            elif lowered.endswith('csv'):
                df_sub = pd.read_csv(path_file)
            else:
                # Unsupported file type: nothing to load for this file.
                continue

            ## Optionally tag the rows with the source folder name
            if folder_name:
                df_sub['Folder_Name'] = os.path.basename(path)

            frames.append(df_sub)

    ## Combine
    if not frames or concat_axis not in ('row', 'col'):
        return pd.DataFrame()
    return pd.concat(frames, axis=1 if concat_axis == 'col' else 0)

# Hyperparameters
## Folder (keyword) names excluded from the final BigKinds dataset
DELETE_KEYWORD = [
    '100세',
    '거주환경',
]
## Top-level BigKinds categories to keep
CATEGORY_BK = [
    '경제',
    '사회',
    '문화',
    '국제',
]
## BigKinds sub-categories to keep, grouped by top-level category
CATEGORY_BK_Sub = [
    # Economy
    '경제>경제일반',
    '경제>국제경제',
    '경제>취업_창업',
    # Society
    '사회>노동_복지',
    '사회>사건_사고',
    '사회>사회일반',
    '사회>여성',
    '사회>장애인',
    '사회>의료_건강',
    # Culture
    '문화>미술_건축',
    '문화>요리_여행',
    '문화>출판',
    # International
    '국제>중국',
    '국제>유럽_EU',
    '국제>일본',
    '국제>미국_북미',
    '국제>중동_아프리카',
    '국제>아시아',
    '국제>중남미',
    '국제>국제일반',
    '국제>러시아',
]
## Naver news categories to keep
CATEGORY_CR = [
    '세계',
    '경제',
    '생활/문화',
    '오피니언',
    '사회',
    'IT/과학',
]
## Column used to derive the year / year-month / era grouping columns
COLNAME_CATEGORY = '일자'
## Column holding the text that is mined
COLNAME_MINING = '제목'
### Point to a folder outside the repository so the data is not uploaded to GitHub.
# The example below is the Data folder on the local PC desktop.
SAVE_LOCATION = r'C:\Users\KK\Desktop\Data'    # home

# BigKinds

## Raw Data

In [11]:
# Load every BigKinds file and tag each row with the folder it came from
df_news = get_data_from_path(os.path.join(os.getcwd(), 'Data', 'BigKinds'), folder_name=True)
# Preprocessing
## Remove duplicated articles
df_news.drop_duplicates(subset=['뉴스 식별자', '언론사', COLNAME_MINING], inplace=True, ignore_index=True)
## Drop columns that are not needed
colname_delete = ['뉴스 식별자', '인물', '위치', '기관', '기고자', '통합 분류2', '통합 분류3',
                  '사건/사고 분류1', '사건/사고 분류2', '사건/사고 분류3',
                  '키워드', '특성추출(가중치순 상위 50개)', 'URL', '분석제외 여부']
df_news = df_news[[col for col in df_news.columns if col not in colname_delete]]
## Category filter: keep rows whose top-level category (text before '>') is in CATEGORY_BK.
## dropna()/str() prevent a missing or non-string category from raising on .split().
category_filter = [each for each in df_news['통합 분류1'].dropna().unique()
                   if str(each).split('>')[0] in CATEGORY_BK]
df_news = df_news[df_news['통합 분류1'].isin(category_filter)].reset_index(drop=True)
df_news['Category'] = df_news['통합 분류1'].apply(lambda x: str(x).split('>')[0])
## Clean the text of the mining column
df_news[COLNAME_MINING] = df_news[COLNAME_MINING].progress_apply(lambda x: text_preprocessor(x, del_number=False, del_bracket_content=False))
## Remove missing values and empty strings
df_news = df_news[df_news[COLNAME_MINING].notnull()].reset_index(drop=True)
df_news = df_news[df_news[COLNAME_MINING].str.len() != 0].reset_index(drop=True)

# Date conversion (parse the date column once and reuse it)
news_date = pd.to_datetime(df_news[COLNAME_CATEGORY].astype(str))
## Year
df_news[COLNAME_CATEGORY+'_Year'] = news_date.dt.year
## Year + month
df_news[COLNAME_CATEGORY+'_YearMonth'] = news_date.dt.strftime('%Y-%m')
## Era: the first four characters of the raw date decide the group;
## every year outside 2013-2017 falls into '2018 ~ 2023'.
early_years = {'2013', '2014', '2015', '2016', '2017'}
df_news[COLNAME_CATEGORY+'_Era'] = df_news[COLNAME_CATEGORY].apply(
    lambda x: '2013 ~ 2017' if str(x)[:4] in early_years else '2018 ~ 2023')

# Age-group flag: keep the title when it contains ' 20대' ... ' 90대', otherwise 0
age_pattern = re.compile(' [2-9]0대')
df_news['Age_eval'] = df_news['제목'].apply(lambda x: x if age_pattern.search(x) is not None else 0)

# Add sentiment labels
df_news_sentiment = get_data_from_path(os.path.join(os.getcwd(), 'Data', 'Sentiment'), folder_name=False)
## Sort by the saved row number, then drop the first column
## (assumed to be 'Unnamed: 0' — TODO confirm) so only the label column remains.
df_news_sentiment = df_news_sentiment.sort_values(by='Unnamed: 0').reset_index(drop=True).iloc[:, 1:]
df_news_sentiment.columns = ['Sentiment']
## Label value 2 is mapped to 'Positive'; every other value to 'Negative'
df_news_sentiment['Sentiment'] = df_news_sentiment.Sentiment.apply(lambda x: 'Positive' if x==2 else 'Negative')
df_news_sentiment['Positive'] = df_news_sentiment.Sentiment.apply(lambda x: 1 if x=='Positive' else 0)
df_news_sentiment['Negative'] = df_news_sentiment.Sentiment.apply(lambda x: -1 if x=='Negative' else 0)
## NOTE(review): the column-wise concat aligns on the 0..n-1 index, so it presumes the
## sentiment rows are in the same order (and number) as df_news at this point — confirm.
df_news = pd.concat([df_news, df_news_sentiment], axis=1)
## Final duplicate removal: per press + title, then per title alone
df_news.drop_duplicates(subset=['언론사', COLNAME_MINING], inplace=True, ignore_index=True)
df_news.drop_duplicates(subset=[COLNAME_MINING], inplace=True, ignore_index=True)

# Save
df_news.to_csv(os.path.join(SAVE_LOCATION, 'df_news_bigkinds.csv'), index=False, encoding='utf-8-sig')

In [12]:
# The filtered result below is what is used from here on
df_news = pd.read_csv(os.path.join(SAVE_LOCATION, 'df_news_bigkinds.csv'))
print('Category: ', df_news.Category.value_counts())

## Keep only rows whose source folder is not listed in DELETE_KEYWORD
keyword_filter = [name for name in df_news['Folder_Name'].unique() if name not in DELETE_KEYWORD]
df_news = df_news.loc[df_news['Folder_Name'].isin(keyword_filter)].reset_index(drop=True)

## Keep only rows whose category is one of the selected sub-categories
category_filter = [name for name in df_news['통합 분류1'].unique() if name in CATEGORY_BK_Sub]
df_news = df_news.loc[df_news['통합 분류1'].isin(category_filter)].reset_index(drop=True)
print('Category: ', df_news.Category.value_counts())

Category:  Category
경제    182058
사회    175598
문화     96737
국제     37249
Name: count, dtype: int64
Category:  Category
사회    143555
경제     46072
국제     35535
문화     21051
Name: count, dtype: int64


## Keyword Frequency

In [None]:
# # 연도데이터 기준 전처리
# wf_year_soy, waf_year_soy, \
# wf_year_tf, waf_year_tf, \
# wf_year_kb, waf_year_kb = preprocessing_wordfreq(df_news, colname_target=COLNAME_MINING, colname_category=COLNAME_CATEGORY+'_Year',
#                                                  num_showkeyword=10, save_local=True, save_name='wordfreq_year')

# # 연도그룹데이터 기준 전처리
# wf_era_soy, waf_era_soy, \
# wf_era_tf, waf_era_tf, \
# wf_era_kb, waf_era_kb = preprocessing_wordfreq(df_news, colname_target=COLNAME_MINING, colname_category=COLNAME_CATEGORY+'_Era',
#                                                num_showkeyword=10, save_local=True, save_name='wordfreq_era')
# ## 안되면 아래줄 실행
# if wf_era_tf.shape[0] == 0:
#     wf_era_tf = wf_tf.copy()
#     wf_era_tf.category.apply(lambda x: '2013 ~ 2017' if x in ['2013', '2014', '2015', '2016', '2017']
#                                                       else '2018 ~ 2023')
#     wf_era_tf = wf_era_tf.groupby(list(wf_era_tf.columns[:2])).mean().reset_index()
#     waf_era_tf = pd.DataFrame()
#     for category in tqdm(sorted(df_news[COLNAME_CATEGORY+'_Era'].unique())):
#         df_sub = df_news[df_news[COLNAME_CATEGORY+'_Era'] == category]
#         waf_era = preprocessing_adjwordcount(wf_era_tf[['word']], df_sub[COLNAME_MINING], num_showkeyword=5)
#         waf_era['category'] = str(category)
#         waf_era = waf_era[['category']+list(waf_era.columns[:-1])]
#         waf_era_tf = pd.concat([waf_era_tf, waf_era], axis=0, ignore_index=True)
#     save_name = os.path.join(os.getcwd(), 'Data', 'word_freq_tfidf_era.csv')
#     wf_era_tf.to_csv(save_name, index=False, encoding='utf-8-sig')
#     save_name = os.path.join(os.getcwd(), 'Data', 'wordadj_freq_tfidf_era.csv')
#     waf_era_tf.to_csv(save_name, index=False, encoding='utf-8-sig')
# ######################

# # 연도감성데이터 기준 전처리
# wf_senti_soy, waf_senti_soy, \
# wf_senti_tf, waf_senti_tf, \
# wf_senti_kb, waf_senti_kb = preprocessing_wordfreq(df_news, colname_target=COLNAME_MINING, colname_category='Sentiment', 
#                                                    num_showkeyword=10, save_local=True, save_name='wordfreq_senti')

In [13]:
# Load the saved word-frequency tables from ./Data/WordFreq.
# Each file list is ordered (soynlp, soynlp-adj, tfidf, tfidf-adj, keybert, keybert-adj)
# and is unpacked into variables in that same order, so the keybert tables get
# their own names instead of overwriting the tfidf ones.
wordfreq_dir = os.path.join(os.getcwd(), 'Data', 'WordFreq')

# Load: grouped by year
save_name_list=['wordfreq_year_soynlp.csv', 'wordfreq_year_soynlpadj.csv',
                'wordfreq_year_tfidf.csv', 'wordfreq_year_tfidfadj.csv',
                'wordfreq_year_keybert.csv', 'wordfreq_year_keybertadj.csv']
(wf_year_soynlp, waf_year_soynlp,
 wf_year_tfidf, waf_year_tfidf,
 wf_year_keybert, waf_year_keybert) = [pd.read_csv(os.path.join(wordfreq_dir, save_name))
                                       for save_name in save_name_list]

# Load: grouped by era
save_name_list=['wordfreq_era_soynlp.csv', 'wordfreq_era_soynlpadj.csv',
                'wordfreq_era_tfidf.csv', 'wordfreq_era_tfidfadj.csv',
                'wordfreq_era_keybert.csv', 'wordfreq_era_keybertadj.csv']
(wf_era_soynlp, waf_era_soynlp,
 wf_era_tfidf, waf_era_tfidf,
 wf_era_keybert, waf_era_keybert) = [pd.read_csv(os.path.join(wordfreq_dir, save_name))
                                     for save_name in save_name_list]

# Load: grouped by sentiment
save_name_list=['wordfreq_senti_soynlp.csv', 'wordfreq_senti_soynlpadj.csv',
                'wordfreq_senti_tfidf.csv', 'wordfreq_senti_tfidfadj.csv',
                'wordfreq_senti_keybert.csv', 'wordfreq_senti_keybertadj.csv']
(wf_senti_soynlp, waf_senti_soynlp,
 wf_senti_tfidf, waf_senti_tfidf,
 wf_senti_keybert, waf_senti_keybert) = [pd.read_csv(os.path.join(wordfreq_dir, save_name))
                                         for save_name in save_name_list]

## Correlation of Keyword Frequency

In [None]:
# # 관련성 전처리
# wf_corr_soynlp = preprocessing_wordfreq_to_corr(wf_era_soynlp, df_news, colname_target=COLNAME_MINING, colname_category=COLNAME_CATEGORY+'_Era', num_showkeyword=100, save_name='word_corrpair_soynlp_era.csv')
# waf_corr_soynlp = preprocessing_wordfreq_to_corr(waf_era_soynlp, df_news, colname_target=COLNAME_MINING, colname_category=COLNAME_CATEGORY+'_Era', num_showkeyword=100, save_name='wordadj_corrpair_soynlp_era.csv')
# wf_corr_tfidf = preprocessing_wordfreq_to_corr(wf_era_tfidf, df_news, colname_target=COLNAME_MINING, colname_category=COLNAME_CATEGORY+'_Era', num_showkeyword=100, save_name='word_corrpair_tfidf_era.csv')
# waf_corr_tfidf = preprocessing_wordfreq_to_corr(waf_era_tfidf, df_news, colname_target=COLNAME_MINING, colname_category=COLNAME_CATEGORY+'_Era', num_showkeyword=100, save_name='wordadj_corrpair_tfidf_era.csv')

# wf_senti_corr_soynlp = preprocessing_wordfreq_to_corr(wf_senti_soynlp, df_news, colname_target=COLNAME_MINING, colname_category='Sentiment', num_showkeyword=100, save_name='word_corrpair_soynlp_senti.csv')
# waf_senti_corr_soynlp = preprocessing_wordfreq_to_corr(waf_senti_soynlp, df_news, colname_target=COLNAME_MINING, colname_category='Sentiment', num_showkeyword=100, save_name='wordadj_corrpair_soynlp_senti.csv')
# # wf_senti_corr_tfidf = preprocessing_wordfreq_to_corr(wf_senti_tfidf, df_news, colname_target=COLNAME_MINING, colname_category='Sentiment', num_showkeyword=100, save_name='word_corrpair_tfidf_senti.csv')
# # waf_senti_corr_tfidf = preprocessing_wordfreq_to_corr(waf_senti_tfidf, df_news, colname_target=COLNAME_MINING, colname_category='Sentiment', num_showkeyword=100, save_name='wordadj_corrpair_tfidf_senti.csv')


In [None]:
# # 불러오기
# save_name_list=['word_corrpair_soynlp_era.csv', 'wordadj_corrpair_soynlp_era.csv', 
#                 'word_corrpair_tfidf_era.csv', 'wordadj_corrpair_tfidf_era.csv']
# save_name = os.path.join(os.getcwd(), 'Data', save_name_list[0])
# wf_corr_soynlp = pd.read_csv(save_name)
# save_name = os.path.join(os.getcwd(), 'Data', save_name_list[1])
# waf_corr_soynlp = pd.read_csv(save_name)
# save_name = os.path.join(os.getcwd(), 'Data', save_name_list[2])
# wf_corr_tfidf = pd.read_csv(save_name)
# save_name = os.path.join(os.getcwd(), 'Data', save_name_list[3])
# waf_corr_tfidf = pd.read_csv(save_name)

# save_name_list=['word_corrpair_soynlp_senti.csv', 'wordadj_corrpair_soynlp_senti.csv', 
#                 'word_corrpair_tfidf_senti.csv', 'wordadj_corrpair_tfidf_senti.csv']
# save_name = os.path.join(os.getcwd(), 'Data', save_name_list[0])
# wf_senti_corr_soynlp = pd.read_csv(save_name)
# save_name = os.path.join(os.getcwd(), 'Data', save_name_list[1])
# waf_senti_corr_soynlp = pd.read_csv(save_name)
# # save_name = os.path.join(os.getcwd(), 'Data', save_name_list[2])
# # wf_senti_corr_tfidf = pd.read_csv(save_name)
# # save_name = os.path.join(os.getcwd(), 'Data', save_name_list[3])
# # waf_senti_corr_tfidf = pd.read_csv(save_name)

# Naver Crawling

## Raw Data

In [None]:
# Load every NaverNews file and tag each row with the folder it came from
df_news = get_data_from_path(os.path.join(os.getcwd(), 'Data', 'NaverNews'), folder_name=True)

# Filtering
## Remove duplicates (same press and title)
df_news.drop_duplicates(subset=['Press', 'Title'], inplace=True, ignore_index=True)
## Drop columns that are not needed
colname_delete = ['Content', 'URL_Origin']
kept_columns = [col for col in df_news.columns if col not in colname_delete]
df_news = df_news[kept_columns]
## Category filter
category_filter = [name for name in df_news['Category'].unique() if name in CATEGORY_CR]
df_news = df_news.loc[df_news['Category'].isin(category_filter)].reset_index(drop=True)
## Press de-duplication: keep only the text before '언론사 선정'
df_news['Press'] = df_news['Press'].progress_apply(lambda x: str(x).split('언론사 선정')[0])

# Date conversion
df_news['Date'] = pd.to_datetime(df_news['Date'])
date_parsed = pd.to_datetime(df_news['Date'].astype(str))
## Year
df_news['Date_Year'] = date_parsed.dt.year
## Year + month
df_news['Date_YearMonth'] = date_parsed.dt.strftime('%Y-%m')
## Era: every year outside 2013-2017 falls into '2018 ~ 2023'
early_years = ('2013', '2014', '2015', '2016', '2017')
df_news['Date_Period'] = df_news['Date_Year'].apply(
    lambda x: '2013 ~ 2017' if str(x)[:4] in early_years else '2018 ~ 2023')
df_news = df_news[['Folder_Name', 'Date', 'Date_Year', 'Date_YearMonth', 'Date_Period',
                   'Press', 'Category', 'Title', 'Comment', 'URL_Naver']]

# Preprocessing
df_news['Title'] = df_news['Title'].progress_apply(
    lambda x: text_preprocessor(x, del_number=False, del_bracket_content=False))
## Remove missing and empty titles
df_news = df_news[df_news['Title'].notnull()].reset_index(drop=True).copy()
df_news = df_news[df_news['Title'].str.len() != 0].reset_index(drop=True)
## Keep comments of length >= 5, then keep articles with at least 5 such comments
df_news['Comment'] = df_news['Comment'].progress_apply(
    lambda x: [comment for comment in literal_eval(x) if len(comment) >= 5])
df_news = df_news[df_news['Comment'].progress_apply(lambda x: len(x) >= 5)]

# Press filter
## Remove presses averaging fewer than 5 comments and presses outside the top 100 by article count
df_news['Comment_Len'] = df_news['Comment'].apply(len)
df_temp = df_news.groupby('Press')['Comment_Len'].mean()
del_press = df_temp[df_temp < 5].index.tolist()
del_press = del_press + df_news['Press'].value_counts().index[100:].tolist()
df_news = df_news[~df_news['Press'].isin(del_press)].reset_index(drop=True)
df_news = df_news.drop(columns='Comment_Len')

# One row per comment
df_news_explode = df_news.explode('Comment')

# Save
df_news.to_csv(os.path.join(SAVE_LOCATION, 'df_news_crawling.csv'), index=False, encoding='utf-8-sig')
df_news_explode.to_csv(os.path.join(SAVE_LOCATION, 'df_news_explode_crawling.csv'), index=False, encoding='utf-8-sig')

In [None]:
## Load the saved Naver crawling results
df_news_nv = pd.read_csv(os.path.join(SAVE_LOCATION, 'df_news_crawling.csv'))
category_counts = df_news_nv['Category'].value_counts()
print('Category: ', category_counts)
## Comment-level (exploded) version of the same data
df_newse_nv = pd.read_csv(os.path.join(SAVE_LOCATION, 'df_news_explode_crawling.csv'))