In [73]:
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
import pickle
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer

In [74]:
# Load the six pickle shards and combine them into a single DataFrame.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load shard files that come from a trusted source.
shards = []
for i in range(0,6):
    with open("abstract_%s.p" %i,"rb") as f:
        shards.append(pickle.load(f))

# Concatenate once at the end: calling pd.concat inside the loop would
# re-copy every previously loaded row for each additional shard.
total_data = pd.concat(shards)

In [75]:
# Reorder the columns
total_data = total_data.loc[:, ['ISSN', 'date', 'topic', 'title', 'abstract']]
total_data.shape

(49712, 5)

In [76]:
# Reset the index to consecutive integers starting at 0
total_data.index = range(len(total_data))

In [77]:
# Remove records that have no abstract.
# A boolean mask filters rows directly, so this stays correct even if index
# labels happen to repeat (e.g. when the index has not been reset yet).
total_data = total_data[total_data['abstract'] != ""]
# Drop every row that still has a missing value in any column
total_data = total_data.dropna(axis=0)

In [78]:
# Shape after removing rows with an empty abstract or any missing value
total_data.shape

(44939, 5)

In [79]:
# Reset the index to consecutive integers starting at 0
total_data.index = range(len(total_data))

In [80]:
def _english_text_tools():
    """Return the English stop-word set and Snowball stemmer, built on first use.

    The pair is stored on the function object so that each process constructs
    it only once instead of once per abstract.
    """
    tools = getattr(_english_text_tools, '_tools', None)
    if tools is None:
        # Membership tests against a set are much faster than against a list.
        tools = (frozenset(stopwords.words('english')),
                 SnowballStemmer('english'))
        _english_text_tools._tools = tools
    return tools


def abstract_to_words( raw_abstract ):
    """Normalize one raw abstract into a space-separated string of stemmed tokens.

    Steps: strip HTML markup, replace every character that is not an ASCII
    letter with a space, lower-case and split on whitespace, drop English
    stop words, then stem each remaining word with the English Snowball
    stemmer.

    Args:
        raw_abstract: Abstract text, possibly containing HTML markup.

    Returns:
        str: The stemmed, stop-word-free tokens joined by single spaces.
    """
    stops, stemmer = _english_text_tools()
    # 1. Strip HTML
    abstract_text = bs(raw_abstract, 'html.parser').get_text()
    # 2. Replace every non-letter character with a space
    letters_only = re.sub('[^a-zA-Z]', ' ', abstract_text)
    # 3. Lower-case and tokenize on whitespace
    words = letters_only.lower().split()
    # 4. Remove stop words and stem what is left
    stemming_words = [stemmer.stem(w) for w in words if w not in stops]
    # 5. Join the tokens back into one space-separated string
    return ' '.join(stemming_words)

In [81]:
from multiprocessing import Pool
import numpy as np

def _apply_df(args):
    df, func, kwargs = args
    return df.apply(func, **kwargs)

def apply_by_multiprocessing(df, func, **kwargs):
    """Apply ``func`` to ``df`` in parallel by splitting it across worker processes.

    Args:
        df: pandas Series or DataFrame to process.
        func: Callable passed to ``.apply`` on every chunk.
        **kwargs: Must contain ``workers``, used both as the number of
            processes and as the number of chunks. All remaining keyword
            arguments are forwarded to ``.apply``.

    Returns:
        The per-chunk results combined with ``pd.concat``, in chunk order.

    Raises:
        KeyError: If ``workers`` is not supplied.
    """
    # Pull the worker count out so it is not forwarded to .apply
    workers = kwargs.pop('workers')
    # Split the row positions (a plain ndarray) and slice with .iloc, rather
    # than handing the pandas object itself to np.array_split.
    position_chunks = np.array_split(np.arange(len(df)), workers)
    tasks = [(df.iloc[positions], func, kwargs)
             for positions in position_chunks]
    # The context manager releases the worker processes even if map() raises.
    with Pool(processes=workers) as pool:
        result = pool.map(_apply_df, tasks)
    # Combine the per-chunk results and return them
    return pd.concat(result)

In [82]:
# Clean every abstract in parallel; %time reports how long the call takes.
# workers=32 is used both as the process count and as the number of chunks.
%time clean_train_abstract = apply_by_multiprocessing(\
    total_data['abstract'], abstract_to_words, workers=32)

CPU times: user 292 ms, sys: 327 ms, total: 620 ms
Wall time: 18.2 s


In [83]:
# Preview the last 10 cleaned abstracts
clean_train_abstract.tail(10)

44929    abstract curv space difficult perceiv analyz e...
44930    abstract paper describ obliq high level fast t...
44931    abstract paper investig visual geometr algorit...
44932    cover volumetr ray trace silicon crystal super...
44933    abstract build principl prior work procedur te...
44934    abstract network critic modern societi thoroug...
44935    abstract paper propos scheme perform volum ren...
44936    abstract paper describ new method visual analy...
44937    abstract hemicub estim form factor base finit ...
44938    abstract far problem global illumin calcul alm...
Name: abstract, dtype: object

In [84]:
# Rename the original abstract column
total_data = total_data.rename(columns={'abstract':'raw_abstract'})

In [85]:
# Add the cleaned abstracts as a new column (aligned on the index)
total_data = total_data.assign(clean_train_abstract=clean_train_abstract)

In [87]:
# Export the resulting DataFrame as a pickle file
pd.to_pickle(total_data, 'clean_train_abstract')