In [1]:
import pandas as pd
import re
from datetime import datetime
from soyspacing.countbase import CountSpace

In [2]:
def remove_patterns(text, patterns: list):
    """Replace every match of each regular expression in ``patterns`` with a space.

    Parameters
    ----------
    text : pandas.Series
        Series of strings to clean. The input Series is not modified.
    patterns : list
        Regular-expression patterns, applied one after another in list order.

    Returns
    -------
    pandas.Series
        A new Series in which each match has been replaced by a single space.

    Raises
    ------
    AssertionError
        If ``text`` is not a Series or ``patterns`` is not a list.
    """
    assert isinstance(text, pd.Series)
    assert isinstance(patterns, list)

    for pattern in patterns:
        # regex=True is spelled out on purpose: the default of
        # Series.str.replace switched to literal matching in pandas 2.0,
        # which would silently stop these patterns from matching.
        text = text.str.replace(pattern, " ", regex=True)

    return text


def preprocess_lv1(text):
    """First-pass cleanup: lowercase the text and blank out boilerplate tokens.

    The substitutions run in a fixed order on the lowercased text, and every
    match is replaced by a single space. Runs of spaces are NOT collapsed here.

    Parameters
    ----------
    text : pandas.Series
        Series of strings. Missing values are passed through unchanged.

    Returns
    -------
    pandas.Series
        A new Series with the substitutions applied.

    Raises
    ------
    AssertionError
        If ``text`` is not a pandas Series.
    """
    assert isinstance(text, pd.Series)

    # (pattern, replacement) pairs, applied top to bottom. Raw strings keep
    # "\s" and "\d" from being parsed as (invalid) Python escape sequences.
    substitutions = [
        (r"[\s]", " "),                    # any whitespace character -> plain space
        (r"[.]{2,}", " "),                 # runs of two or more dots
        (r"아쉬운점\s*\d*", " "),           # "아쉬운점" (weak point) label + optional number
        (r"좋은점\s*\d*", " "),             # "좋은점" (good point) label + optional number
        (r"글자수\s*\d*자\d*byte", " "),    # "글자수 N자 Mbyte" (character count) footer
        # Runs of two or more "o" (text is already lowercased).
        # NOTE(review): presumably meant for masked names such as "OOO", but it
        # also hits English words containing "oo" -- confirm this is intended.
        (r"o{2,}", " "),
    ]

    text = text.str.lower()

    for pattern, replacement in substitutions:
        # Vectorized and NaN-safe, unlike calling re.sub on every element.
        text = text.str.replace(pattern, replacement, regex=True)

    return text


def preprocess_lv2(text, only_hangul=False):
    """Second-pass cleanup: keep only allowed characters and collapse whitespace.

    Every character outside the allowed set is replaced by a space, then each
    run of whitespace is squeezed into a single space. Leading and trailing
    spaces are not stripped.

    Parameters
    ----------
    text : pandas.Series
        Series of strings. Missing values are passed through unchanged.
    only_hangul : bool, default False
        If True, keep Hangul only (jamo and syllables). If False, also keep
        lowercase ASCII letters and digits. Uppercase letters are never in the
        allowed set, so lowercase the text beforehand if they must survive.

    Returns
    -------
    pandas.Series
        A new Series with the filtered text.

    Raises
    ------
    AssertionError
        If ``text`` is not a pandas Series.
    """
    assert isinstance(text, pd.Series)

    if only_hangul:
        disallowed = "[^ㄱ-ㅎㅏ-ㅣ가-힣]"
    else:
        disallowed = "[^ㄱ-ㅎㅏ-ㅣ가-힣a-z0-9]"

    # Vectorized and NaN-safe, unlike calling re.sub on every element.
    text = text.str.replace(disallowed, " ", regex=True)
    text = text.str.replace(r"\s+", " ", regex=True)

    return text


def correct_spacing(train, test, file_path=None):
    """Train a ``CountSpace`` model on ``train`` and re-space both Series with it.

    ``train`` is first written to a text file, because the model is trained
    from a file path rather than from in-memory data.

    Parameters
    ----------
    train : pandas.Series
        Sentences used to train the model; also corrected and returned.
    test : pandas.Series
        Sentences corrected with the model trained on ``train``.
    file_path : str, optional
        Where to write the training corpus; must end with ".txt". Defaults to
        "correct_spacing_train_<YYYYMMDD>.txt" in the working directory. An
        existing file at that path is overwritten.

    Returns
    -------
    tuple of pandas.Series
        ``(train, test)`` with each element replaced by the first item of
        ``model.correct(element)``.

    Raises
    ------
    AssertionError
        If an input is not a Series or ``file_path`` does not end with ".txt".
    """
    for series in (train, test):
        assert isinstance(series, pd.Series)

    if file_path is None:
        file_path = "correct_spacing_train_" + datetime.today().strftime("%Y%m%d") + ".txt"
    else:
        assert file_path.endswith(".txt")

    # Write the corpus as plain UTF-8 text, one sentence per line.
    # Series.to_csv would apply CSV quoting, wrapping any sentence that holds
    # a comma or quote in '"' characters that then leak into the training data.
    with open(file_path, "w", encoding="utf-8") as corpus:
        for sentence in train:
            corpus.write(str(sentence) + "\n")

    model = CountSpace()
    model.train(file_path)

    # model.correct returns a sequence; only its first item is kept.
    train = train.apply(lambda sentence: model.correct(sentence)[0])
    test = test.apply(lambda sentence: model.correct(sentence)[0])

    return train, test


In [3]:
# Load jobkorea_all.csv (path is relative to the current working directory)
# and display its column index to see which fields are available.
data = pd.read_csv("jobkorea_all.csv")
data.columns

Index(['질문', '답변', '조언', '스펙', '평가', '총평', '주소', '회사명', '지원시기', '근무형태',
       '직무분야'],
      dtype='object')

In [33]:
# Two-stage cleanup of the "답변" (answer) column; rows with missing text are
# dropped first. Per the original note, the "조언" (advice) column can be run
# through the same pipeline.
test = preprocess_lv2(preprocess_lv1(data["답변"].dropna()))

In [46]:
# Split each "스펙" (spec) entry on newlines, then keep items 1 .. -3,
# i.e. drop the first item and the last two.
# NOTE(review): presumably the dropped items are header/footer text -- confirm
# against the raw data.
test2 = data["스펙"].str.split("\n").apply(lambda items: items[1:-2])

In [52]:
# Show the spec list with the most items.
test2[test2.apply(len).idxmax()]

['지방4년',
 '국제무역학과',
 '학점 3.97/4.5',
 '토익 960',
 '토스 Level7',
 '오픽 IH',
 '자격증 3개',
 '해외경험 3회',
 '인턴 1회',
 '수상 1회',
 '동아리 1회',
 '교내활동 1회',
 '사회활동 1회',
 '자원봉사 1회']

In [51]:
# Show the spec list with the fewest items.
test2[test2.apply(len).idxmin()]

['고졸']