In [2]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import pickle
import os
from time import time
import pandas as pd
import numpy as np
from mylib.tokenizer import tokenizer, tokenizer_porter
import nltk

github_path = 'C:/Users/taehee/Documents/GitHub/tripReviewAnalysisSystem/'
#engine='python' 을 넣어줘야 판다스가 한국어로 된 폴더명을 인식한다.

In [3]:
def model_tfidf(df, train_size, c_value):
    '''
    입력 : text와 sentiment가 포함된 리뷰들의 데이터 프레임
    출력 : 훈련이 끝난 분류기와 정확도를 리턴.
    '''

    X = df.loc[:, 'text'].values
    y = df.loc[:, 'sentiment'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=1 - train_size,
                                                        random_state=14,
                                                        stratify=y)

    tfidf = TfidfVectorizer(lowercase=False, tokenizer=tokenizer)
    Ir_tfidf = Pipeline([('vect', tfidf), ('clf', LogisticRegression(C=c_value, penalty='l2', random_state=0, solver='liblinear'))])

    Ir_tfidf.fit(X_train, y_train)
    y_pred = Ir_tfidf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print('정확도: %.3f' % test_accuracy)

    return Ir_tfidf, test_accuracy



In [4]:
def all_reviews(github_path, file_list, size):
    result = pd.read_csv(github_path + '크롤러-전처리/원시자료/' + file_list[0], encoding='utf-8', engine='python', index_col=0)
    stime = time()
    print('데이터 로딩 start')

    for file_path in file_list[:size]:
        new_data = pd.read_csv(github_path + '크롤러-전처리/원시자료/' + file_path, encoding='utf-8', engine='python',
                               index_col=0)
        # 새 데이터를 리스트로 합쳐서 concat시키면 행이 계속 추가 된다.
        # ignore_index로 기존 인덱스를  무시하고 합친 결과를 다시 인덱스해준다. 이거 안 하면 index must be monotonic increasing or decreasing 에러가 뜰 것이다.
        result = pd.concat([result, new_data], ignore_index=True, sort=False)

    print('데이터 로딩 종료: 소요시간 [%d]초' % (time() - stime))

    result['sentiment'] = np.where(result.star_point > 3, '1', '0')

    return result

In [16]:
# 리뷰 리스트 불러오기기
file_list = os.listdir(github_path + '크롤러-전처리/원시자료/')

In [18]:
for file_name in file_list:
    result = pd.read_csv(github_path + '크롤러-전처리/원시자료/' + file_name, encoding='utf-8', engine='python')
    null_text = result.isnull().sum()['text']
    if null_text != 0:
        print(file_name + " : "+ str(null_text))

63 City.txt : 28
Alive Museum Insadong.txt : 22
An Jung-geun Memorial Hall.txt : 1
Ansan Jarak-gil.txt : 3
apm Place.txt : 2
Arario Museum in Space.txt : 9
Baek In-je's House.txt : 9
Banpodaegyo Bridge.txt : 1
Bongeunsa Temple.txt : 44
Bugaksan Seoul Fortress.txt : 11
Bukchon Hanok Village.txt : 54
Changgyeonggung Palace.txt : 36
Cheonggyecheon Stream.txt : 32
Cheongwadae Sarangchae.txt : 1
Children’s Museum of National Museum of Korea.txt : 1
Coex Aquarium.txt : 54
COEX Center.txt : 1
Culture Station Seoul 284.txt : 13
D Museum.txt : 3
Daelim Museum.txt : 5
Deoksugung.txt : 57
Dongdaemun Design Plaza (DDP).txt : 41
Dongdaemun Gate (Heunginjimun).txt : 35
Dongdaemun History & Culture Park.txt : 13
Dongdaemun Seonggwak Park.txt : 2
Dongdaemun Shopping Complex.txt : 59
Doota.txt : 22
Dosan Neighborhood Park.txt : 3
Dream Forest.txt : 3
Eungbongsan Mountain.txt : 2
Ewha Woman's University.txt : 1
Ewha Womans University.txt : 47
Express Bus Terminal Shopping Center.txt : 1
Figure Museum.tx

In [19]:
data_all = all_reviews(github_path, file_list, len(file_list))

데이터 로딩 start
데이터 로딩 종료: 소요시간 [2]초


In [29]:
data_all.iloc[414,:]['title']

'I had fun taking photos here! just more than trick art. A Recommended visit place in Insadong'

In [30]:
data_all.isnull().sum()

star_point               0
title                    3
text                  2916
Date of experience       0
sentiment                0
dtype: int64

In [32]:
nan_index = data_all['text'].isna()
nan_index[33410:33420]

33410     True
33411    False
33412    False
33413    False
33414    False
33415    False
33416     True
33417    False
33418    False
33419     True
Name: text, dtype: bool

In [33]:
data_all[nan_index].head()

Unnamed: 0,star_point,title,text,Date of experience,sentiment
166,3.0,63 City,,July 2019,0
167,4.0,Worth a visit for the view,,March 2019,1
170,5.0,Great view on the top.,,May 2018,1
175,3.0,Tall building,,October 2016,0
181,3.0,Only outside,,July 2016,0


In [34]:
not_nan_index = data_all['text'].notna()

In [37]:
filtered_data = data_all[not_nan_index]
filtered_data.head()

Unnamed: 0,star_point,title,text,Date of experience,sentiment
0,3.0,nothing special,just a tall building in a residential area. no...,February 2020,0
1,4.0,City Views,"Had lunch on 59th floor, outstanding views of ...",October 2019,1
2,3.0,Great Views,Stopped by here as it was along my travel plan...,April 2019,0
3,3.0,Just a tall building,I don't know what I was expecting but this was...,August 2018,0
4,3.0,63 City,Took a 10-15 mins walk from the Yeoinaru stati...,July 2019,0


In [38]:
len(filtered_data)

30510

In [39]:
model = model_tfidf(filtered_data, train_size=0.7, c_value=1.0)

정확도: 0.867
