In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split

from transformers import TFBertModel, BertTokenizer
from transformers import TFBertForSequenceClassification

import warnings
warnings.filterwarnings('ignore')

In [100]:
df_fashion = pd.read_csv("../data/01. 패션.csv", encoding='utf-8', sep=",")
df_cosmetic = pd.read_csv("../data/02. 화장품.csv", encoding='utf-8', sep=",")
df_appliance = pd.read_csv("../data/03. 가전.csv", encoding='utf-8', sep=",")
df_it = pd.read_csv("../data/04. IT기기.csv", encoding='utf-8', sep=",")

result = pd.concat([df_fashion, df_cosmetic, df_appliance, df_it])

In [5]:
#ReviewScore 80, 4 이상 긍정적, 50이하, 2이하 부정적으로 지정

def tokenize_korean_text(text): 
  text_filtered = re.sub('[^,.?!\w\s]','', text)

  okt = Okt() 
  Okt_morphs = okt.pos(text_filtered) 

  words = []
  for word, pos in Okt_morphs:
    if pos == 'Adjective' or pos == 'Verb' or pos == 'Noun':
      words.append(word)

  words_str = ' '.join(words)
  return words_str

#### label 은 평점 기준으로 긍정이면 1, 부정이면 0으로 라벨링 지정

#### 학습용 데이터로 가공
- 평점 8 이상 혹은 3 이하만 저장 (8 이상: 긍정적, 3 이하: 부정적)
- 각 text를 tokenize한 후, 동사, 형용사, 명사만 저장 (konlpy의 Okt 사용)

In [78]:
df = df_it
df['label'] = 0
df['label'] = df['Score_change'].apply(lambda x: 1 if x > 3.5 else 0)

df['Score_change'] = df['ReviewScore'].replace({100:5,90:4.5,80:4.0,70:3.5,60:3.0,50:2.5,40:2.0,30:1.5,20:1.0,10:0.5,0:0})

NameError: name 'df_itdf' is not defined

In [10]:
X_texts = []
y = []

for Score_change, RawText in zip(df['Score_change'], df['RawText']):
  tokenized_comment = tokenize_korean_text(RawText)  # 위에서 만들었던 함수로 comment 쪼개기
  X_texts.append(tokenized_comment)
  y.append(1 if Score_change > 3.5 else 0)

# train_test_split
X_train_texts, X_test_texts, y_train, y_test = train_test_split(X_texts, y, test_size=0.2, random_state=0)

In [75]:
df['label'].value_counts()

label
1    39021
0     2336
Name: count, dtype: int64

In [14]:
# CountVectorizer로 vector화
tf_vectorizer = CountVectorizer(min_df=1, ngram_range=(1,1))
X_train_tf = tf_vectorizer.fit_transform(X_train_texts)  # training data에 맞게 fit & training data를 transform
X_test_tf = tf_vectorizer.transform(X_test_texts) # test data를 transform

vocablist = [word for word, number in sorted(tf_vectorizer.vocabulary_.items(), key=lambda x:x[1])]  # 단어들을 번호 기준 내림차순으로 저장

In [17]:
logistic_model = LogisticRegression(C=0.1, penalty='l2', random_state=0)
logistic_model.fit(X_train_tf, y_train)  # 학습

In [18]:
LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [73]:
y_test_pred = logistic_model.predict(X_test_tf)

print('Misclassified samples: {} out of {}'.format((y_test_pred != y_test).sum(), len(y_test)))
print(f'Accuracy: {accuracy_score(y_test, y_test_pred)}')

Misclassified samples: 472 out of 8272
Accuracy: 0.9429400386847195


In [20]:
y_pred_series = pd.Series(y_test_pred)
value_counts = y_pred_series.value_counts()
print(value_counts)

1    8192
0      80
Name: count, dtype: int64


In [21]:
coefficients = logistic_model.coef_.tolist()
sorted_coefficients = sorted(enumerate(coefficients[0]), key=lambda x:x[1], reverse=True)
# coefficients(계수)가 큰 값부터 내림차순으로 정렬

print('긍정적인 단어 Top 10 (높은 평점과 상관관계가 강한 단어들)')
for word_num, coef in sorted_coefficients[:10]:
  print('{0:}({1:.3f})'.format(vocablist[word_num], coef))

print('\n부정적인 단어 Top 10 (낮은 평점과 상관관계가 강한 단어들)')
for word_num, coef in sorted_coefficients[-10:]: 
  print('{0:}({1:.3f})'.format(vocablist[word_num], coef))

긍정적인 단어 Top 10 (높은 평점과 상관관계가 강한 단어들)
최고(0.830)
좋아요(0.768)
좋습니다(0.766)
만족해요(0.743)
좋네요(0.726)
만족(0.669)
만족합니다(0.660)
만족스러워요(0.641)
감사해요(0.625)
감사합니다(0.604)

부정적인 단어 Top 10 (낮은 평점과 상관관계가 강한 단어들)
화나네요(-0.680)
안되서(-0.684)
별로(-0.695)
불만족(-0.736)
비추(-0.753)
반품(-0.762)
짜증나네요(-0.792)
안되고(-0.818)
실망(-1.053)
최악(-1.086)


In [22]:
# 긍정/부정 테스트용 함수 생성
def guess_good_or_bad(model, text):
    text_filtered = text.replace('.', '').replace(',', '').replace("'", "").replace('·', ' ').replace('=', '')
    okt = Okt()  # Corrected line
    Okt_morphs = okt.pos(text_filtered)

    words = []
    for word, pos in Okt_morphs:
        if pos == 'Adjective' or pos == 'Verb' or pos == 'Noun':
            words.append(word)
    words_str = ' '.join(words)

    # Assuming you have defined tf_vectorizer and model somewhere in your code
    new_text_tf = tf_vectorizer.transform([words_str])
    result = model.predict(new_text_tf)[0]

    if result == 1:
        print('긍정')
    else:
        print('부정')

In [23]:
guess_good_or_bad(logistic_model, '안되서 화나네요 최악입니다')

부정


In [24]:
guess_good_or_bad(logistic_model, '좋아요 최고')

긍정


### 모델 저장

In [25]:
import joblib

In [26]:
model_filename = 'logistic_model.joblib'
vectorizer_filename = 'vectorizer.joblib'
joblib.dump(logistic_model, model_filename)
joblib.dump(tf_vectorizer, vectorizer_filename)

['vectorizer.joblib']

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [28]:
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train_tf, y_train)  # 학습
y_test_pred = naive_bayes_model.predict(X_test_tf)
print(f'Accuracy: {accuracy_score(y_test, y_test_pred)}')

Accuracy: 0.940522243713733


In [29]:
y_pred_series = pd.Series(y_test_pred)
value_counts = y_pred_series.value_counts()
print(value_counts)

1    8052
0     220
Name: count, dtype: int64


In [30]:
# Access feature log probabilities
feature_log_probs = naive_bayes_model.feature_log_prob_

# Calculate the difference between positive and negative log probabilities
log_prob_diff = feature_log_probs[1] - feature_log_probs[0]

# Create a list of (word, log_prob_diff) tuples
word_log_prob_diff = list(zip(vocablist, log_prob_diff))

# Sort the list by log_prob_diff in descending order
sorted_word_log_prob_diff = sorted(word_log_prob_diff, key=lambda x: x[1], reverse=True)

print('긍정적인 단어 Top 10 (높은 평점과 상관관계가 강한 단어들)')
for word, log_prob_diff in sorted_word_log_prob_diff[:10]:
    print('{0}: {1:.3f}'.format(word, log_prob_diff))

print('\n부정적인 단어 Top 10 (낮은 평점과 상관관계가 강한 단어들)')
for word, log_prob_diff in sorted_word_log_prob_diff[-10:]:
    print('{0}: {1:.3f}'.format(word, log_prob_diff))


긍정적인 단어 Top 10 (높은 평점과 상관관계가 강한 단어들)
감사해요: 3.818
감사합니다: 3.467
다니기: 2.876
안심: 2.870
편리하고: 2.822
꼼꼼하게: 2.690
매력: 2.654
이쁩니다: 2.639
효율: 2.570
기뻐요: 2.562

부정적인 단어 Top 10 (낮은 평점과 상관관계가 강한 단어들)
받지도: -3.652
보내라: -3.652
빵점: -3.652
억지로: -3.652
인쇄물: -3.652
참조: -3.652
못잡고: -3.876
성질: -3.876
저장장치: -3.876
콜센터: -3.876


In [31]:
guess_good_or_bad(naive_bayes_model, '좋아요 최고')

긍정


In [32]:
guess_good_or_bad(naive_bayes_model, '좋아요 최고')

긍정


In [33]:
model_filename = 'naive_bayes_model.joblib'
vectorizer_filename = 'vectorizer.joblib'
joblib.dump(naive_bayes_model, model_filename)
joblib.dump(tf_vectorizer, vectorizer_filename)

['vectorizer.joblib']

#### GPU를 사용할 수 있는지 확인

In [106]:
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    print("GPU is available")
else:
    print("GPU is not available")

GPU is not available


### BERT model 토크나이저


In [39]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#### 토큰화
def encode(data, tokenizer):
    input_ids = []
    attention_masks = []
    token_type_ids = []
 
    for text in data:
        tokenized_text = tokenizer.encode_plus(text,
                                            max_length=50,
                                            add_special_tokens = True,
                                            pad_to_max_length=True,
                                            return_attention_mask=True,
                                              truncation=True)
        
        input_ids.append(tokenized_text['input_ids'])
        attention_masks.append(tokenized_text['attention_mask'])
        token_type_ids.append(tokenized_text['token_type_ids'])
    
    return input_ids, attention_masks, token_type_ids

### BERT 모델 입력을 위한 형태로 처리 

#딕셔너리 형태로 변환해서 출력 
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    return {
      "input_ids": input_ids,
      "token_type_ids": token_type_ids,
      "attention_mask": attention_masks,
      }, label
      
 #데이터를 BERT에 넣을 수 있는 형태로 변경 
def data_encode(input_ids_list, attention_mask_list, token_type_ids_list, label_list):
    return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)

## 1. 데이터 전처리
- 평점에 따라 긍1, 부0으로 변경

In [107]:
result['Score_change'] = result['ReviewScore'].replace({100:5,90:4.5,80:4.0,70:3.5,60:3.0,50:2.5,40:2.0,30:1.5,20:1.0,10:0.5,0:0})
result['label'] = result['Score_change'].apply(lambda x: 1 if x > 3.5 else 0)

In [110]:
result['Domain'].unique()

array(['패션', '화장품', '가전', 'IT기기'], dtype=object)

-----

### 1-1 패션 데이터

In [102]:
df = result[result['Domain']== '패션']
# train_test_split
X_train_texts, X_test_texts, y_train, y_test = train_test_split(df['RawText'], df['label'], test_size=0.2, random_state=0)

# 레이블을 정수형으로 변환
y_train = [int(label) for label in y_train]
y_test = [int(label) for label in y_test]

# 레이블을 넘파이 배열로 변환
y_train = np.array(y_train)
y_test = np.array(y_test)

print(X_train_texts.shape, X_test_texts.shape, y_train.shape, y_test.shape)
pd.Series(y_train).value_counts()

(35996,) (9000,) (35996,) (9000,)


1    28952
0     7044
Name: count, dtype: int64

##### encode 함수를 정의해서 토큰화를 실시한다. 
여기서는 token_type_ids 정보도 추출하는데, 이는 각 토큰의 문장 임베딩 정보를 포함하고 있다. 여기서는 리뷰가 한개씩 입력되지만, 원래 BERT모델은 두 개의 문장을 입력받기 때문에 동일한 구조로 사용하기 위해서 해당 정보도 추출한다. 

In [40]:
BATCH_SIZE = 32

#학습데이터 토큰화
train_input_ids, train_attention_masks, train_token_type_ids = encode(X_train_texts, tokenizer)
#테스트데이터 토큰화
test_input_ids, test_attention_masks, test_token_type_ids = encode(X_test_texts, tokenizer)

# 학습 데이터셋 생성
train_data_encoded = data_encode(train_input_ids, train_attention_masks, train_token_type_ids, y_train).shuffle(10000).batch(BATCH_SIZE)
# 테스트 데이터셋 생성
test_data_encoded = data_encode(test_input_ids, test_attention_masks, test_token_type_ids, y_test).batch(BATCH_SIZE)

#### 모델 학습 GPU

In [None]:
# 모델 정의
with tf.device('/GPU:0'):
    BERT_model = TFBertForSequenceClassification.from_pretrained(
        "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
        num_labels = 2, # The number of output labels--2 for binary classification.
    )

    # 올바른 패키지에서 가져온 옵티마이저 사용
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    BERT_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    # 모델 훈련
    NUM_EPOCHS = 5
    history = BERT_model.fit(train_data_encoded, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, validation_data=test_data_encoded)

# 모델 저장
BERT_model.save("bert_model_fashion")

In [61]:
# 모델 불러오기
loaded_model = tf.keras.models.load_model("bert_model_fashion")
loaded_model.summary()

Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
Total params: 109482240 (417.64 MB)
Trainable params: 109482240 (417.64 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


------

### 1-2 화장품 데이터

In [None]:
df = result[result['Domain']== '화장품']
# train_test_split
X_train_texts, X_test_texts, y_train, y_test = train_test_split(df['RawText'], df['label'], test_size=0.2, random_state=0)

# 레이블을 정수형으로 변환
y_train = [int(label) for label in y_train]
y_test = [int(label) for label in y_test]

# 레이블을 넘파이 배열로 변환
y_train = np.array(y_train)
y_test = np.array(y_test)

print(X_train_texts.shape, X_test_texts.shape, y_train.shape, y_test.shape)
pd.Series(y_train).value_counts()

In [None]:
#학습데이터 토큰화
train_input_ids, train_attention_masks, train_token_type_ids = encode(X_train_texts, tokenizer)
#테스트데이터 토큰화
test_input_ids, test_attention_masks, test_token_type_ids = encode(X_test_texts, tokenizer)

# 학습 데이터셋 생성
train_data_encoded = data_encode(train_input_ids, train_attention_masks, train_token_type_ids, y_train).shuffle(10000).batch(BATCH_SIZE)
# 테스트 데이터셋 생성
test_data_encoded = data_encode(test_input_ids, test_attention_masks, test_token_type_ids, y_test).batch(BATCH_SIZE)

# 모델 정의
with tf.device('/GPU:0'):
    BERT_model = TFBertForSequenceClassification.from_pretrained(
        "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
        num_labels = 2, # The number of output labels--2 for binary classification.
    )

    # 올바른 패키지에서 가져온 옵티마이저 사용
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    BERT_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    # 모델 훈련
    NUM_EPOCHS = 5
    history = BERT_model.fit(train_data_encoded, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, validation_data=test_data_encoded)

# 모델 저장
BERT_model.save("bert_model_fashion")

# 모델 불러오기
loaded_model = tf.keras.models.load_model("bert_model_fashion")
loaded_model.summary()

## 라이브러리 버전

In [None]:
# 출처
# https://yeong-jin-data-blog.tistory.com/entry/BERT%EB%A1%9C-%EC%98%81%ED%99%94-%EB%A6%AC%EB%B7%B0-%EA%B0%90%EC%84%B1-%EB%B6%84%EC%84%9D%ED%95%98%EA%B8%B0

In [88]:
import pkg_resources

# 설치된 패키지 목록 불러오기
installed_packages = pkg_resources.working_set

# 패키지 이름과 버전 출력
for package in installed_packages:
    print(f"{package.key}=={package.version}")


babel==2.11.0
brotli==1.0.9
jpype1==1.5.0
jinja2==3.1.2
markdown==3.6
markupsafe==2.1.3
pyqt5==5.15.10
pyqt5-sip==12.13.0
pysocks==1.7.1
pyyaml==6.0.1
pygments==2.15.1
qtpy==2.4.1
send2trash==1.8.2
absl-py==2.1.0
anyio==3.5.0
archspec==0.2.1
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
asttokens==2.0.5
astunparse==1.6.3
async-lru==2.0.4
attrs==23.1.0
beautifulsoup4==4.12.2
bleach==4.1.0
boltons==23.0.0
certifi==2023.11.17
cffi==1.16.0
charset-normalizer==2.0.4
click==8.1.7
colorama==0.4.6
comm==0.1.2
conda==23.11.0
conda-content-trust==0.2.0
conda-libmamba-solver==23.12.0
conda-package-handling==2.2.0
conda-package-streaming==0.9.0
contourpy==1.2.0
cryptography==41.0.7
cycler==0.12.1
debugpy==1.6.7
decorator==5.1.1
defusedxml==0.7.1
distro==1.8.0
executing==0.8.3
fastjsonschema==2.16.2
filelock==3.13.1
flatbuffers==23.5.26
fonttools==4.47.2
fsspec==2024.2.0
gast==0.5.4
google-pasta==0.2.0
grpcio==1.62.1
h5py==3.10.0
huggingface-hub==0.22.2
idna==3.4
ipykernel==6.25.0
ipython==8.20.

babel==2.11.0
brotli==1.0.9
jpype1==1.5.0
jinja2==3.1.2
markdown==3.6
markupsafe==2.1.3
pyqt5==5.15.10
pyqt5-sip==12.13.0
pysocks==1.7.1
pyyaml==6.0.1
pygments==2.15.1
qtpy==2.4.1
send2trash==1.8.2
absl-py==2.1.0
anyio==3.5.0
archspec==0.2.1
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
asttokens==2.0.5
astunparse==1.6.3
async-lru==2.0.4
attrs==23.1.0
beautifulsoup4==4.12.2
bleach==4.1.0
boltons==23.0.0
certifi==2023.11.17
cffi==1.16.0
charset-normalizer==2.0.4
click==8.1.7
colorama==0.4.6
comm==0.1.2
conda==23.11.0
conda-content-trust==0.2.0
conda-libmamba-solver==23.12.0
conda-package-handling==2.2.0
conda-package-streaming==0.9.0
contourpy==1.2.0
cryptography==41.0.7
cycler==0.12.1
debugpy==1.6.7
decorator==5.1.1
defusedxml==0.7.1
distro==1.8.0
executing==0.8.3
fastjsonschema==2.16.2
filelock==3.13.1
flatbuffers==23.5.26
fonttools==4.47.2
fsspec==2024.2.0
gast==0.5.4
google-pasta==0.2.0
grpcio==1.62.1
h5py==3.10.0
huggingface-hub==0.22.2
idna==3.4
ipykernel==6.25.0
ipython==8.20.0
ipywidgets==8.0.4
jedi==0.18.1
joblib==1.3.2
json5==0.9.6
jsonpatch==1.32
jsonpointer==2.1
jsonschema==4.19.2
jsonschema-specifications==2023.7.1
jupyter==1.0.0
jupyter-client==8.6.0
jupyter-console==6.6.3
jupyter-core==5.5.0
jupyter-events==0.8.0
jupyter-lsp==2.2.0
jupyter-server==2.10.0
jupyter-server-terminals==0.4.4
jupyterlab==4.0.8
jupyterlab-pygments==0.1.2
jupyterlab-server==2.25.1
jupyterlab-widgets==3.0.9
keras==3.1.1
kiwisolver==1.4.5
konlpy==0.6.0
libclang==18.1.1
libmambapy==1.5.3
lightgbm==4.2.0
lxml==5.1.1
markdown-it-py==3.0.0
matplotlib==3.8.2
matplotlib-inline==0.1.6
mdurl==0.1.2
mediapipe==0.10.9
menuinst==2.0.1
mistune==2.0.4
mkl-fft==1.3.8
mkl-random==1.2.4
mkl-service==2.4.0
ml-dtypes==0.3.2
mpmath==1.3.0
namex==0.0.7
nbclient==0.8.0
nbconvert==7.10.0
nbformat==5.9.2
nest-asyncio==1.5.6
networkx==3.2.1
nltk==3.8.1
notebook==7.0.6
notebook-shim==0.2.3
numpy==1.26.3
opencv-contrib-python==4.9.0.80
opencv-python==4.9.0.80
opt-einsum==3.3.0
optree==0.11.0
overrides==7.4.0
packaging==23.1
pandas==2.1.4
pandocfilters==1.5.0
parso==0.8.3
pillow==10.2.0
pip==23.3.1
platformdirs==3.10.0
pluggy==1.0.0
ply==3.11
prometheus-client==0.14.1
prompt-toolkit==3.0.43
protobuf==3.20.3
psutil==5.9.0
pure-eval==0.2.2
pyopenssl==23.2.0
pycosat==0.6.6
pycparser==2.21
pyparsing==3.1.1
python-dateutil==2.8.2
python-json-logger==2.0.7
pytz==2023.3.post1
pywin32==305.1
pywinpty==2.0.10
pyzmq==25.1.0
qtconsole==5.5.0
referencing==0.30.2
regex==2023.12.25
requests==2.31.0
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rich==13.7.1
rpds-py==0.10.6
ruamel.yaml==0.17.21
safetensors==0.4.2
scikit-learn==1.3.2
scipy==1.11.4
seaborn==0.13.1
setuptools==68.2.2
sip==6.7.12
six==1.16.0
sniffio==1.2.0
sounddevice==0.4.6
soupsieve==2.5
stack-data==0.2.0
sympy==1.12
tensorboard==2.16.2
tensorboard-data-server==0.7.2
tensorflow==2.16.1
tensorflow-intel==2.16.1
tensorflow-io-gcs-filesystem==0.31.0
termcolor==2.4.0
terminado==0.17.1
tf-keras==2.16.0
threadpoolctl==3.2.0
tinycss2==1.2.1
tokenizers==0.15.2
torch==2.2.0
torchaudio==2.2.0
torchvision==0.17.0
tornado==6.3.3
tqdm==4.65.0
traitlets==5.7.1
transformers==4.39.2
truststore==0.8.0
typing-extensions==4.9.0
tzdata==2023.4
urllib3==1.26.18
wcwidth==0.2.5
webencodings==0.5.1
websocket-client==0.58.0
werkzeug==3.0.1
wheel==0.41.2
widgetsnbextension==4.0.5
win-inet-pton==1.1.0
wordcloud==1.9.3
wrapt==1.16.0
zstandard==0.19.0