<a href="https://colab.research.google.com/github/zzhenxi/study-NLP-with-PyTorch/blob/main/%5BChapter3%5D_%EB%A0%88%EC%8A%A4%ED%86%A0%EB%9E%91_%EB%A6%AC%EB%B7%B0_%EA%B0%90%EC%84%B1_%EB%B6%84%EB%A5%98_(%EB%8D%B0%EC%9D%B4%ED%84%B0_%EC%A0%84%EC%B2%98%EB%A6%AC).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 데이터 불러오기

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

In [106]:
args = Namespace(
    raw_train_dataset_csv='/content/drive/MyDrive/my_dataset/yelp_review/raw_train.csv',
    raw_test_dataset_csv='/content/drive/MyDrive/my_dataset/yelp_review/raw_test.csv',
    proportion_subset_of_train=0.1,
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv='/content/drive/MyDrive/my_dataset/yelp_review/my_reviews_with_splits_lite.csv', # my 추가함 (여기서 생성한 파일)
    seed=1337
)

In [4]:
# 원본 데이터 읽기 
train_reviews = pd.read_csv(args.raw_train_dataset_csv, header=None, names=['rating', 'review'])

In [94]:
train_reviews.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [95]:
train_reviews.rating.value_counts()

1    280000
2    280000
Name: rating, dtype: int64

In [96]:
# 리뷰 클래스 비율이 동일하도록 만듭니다
# defaultdict(list) : default 값이 빈 리스트 []인 딕셔너리 

# rating끼리 모으기
by_rating = collections.defaultdict(list)
for _, row in train_reviews.iterrows():
  by_rating[row.rating].append(row.to_dict())

review_subset = []

for _, item_list in sorted(by_rating.items()):

  n_total = len(item_list)
  n_subset = int(args.proportion_subset_of_train * n_total)
  review_subset.extend(item_list[:n_subset]) # 자세히는 모르겠지만,, 일부를 추출해서 또 추가해주는 것 같다 (비율을 맞추기 위해)

review_subset = pd.DataFrame(review_subset)

In [97]:
review_subset.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,1,I don't know what Dr. Goldberg was like before...
2,1,I'm writing this review to give you a heads up...
3,1,Wing sauce is like water. Pretty much a lot of...
4,1,Owning a driving range inside the city limits ...


In [98]:
train_reviews.rating.value_counts()

1    280000
2    280000
Name: rating, dtype: int64

In [99]:
# 고유 클래스 
set(review_subset.rating)

{1, 2}

In [100]:
# 훈련, 검증, 테스트를 만들기 위해 별점을 기준으로 나눕니다.
by_rating = collections.defaultdict(list)
for _, row in review_subset.iterrows():
    by_rating[row.rating].append(row.to_dict())

# 분할 데이터 만들기 
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_rating.items()):
  np.random.shuffle(item_list)

  n_total = len(item_list)

  n_total = len(item_list)
  n_train = int(args.train_proportion * n_total)
  n_val = int(args.val_proportion * n_total)
  n_test = int(args.test_proportion * n_total)

  # 데이터 포인터에 분할 속성(열)을 추가합니다
  for item in item_list[:n_train]:
    item['split'] = 'train'

  for item in item_list[n_train:n_train+n_val]:
    item['split'] = 'val'
  
  for item in item_list[n_train+n_val:n_val+n_test]:
    item['split'] = 'test'
  
  # 최종 리스트에 추가
  final_list.extend(item_list)


In [101]:
# 분할 데이터를 데이터 프레임으로 만듭니다
final_reviews = pd.DataFrame(final_list)

In [102]:
final_reviews.split.value_counts()

train    39200
val       8400
Name: split, dtype: int64

In [103]:
# 리뷰 전처리
def preprocess_text(text):
  text = text.lower()
  text = re.sub(r"([.,!?])", r" \1 ", text)
  text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
  return text

final_reviews.review = final_reviews.review.apply(preprocess_text)

In [104]:
final_reviews['rating'] = final_reviews.rating.apply({1: 'negative', 2: 'positive'}.get) # 아직 이해를 못함! get.. 뭘까?

In [105]:
final_reviews.head()

Unnamed: 0,rating,review,split
0,negative,terrible place to work for i just heard a stor...,train
1,negative,"hours , minutes total time for an extremely s...",train
2,negative,my less than stellar review is for service . w...,train
3,negative,i m granting one star because there s no way t...,train
4,negative,the food here is mediocre at best . i went aft...,train


In [107]:
final_reviews.to_csv(args.output_munged_csv, index=False)