# Data Preparation for Sentiment Classification of Restaurant Reviews

Use a subset of the full dataset

In [2]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

In [3]:
args = Namespace(                                 # Create key, value pairs, and access values later on, e.g., args.seed
    raw_train_dataset_csv="data/yelp/raw_train.csv",
    raw_test_dataset_csv="data/yelp/raw_test.csv",
    proportion_subset_of_train=0.01,
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv="data/yelp/reviews_with_splits_lite.csv",
    #seed=1337
    seed=1339
)

In [4]:
# Read raw data
train_reviews = pd.read_csv(args.raw_train_dataset_csv, header=None, names=['rating', 'review']) # no header in the file

In [5]:
train_reviews.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [6]:
train_reviews.rating.value_counts()  # full data

rating
1    280000
2    280000
Name: count, dtype: int64

In [7]:
# making the subset equal across the review classes
by_rating = collections.defaultdict(list)
for _, row in train_reviews.iterrows():
    by_rating[row.rating].append(row.to_dict())

# by_rating will be like below:
#  {
#   1 : [{'rating': 1, 'review': 'Just ...'}, {'rating': 1, 'review': 'But ...'}, ...... ], 
#   2 : [{'rating': 2, 'review': 'How ...'}, {'rating': 2, 'review': 'Here ...'}, ...... ]
#  }

In [8]:
review_subset = []

for _, item_list in sorted(by_rating.items()):

    n_total = len(item_list)
    n_subset = int(args.proportion_subset_of_train * n_total)  # use only 1% data
    review_subset.extend(item_list[:n_subset])

review_subset = pd.DataFrame(review_subset)

In [9]:
review_subset.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,1,I don't know what Dr. Goldberg was like before...
2,1,I'm writing this review to give you a heads up...
3,1,Wing sauce is like water. Pretty much a lot of...
4,1,Owning a driving range inside the city limits ...


In [10]:
review_subset.rating.value_counts()  # subset data

rating
1    2800
2    2800
Name: count, dtype: int64

In [11]:
# Unique classes
set(review_subset.rating)

{1, 2}

In [12]:
# Splitting the subset by rating to create our new train, val, and test splits
by_rating = collections.defaultdict(list)
for _, row in review_subset.iterrows():
    by_rating[row.rating].append(row.to_dict())
    
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_rating.items()):

    np.random.shuffle(item_list)
    
    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    n_test = int(args.test_proportion * n_total)
    
    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
        
    for item in item_list[n_train+n_val:n_train+n_val+n_test]:
        item['split'] = 'test'

    # Add to final list
    final_list.extend(item_list) # use extend when adding multiple elements to a list; use append for additing one element.

In [13]:
# Write split data to file
final_reviews = pd.DataFrame(final_list)

In [14]:
final_reviews.split.value_counts()

split
train    3918
val       840
test      840
Name: count, dtype: int64

In [15]:
# Preprocess the reviews
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)     # E.g., convert "end." to "end . "; \1 indicates a matched character.
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)  # replace special character strings with empty string
    return text
    
final_reviews.review = final_reviews.review.apply(preprocess_text)

In [16]:
final_reviews['rating'] = final_reviews.rating.apply({1: 'negative', 2: 'positive'}.get)

In [17]:
final_reviews.head()

Unnamed: 0,rating,review,split
0,negative,. stars ? ? ? ? really ? ? ? ? ? based on wha...,train
1,negative,"this bar smells like a smoky urinal cake , all...",train
2,negative,you know it s bad when a mcdonald s gets a sta...,train
3,negative,me and my family were in charlotte for my soro...,train
4,negative,very inconsistent .,train


In [18]:
final_reviews.to_csv(args.output_munged_csv, index=False)  # don't write row names (index).