<a href="https://colab.research.google.com/github/vignesh12c/DataViz_analytics_portfolio/blob/main/YelpReviewClassification/yelpDatasetPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone repo
!git clone https://github.com/delip/PyTorchNLPBook.git

# Move to data directory
%cd PyTorchNLPBook/chapters/chapter_3/data

# (Optional) Make sure download.py is there
!ls

# Install dependencies
!pip install requests

# Make script executable
!chmod +x get-all-data.sh

# Run script
!bash get-all-data.sh


Cloning into 'PyTorchNLPBook'...
remote: Enumerating objects: 179, done.[K
remote: Counting objects: 100% (90/90), done.[K
remote: Compressing objects: 100% (33/33), done.[K
remote: Total 179 (delta 72), reused 57 (delta 57), pack-reused 89 (from 1)[K
Receiving objects: 100% (179/179), 7.94 MiB | 21.39 MiB/s, done.
Resolving deltas: 100% (95/95), done.
/content/PyTorchNLPBook/chapters/chapter_3/data
download.py  get-all-data.sh  README.md
  if len(sys.argv) is not 3:
Trying to fetch /content/PyTorchNLPBook/chapters/chapter_3/data/yelp/raw_train.csv
1it [00:00, 3530.56it/s]
  if len(sys.argv) is not 3:
Trying to fetch /content/PyTorchNLPBook/chapters/chapter_3/data/yelp/raw_test.csv
848it [00:00, 5207.86it/s]
  if len(sys.argv) is not 3:
Trying to fetch /content/PyTorchNLPBook/chapters/chapter_3/data/yelp/reviews_with_splits_lite.csv
1217it [00:00, 3885.43it/s]
  if len(sys.argv) is not 3:
Trying to fetch /content/PyTorchNLPBook/chapters/chapter_3/data/surnames/surnames.csv
6it [00:

In [None]:
import collections
import numpy as np
import pandas as pd
import re
from argparse import Namespace

In [None]:
# Paths and split settings used by the preprocessing cells below.
args = Namespace(
    # Train and test must come from different files: loading raw_test.csv for both
    # puts every test review into the train/val splits as well.
    # NOTE(review): the download log shows raw_train.csv arriving in a single chunk
    # ("1it") -- verify that file is complete before running the rest of the notebook.
    raw_train_dataset = '/content/PyTorchNLPBook/chapters/chapter_3/data/yelp/raw_train.csv',
    raw_test_dataset = '/content/PyTorchNLPBook/chapters/chapter_3/data/yelp/raw_test.csv',
    # NOTE(review): proportion_ratio is not read by any cell visible in this notebook;
    # confirm whether a subsampling step was intended.
    proportion_ratio = 0.1,
    # Share of each rating class assigned to the train / val splits.
    train_proportion = 0.7,
    val_proportion = 0.3,
    output_munged_csv = '/content/PyTorchNLPBook/chapters/chapter_3/data/yelp/reviews_with_splits_lite.csv',
    # Seed passed to np.random.seed before shuffling.
    seed = 1337
)

In [None]:
# @title
def _read_reviews(csv_path):
  """Read a header-less two-column CSV and keep only rows that have review text."""
  frame = pd.read_csv(csv_path, header=None, names=['rating', 'reviews'])
  has_text = frame.reviews.notnull()
  return frame[has_text]

train_reviews = _read_reviews(args.raw_train_dataset)
test_reviews = _read_reviews(args.raw_test_dataset)

In [None]:
# Preview the first rows of the training frame (columns: rating, reviews).
train_reviews.head()

Unnamed: 0,rating,reviews
0,1,Ordered a large Mango-Pineapple smoothie. Stay...
1,2,Quite a surprise! \n\nMy wife and I loved thi...
2,1,"First I will say, this is a nice atmosphere an..."
3,2,I was overall pretty impressed by this hotel. ...
4,1,Video link at bottom review. Worst service I h...


In [None]:
# Preview the first rows of the test frame (columns: rating, reviews).
test_reviews.head()

Unnamed: 0,rating,reviews
0,1,Ordered a large Mango-Pineapple smoothie. Stay...
1,2,Quite a surprise! \n\nMy wife and I loved thi...
2,1,"First I will say, this is a nice atmosphere an..."
3,2,I was overall pretty impressed by this hotel. ...
4,1,Video link at bottom review. Worst service I h...


In [None]:
# Distinct rating codes present in the training frame.
set(train_reviews.rating)

{1, 2}

In [None]:

# Bucket the training rows by rating code so each class can be split on its own.
# to_dict('records') produces the same {'rating': ..., 'reviews': ...} dict per row,
# in the same order, as iterrows() + row.to_dict(), without building a Series per row.
by_rating = collections.defaultdict(list)
for record in train_reviews.to_dict('records'):
  by_rating[record['rating']].append(record)

In [None]:
# Build the train/val portion of the output: each rating class is shuffled and then
# cut into a leading 'train' slice and a following 'val' slice.
final_list = []
np.random.seed(args.seed)

# When the two proportions add up to 1, validation takes whatever remains after the
# training slice, so truncation in int() cannot leave a row without a split label.
covers_everything = np.isclose(args.train_proportion + args.val_proportion, 1.0)

for _, item_list in sorted(by_rating.items()):
  # Shuffle a copy: the permutation drawn is the same, but by_rating keeps its
  # original order, so re-running this cell reproduces the same split.
  shuffled = list(item_list)
  np.random.shuffle(shuffled)

  n_total = len(shuffled)
  n_train = int(args.train_proportion * n_total)
  if covers_everything:
    n_val = n_total - n_train
  else:
    n_val = int(args.val_proportion * n_total)

  # Emit fresh dicts instead of tagging the shared ones in place. Rows past
  # n_train + n_val (only possible when the proportions sum to less than 1) are
  # left out rather than written with a missing split value.
  final_list.extend({**item, 'split': 'train'} for item in shuffled[:n_train])
  final_list.extend({**item, 'split': 'val'} for item in shuffled[n_train:n_train + n_val])

In [None]:
# Tag every row of the test frame as 'test' and append it after the train/val rows.
# to_dict('records') gives the same per-row dicts as iterrows() + row.to_dict()
# without constructing a Series for each row.
# NOTE: this appends to final_list, so re-run the split cell first if this cell is re-run.
final_list.extend(
  {**record, 'split': 'test'} for record in test_reviews.to_dict('records')
)


In [None]:
# Turn the accumulated list of row dicts into a single frame.
final_reviews = pd.DataFrame(final_list)

In [None]:
# Number of rows carrying each split label.
final_reviews.split.value_counts()

Unnamed: 0_level_0,count
split,Unnamed: 1_level_1
test,38000
train,26600
val,11400


In [None]:
# Preprocess the reviews
def preprocess_text(text):
    """Normalise one review string for tokenisation.

    The text is lower-cased, each of ``. , ! ?`` is padded with spaces so it
    stands alone as a token, and every run of other non-letter characters
    (digits, apostrophes, whitespace, ...) is collapsed to a single space.

    Args:
        text: the raw review. ``None`` or a float NaN (a missing value) is
            treated as an empty review.

    Returns:
        The cleaned string; ``""`` for a missing value.
    """
    # A missing cell arrives as None or float NaN (NaN != NaN); return an empty
    # review instead of printing it and then failing on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = text.lower()
    # Surround sentence punctuation with spaces so it separates from words.
    text = re.sub(r"([.,!?])", r" \1 ", text)
    # Replace every run of characters outside letters and . , ! ? with one space.
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

# Clean every review with preprocess_text and store the result back in the column.
cleaned_reviews = final_reviews['reviews'].apply(preprocess_text)
final_reviews['reviews'] = cleaned_reviews

In [None]:
# Replace the numeric rating codes with label strings (1 -> negative, 2 -> positive).
# The labels also map to themselves so that re-running this cell leaves an already
# converted column intact instead of overwriting it with None; any other value still
# becomes None, as before.
rating_to_label = {1: 'negative', 2: 'positive'}
rating_to_label.update({label: label for label in list(rating_to_label.values())})
final_reviews['rating'] = final_reviews.rating.apply(rating_to_label.get)

In [None]:
# Preview the processed frame (columns: rating, reviews, split).
final_reviews.head()

Unnamed: 0,rating,reviews,split
0,negative,check in was long even at night . once through...,train
1,negative,i don t even want to give them a one star . . ...,train
2,negative,this panda express location is a difficult one...,train
3,negative,i really wanted to like this place but i guess...,train
4,negative,my friend and i were walking the strip waiting...,train


In [None]:
# Write the processed frame to args.output_munged_csv without the row index.
# NOTE(review): that path is the same reviews_with_splits_lite.csv the download
# script fetched, so this write replaces the downloaded copy -- confirm that is intended.
final_reviews.to_csv(args.output_munged_csv, index=False)