In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from utils import *
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
import nltk
import re

## Data Preparation


In [2]:
# Load the raw Yelp review file. `load_json_data` is not defined in this notebook —
# presumably it comes from `from utils import *`; its result is used as a DataFrame below.
reviews = load_json_data('../../data/raw/yelp_academic_dataset_review.json')

In [3]:
# Keep only the review text and its star rating (the target class).
reviews = reviews[['text', 'stars']]

In [5]:
# Cast the target class to an integer dtype.
# `assign` returns a new frame instead of writing into the column subset taken
# from the raw data, so the cell is safe to re-run and cannot emit pandas'
# SettingWithCopyWarning (which an in-place column write on a subset may do
# while the parent frame is still referenced).
reviews = reviews.assign(stars=reviews['stars'].astype('int'))

In [6]:
# Preview the first five rows to confirm the column selection and the integer cast.
reviews.head(n=5)

Unnamed: 0,text,stars
0,"If you decide to eat here, just be aware it is...",3
1,I've taken a lot of spin classes over the year...,5
2,Family diner. Had the buffet. Eclectic assortm...,3
3,"Wow! Yummy, different, delicious. Our favo...",5
4,Cute interior and owner (?) gave us tour of up...,4


### Splitting the data
Create a class-balanced training set by undersampling, while leaving the validation and test sets with their original, imbalanced class distribution.

In [7]:
# Hold out 45,000 reviews for testing first ...
df_train_raw, df_test = train_test_split(
    reviews,
    test_size=45000,
    shuffle=True,
    random_state=39,
)
# ... then carve another 45,000 validation reviews out of what remains.
df_train_raw, df_val = train_test_split(
    df_train_raw,
    test_size=45000,
    shuffle=True,
    random_state=39,
)

In [8]:
def undersample(df, class_size, label_col='stars', random_state=39):
    """Return a shuffled, class-balanced subset of ``df``.

    For every distinct value in ``label_col``, ``class_size`` rows are drawn
    without replacement; the per-class draws are concatenated and the result
    is shuffled. Rows keep their original index labels.

    Args:
        df: DataFrame containing the ``label_col`` column.
        class_size: Number of rows to keep for each class.
        label_col: Name of the column holding the class label.
        random_state: Seed used for both the per-class draw and the final
            shuffle.

    Returns:
        A new DataFrame with ``class_size`` rows per class.

    Raises:
        ValueError: If any class has fewer than ``class_size`` rows.
    """
    class_counts = df[label_col].value_counts()

    # Fail before sampling anything and name the offending classes, rather
    # than erroring part-way through the loop on the first class that is too small.
    too_small = class_counts[class_counts < class_size]
    if not too_small.empty:
        shortfall = ', '.join(
            f'{label} ({int(count)} rows)' for label, count in too_small.items()
        )
        raise ValueError(
            f'class_size={class_size} exceeds the rows available for '
            f'class(es): {shortfall}'
        )

    # Iterate in value_counts order: the seeded final shuffle depends on the
    # concatenation order, so changing it would change the output.
    balanced_parts = [
        resample(
            df[df[label_col] == label],
            replace=False,
            n_samples=class_size,
            random_state=random_state,
        )
        for label in class_counts.index
    ]

    return pd.concat(balanced_parts).sample(frac=1, random_state=random_state)

In [9]:
# Balance the training pool: keep 9000 reviews for every star rating.
df_train = undersample(df_train_raw, class_size=9000)

In [10]:
# Confirm that every star rating is represented equally after undersampling.
df_train['stars'].value_counts()

4    9000
1    9000
2    9000
5    9000
3    9000
Name: stars, dtype: int64

In [11]:
# Sanity check: 5 star classes x 9000 rows each should give 45000 rows, with the 2 kept columns.
df_train.shape

(45000, 2)

In [12]:
# Sanity check: the held-out test split should have the 45000 rows requested via test_size.
df_test.shape

(45000, 2)

In [13]:
# Persist the three splits (balanced train, imbalanced test/val) without their index.
for split_path, split_frame in (
    ('../../data/processed/train.parquet', df_train),
    ('../../data/processed/test.parquet', df_test),
    ('../../data/processed/val.parquet', df_val),
):
    split_frame.to_parquet(split_path, index=False)

: 