## Data cleaning

In [46]:
import json
import gzip
import ast

# Locations of the gzipped, line-delimited JSON dumps (item metadata and reviews).
item_file_path = 'meta-Michigan.json.gz'
review_file_path = 'review-Michigan.json.gz'

# Metadata fields carried forward for every item; all other keys are discarded.
item_fields = ('gmap_id', 'name', 'description')

item_data = []

# Every line of the archive is parsed as its own JSON object.
with gzip.open(item_file_path, 'rt', encoding='utf-8') as item_file:
    for raw_line in item_file:
        record = json.loads(raw_line)
        item_data.append({field: record[field] for field in item_fields})

In [47]:
# Review fields carried forward; the order here is the column order of the later frame.
review_fields = ('user_id', 'rating', 'text', 'gmap_id', 'time')

review_data = []

# Every line of the archive is parsed as its own JSON object.
with gzip.open(review_file_path, 'rt', encoding='utf-8') as review_file:
    for raw_line in review_file:
        record = json.loads(raw_line)
        review_data.append({field: record[field] for field in review_fields})

In [48]:
import pandas as pd

# Turn the two lists of parsed records into DataFrames
# (one row per record, one column per extracted field).
item_df, review_df = map(pd.DataFrame, (item_data, review_data))

In [65]:
# Longest text (in whitespace-separated words) that is kept.
MAX_WORDS = 512
# Literal strings that stand in for a missing value and must be discarded.
PLACEHOLDER_TEXTS = ['None', 'none', 'Null']


def _with_valid_text(frame, column, max_words=MAX_WORDS):
    """Return the rows of `frame` whose `column` holds usable text.

    A row is kept when its text is non-null, is not one of PLACEHOLDER_TEXTS,
    and contains between 1 and `max_words` whitespace-separated words.
    The result carries an extra 'wc' column with that word count.
    """
    kept = frame[frame[column].notnull()].copy()
    kept['wc'] = kept[column].str.split().str.len()
    usable = kept['wc'].between(1, max_words) & ~kept[column].isin(PLACEHOLDER_TEXTS)
    return kept[usable]


# Finish ALL item filtering first, so that the review filter below only sees
# items that are really retained. Removing placeholder descriptions after the
# reviews were selected would leave reviews pointing at items that no longer exist.
item_df_clean = _with_valid_text(item_df, 'description')

# Keep only reviews of retained items, then apply the same text rules to them.
reviews_of_kept_items = review_df[review_df['gmap_id'].isin(item_df_clean['gmap_id'].unique())]
review_df_clean = _with_valid_text(reviews_of_kept_items, 'text')

## Data set preparation

In [66]:
# Minimum number of reviews every remaining user and every remaining item must have.
MIN_INTERACTIONS = 30

# Removing sparse users can push items below the threshold and vice versa,
# so the two filters are repeated until a full pass removes nothing.
review_df_filtered = review_df_clean.copy()
curr_len = len(review_df_filtered)
prev_len = float('inf')
while prev_len > curr_len:
    prev_len = curr_len
    for key in ('user_id', 'gmap_id'):
        # Vectorised group sizes: each row gets the number of rows sharing its key.
        # Rows whose key is missing get NaN and therefore fail the comparison.
        group_sizes = review_df_filtered.groupby(key)[key].transform('size')
        review_df_filtered = review_df_filtered[group_sizes >= MIN_INTERACTIONS]
    curr_len = len(review_df_filtered)

In [69]:
# Number of reviews left after the iterative filtering.
review_df_filtered.shape[0]

1185340

In [71]:
# Print the number of distinct user_id values, then of distinct gmap_id values.
for id_column in ('user_id', 'gmap_id'):
    print(len(review_df_filtered[id_column].unique()))

22642
11399


In [73]:
# Ids of the items that still have reviews after the iterative filtering.
surviving_item_ids = review_df_filtered['gmap_id'].unique()
has_reviews = item_df_clean['gmap_id'].isin(surviving_item_ids)

# Independent copies, so later edits do not touch the source frames.
item_30 = item_df_clean.loc[has_reviews].copy()
review_30 = review_df_filtered.copy()

In [74]:
from sklearn.model_selection import train_test_split
import pandas as pd


# Re-check for missing text and take explicit copies, so the id columns below
# are never assigned onto a boolean-mask slice of another frame.
review_30 = review_30[review_30['text'].notnull()].copy()
item_30 = item_30[item_30['description'].notnull()].copy()

# Contiguous integer ids, numbered in order of first appearance in the reviews.
user_id_encoder = {user_id: i for i, user_id in enumerate(review_30['user_id'].unique())}
item_id_encoder = {item_id: i for i, item_id in enumerate(review_30['gmap_id'].unique())}

# Keep only items that have an encoded id; mapping an unknown id would yield NaN
# and silently turn the id column into floats.
item_30 = item_30[item_30['gmap_id'].isin(list(item_id_encoder))]

review_30 = review_30.assign(
    user_id=review_30['user_id'].map(user_id_encoder),
    gmap_id=review_30['gmap_id'].map(item_id_encoder),
)
item_30 = item_30.assign(gmap_id=item_30['gmap_id'].map(item_id_encoder))

# Random 80/20 split of the reviews; the fixed seed makes it reproducible.
review_train, review_test = train_test_split(review_30, test_size=0.2, random_state=42)

# Persist the full frames and both splits. The index is written as the first
# CSV column, so read these back with index_col=0.
review_30.to_csv('review_30.csv')
item_30.to_csv('item_30.csv')
review_train.to_csv('review_train.csv')
review_test.to_csv('review_test.csv')

In [76]:
# Display the review frame after id encoding (the notebook shows a truncated view).
review_30

Unnamed: 0,user_id,rating,text,gmap_id,time,wc
161099,0,5.0,Classy ambience and good selection of beers. L...,0,1627784044398,11
161106,1,5.0,The food is very very good the waitress was ve...,0,1620615147957,37
161110,2,5.0,"The food was good, the waitress was attentive,...",0,1628993853261,12
161115,3,5.0,It was my second visit. Food is great along w...,0,1603584847324,44
161125,4,5.0,Great food and waitress and the best,0,1622469114335,7
...,...,...,...,...,...,...
20775587,10974,4.0,Not bad,11398,1538264914967,2
20775588,18878,3.0,Limited beer selection.,11398,1529011792581,3
20775590,13668,3.0,Good food,11398,1526441037139,2
20775599,17933,4.0,Always great steak!!,11398,1520603949234,3


In [78]:
# Round-trip check: reload review_30.csv, taking its first column as the index,
# and display the result for comparison with the in-memory frame.
test = pd.read_csv('review_30.csv', index_col=0)
test

Unnamed: 0,user_id,rating,text,gmap_id,time,wc
161099,0,5.0,Classy ambience and good selection of beers. L...,0,1627784044398,11
161106,1,5.0,The food is very very good the waitress was ve...,0,1620615147957,37
161110,2,5.0,"The food was good, the waitress was attentive,...",0,1628993853261,12
161115,3,5.0,It was my second visit. Food is great along w...,0,1603584847324,44
161125,4,5.0,Great food and waitress and the best,0,1622469114335,7
...,...,...,...,...,...,...
20775587,10974,4.0,Not bad,11398,1538264914967,2
20775588,18878,3.0,Limited beer selection.,11398,1529011792581,3
20775590,13668,3.0,Good food,11398,1526441037139,2
20775599,17933,4.0,Always great steak!!,11398,1520603949234,3


In [80]:
# Show any reviews whose text is exactly the string 'None'.
review_30.loc[review_30['text'].eq('None')]

Unnamed: 0,user_id,rating,text,gmap_id,time,wc
