In [1]:
import pandas as pd

In [73]:
# Base data directory, given relative to the notebook's working directory;
# the cells below append 'processed/' to it when reading and writing files.
PATH = '../../data/'

In [77]:
# Load the processed review table and parse the 'date' column into datetimes
# so that it can be sorted and compared chronologically later on.
# NOTE(review): the head() output shows an 'Unnamed: 0' column, which looks
# like a saved index from an earlier to_csv call; it is kept as-is here and
# therefore ends up in the split files written below -- confirm this is wanted.
reviews = (
    pd.read_csv(PATH + 'processed/' + 'sg_coffee_reviews_final.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
reviews.head()

Unnamed: 0,shop,userid,userinfo,rating,date,photos,username,language,text
0,nylon-coffee-roasters-singapore,AbfS_oXF8H6HJb5jFqhrLw,{'link': '/user_details?userid=AbfS_oXF8H6HJb5...,5.0,2016-04-11,[],Paco G.,en,"A comfortable, no-frills coffee shop right in ..."
1,nylon-coffee-roasters-singapore,ymiohz-DlNLHGAlXbVhw5A,{'link': '/user_details?userid=ymiohz-DlNLHGAl...,4.0,2019-11-11,[{'src': 'https://s3-media0.fl.yelpcdn.com/bph...,Sudipto G.,en,A hard to find coffee place tucked away in the...
2,nylon-coffee-roasters-singapore,-g3XIcCb2b-BD0QBCcq2Sw,{'link': '/user_details?userid=-g3XIcCb2b-BD0Q...,4.0,2019-12-13,[{'src': 'https://s3-media0.fl.yelpcdn.com/bph...,Lisa I.,en,Great little coffee shop that's in the middle ...
3,nylon-coffee-roasters-singapore,gfyW0vuJls8ARoHmogfcmw,{'link': '/user_details?userid=gfyW0vuJls8ARoH...,4.0,2020-02-06,[],Marilyn T.,en,This place came highly recommended by a friend...
4,nylon-coffee-roasters-singapore,jkr3YVCEoRNKkxinUZpW6A,{'link': '/user_details?userid=jkr3YVCEoRNKkxi...,3.0,2015-02-12,[],Joanne G.,en,My great auntie has lived at Everton Park for ...


In [79]:
# Drop duplicates: when a user reviewed the same shop more than once, keep
# only the most recent review. Sorting by date first makes keep='last' select
# the latest row of each (shop, userid) pair.
reviews = (
    reviews
    .sort_values(by=['userid', 'date'])
    .drop_duplicates(subset=['shop', 'userid'], keep='last')
)

In [81]:
# Minimum number of reviews a user must have to be kept in the dataset
# (used by the filtering cell that follows).
MIN_REVIEWS = 3

In [82]:
# Count the (non-null) ratings of each user, broadcast back onto every row,
# then keep only the rows of users that reach MIN_REVIEWS.
num_reviews = reviews.groupby('userid')['rating'].transform('count')
mask = num_reviews.ge(MIN_REVIEWS)
reviews = reviews[mask]
reviews.head()

Unnamed: 0,shop,userid,userinfo,rating,date,photos,username,language,text
5245,liho-tea-singapore-117,-5YMIME_WEin_by41Bj-3Q,{'link': '/user_details?userid=-5YMIME_WEin_by...,3.0,2021-08-01,[],Jian Hao T.,en,The truffle series drink to me is the worst. L...
1622,old-hen-coffee-bar-singapore-2,-5YMIME_WEin_by41Bj-3Q,{'link': '/user_details?userid=-5YMIME_WEin_by...,4.0,2021-08-05,[],Jian Hao T.,en,Heard a lot about this cafe/bar so decided one...
537,two-men-bagel-house-singapore,-5YMIME_WEin_by41Bj-3Q,{'link': '/user_details?userid=-5YMIME_WEin_by...,4.0,2021-08-07,[],Jian Hao T.,en,"Keeping this review short. Taste is good, but ..."
5554,old-airport-road-food-centre-singapore,-5YMIME_WEin_by41Bj-3Q,{'link': '/user_details?userid=-5YMIME_WEin_by...,5.0,2021-08-07,[],Jian Hao T.,en,Hello Foodies!\n\nThis is a place you must vis...
2824,dutch-colony-coffee-singapore,-5YMIME_WEin_by41Bj-3Q,{'link': '/user_details?userid=-5YMIME_WEin_by...,4.0,2021-08-09,[{'src': 'https://s3-media0.fl.yelpcdn.com/bph...,Jian Hao T.,en,Ordered take away.\n\nCoffee -9/10\nCroissant ...


In [83]:
# Summary statistics of the filtered user-shop rating data.
n_users = reviews['userid'].nunique(dropna=False)
n_shops = reviews['shop'].nunique(dropna=False)
n_ratings = len(reviews)

print('Number of unique users: {}'.format(n_users))
print('Number of unique shops: {}'.format(n_shops))
print('Number of ratings: {}'.format(n_ratings))

# Sparsity: one minus the ratio of ratings to all possible user-shop pairs.
sparsity = 1 - n_ratings / (n_shops * n_users)
print('Sparsity of dataset: {}'.format(sparsity))

Number of unique users: 497
Number of unique shops: 856
Number of ratings: 3816
Sparsity of dataset: 0.9910302939130108


### Leave One Out

In [84]:
# Leave-one-out split per user, by recency: each user's latest review goes to
# test, the second latest to val, and every older review to train.
reviews = reviews.sort_values(['userid', 'date'], ascending=[True, True])

# Position of each review counted from the end of its user's history
# (0 = most recent). Deriving all three splits from this single ranking keeps
# them mutually exclusive; chaining tail(2).head(1) would place a user's only
# review in both val and test if a user ever had just one review.
recency_rank = reviews.groupby(['userid']).cumcount(ascending=False)

val = reviews[recency_rank == 1].reset_index(drop=True)
test = reviews[recency_rank == 0].reset_index(drop=True)
train = reviews[recency_rank > 1].reset_index(drop=True)

# The three splits must partition the data: no row lost, none duplicated.
assert len(train) + len(val) + len(test) == len(reviews), 'splits do not partition reviews'

In [85]:
# Report the number of rows in each leave-one-out split.
loo_size_report = (
    ('Number of ratings in train set: {}', train),
    ('Number of ratings in val set: {}', val),
    ('Number of ratings in test set: {}', test),
)
for message_template, split_frame in loo_size_report:
    print(message_template.format(len(split_frame)))

Number of ratings in train set: 2822
Number of ratings in val set: 497
Number of ratings in test set: 497


In [86]:
# Persist the leave-one-out splits next to the processed input data,
# without writing the DataFrame index.
for file_name, split_frame in (
    ('train_lol.csv', train),
    ('val_lol.csv', val),
    ('test_lol.csv', test),
):
    split_frame.to_csv(PATH + 'processed/' + file_name, index=False)

### Temporal Global Split

In [94]:
# Preview the size of a one-tenth slice of the reviews; the same expression
# is used as test_len in the temporal split below.
round(len(reviews) / 10)

382

In [112]:
# Temporal global split: order all reviews chronologically, then take the
# most recent tenth as test, the tenth before it as val, and the rest as train.
reviews = reviews.sort_values(['date'], ascending=True)
test_len = round(len(reviews) / 10)

# Use explicit positional boundaries instead of negative slices: when
# test_len is 0, reviews[-0:] would return the whole frame and reviews[:-0]
# an empty one, silently producing a wrong split.
val_start = len(reviews) - 2 * test_len
test_start = len(reviews) - test_len

train = reviews.iloc[:val_start]
val = reviews.iloc[val_start:test_start]
test = reviews.iloc[test_start:]

In [113]:
# Report the number of rows in each temporal split.
tgs_size_report = (
    ('Number of ratings in train set: {}', train),
    ('Number of ratings in val set: {}', val),
    ('Number of ratings in test set: {}', test),
)
for message_template, split_frame in tgs_size_report:
    print(message_template.format(len(split_frame)))

Number of ratings in train set: 3052
Number of ratings in val set: 382
Number of ratings in test set: 382


In [130]:
def check_dates_min_max(train, val, test):
    """Assert that the three splits are strictly ordered in time.

    The latest date in ``train`` must be earlier than the earliest date in
    ``val``, and the latest date in ``val`` earlier than the earliest date
    in ``test``. A confirmation message is printed when both checks hold.

    Args:
        train: Object exposing a ``date`` attribute that supports
            ``min()`` and ``max()`` (e.g. a DataFrame with a 'date' column).
        val: Same kind of object, expected to follow ``train`` in time.
        test: Same kind of object, expected to follow ``val`` in time.

    Raises:
        AssertionError: If two adjacent splits overlap or touch in time.
    """
    # Compare each split with the one that should come right after it.
    for earlier, later in ((train, val), (val, test)):
        assert earlier.date.max() < later.date.min()

    print('Min Max Date Test Passed')

# Verify that the temporal splits are strictly ordered: train before val before test.
check_dates_min_max(train, val, test)

In [132]:
# Persist the temporal-global-split frames next to the processed input data,
# without writing the DataFrame index.
for file_name, split_frame in (
    ('train_tgs.csv', train),
    ('val_tgs.csv', val),
    ('test_tgs.csv', test),
):
    split_frame.to_csv(PATH + 'processed/' + file_name, index=False)