# 0
I've got a few interesting datasets, to do well in the coursework I want to implement some pretty complicated techniques so I'm going to need some pretty complicated datasets.

Requirements:
- Density
- Many reviews
- Feature data
- Contextual data
- Implicit data


In [1]:
import pandas as pd
from collections import Counter

In [4]:
def make_df_from_count(serie,name):
    counts = dict(Counter(serie))
    return pd.DataFrame.from_dict(counts,orient='index').reset_index().rename(columns={'index':name,0:f'{name}_count'})

## 1.1 Beer reviews
Pros
+ Feature data
+ Many reviews

Mediums
* Temporal context data

Cons
- Density
- Implicit data

In [3]:
beer_reviews = pd.read_csv('../data-beer-reviews/beer_reviews.csv')
# beer_reviews.head()

In [19]:
print(f"number of reviews: {beer_reviews.shape}")
print(f"columns: {list(beer_reviews.columns)}")
print(f"unique items: {len(pd.unique(beer_reviews.beer_name))}")
print(f"unique reviewers: {len(pd.unique(beer_reviews.review_profilename))}")
print(f"Sparsity: {round(100*beer_reviews.shape[0] / (len(pd.unique(beer_reviews.beer_name)) * len(pd.unique(beer_reviews.review_profilename))), 3)}%")
n = 2
b_df = make_df_from_count(beer_reviews.beer_name,'beer_name')
print(f"items with fewer than {n} reviews: {round(100*len(b_df[b_df['beer_name_count'] < n])/len(b_df), 2)}%")
u_df = make_df_from_count(beer_reviews.review_profilename, 'review_profilename')
print(f"users with fewer than {n} reviews: {round(100*len(u_df[u_df['review_profilename_count'] < n])/len(u_df), 2)}%")

number of reviews: (1586614, 13)
columns: ['brewery_id', 'brewery_name', 'review_time', 'review_overall', 'review_aroma', 'review_appearance', 'review_profilename', 'beer_style', 'review_palate', 'review_taste', 'beer_name', 'beer_abv', 'beer_beerid']
unique items: 56857
unique reviewers: 33388
Sparsity: 0.084%
beers with fewer than 2 reviews: 33.26%
users with fewer than 2 reviews: 31.28%


In [29]:
for column in list(beer_reviews.columns):
    print(f"{column} completeness: {round(100*(1-(beer_reviews[column].isna().sum()/beer_reviews.shape[0])),3)}%")

brewery_id completeness: 100.0%
brewery_name completeness: 99.999%
review_time completeness: 100.0%
review_overall completeness: 100.0%
review_aroma completeness: 100.0%
review_appearance completeness: 100.0%
review_profilename completeness: 99.978%
beer_style completeness: 100.0%
review_palate completeness: 100.0%
review_taste completeness: 100.0%
beer_name completeness: 100.0%
beer_abv completeness: 95.728%
beer_beerid completeness: 100.0%


## 1.2 Beer, Liquor and Wine reviews
Pros
+ Density
+ Contextual data (time, location)
+ Implicit data

Mediums
* Feature data

Cons
- Number of reviews

In [20]:
b_l_w1 = pd.read_csv('../data-beer-liquor-wine/wine reviews.csv')
b_l_w2 = pd.read_csv('../data-beer-liquor-wine/447_1.csv').drop(columns=['primaryCategories', 'quantities'])
blw = pd.concat([b_l_w1, b_l_w2])

blw["descriptions"] = blw["descriptions"].apply(lambda s: 'Carmex' if 'Carmex' in str(s) else s)
blw = blw[blw['descriptions'] != 'Carmex']

# blw.head()

In [32]:
print(f"columns: {list(blw.columns)}")
print(f"number of reviews: {blw.shape}")
print(f"unique items: {len(pd.unique(blw.name))}")
print(f"unique reviewers: {len(pd.unique(blw['reviews.username']))}")
print(f"Sparsity: {round(100*blw.shape[0] / (len(pd.unique(blw.name)) * len(pd.unique(blw['reviews.username']))), 3)}%")
n = 2
b_df = make_df_from_count(blw.name,'name')
print(f"items with fewer than {n} reviews: {round(100*len(b_df[b_df['name_count'] < n])/len(b_df), 2)}%")
u_df = make_df_from_count(blw['reviews.username'], 'reviews.username')
print(f"users with fewer than {n} reviews: {round(100*len(u_df[u_df['reviews.username_count'] < n])/len(u_df), 2)}%\n")
for column in list(blw.columns):
    print(f"{column} completeness: {round(100*(1-(blw[column].isna().sum()/blw.shape[0])),3)}%")

columns: ['id', 'asins', 'brand', 'categories', 'dateAdded', 'dateUpdated', 'descriptions', 'dimension', 'ean', 'flavors', 'keys', 'manufacturer', 'manufacturerNumber', 'name', 'reviews.date', 'reviews.dateAdded', 'reviews.dateSeen', 'reviews.didPurchase', 'reviews.doRecommend', 'reviews.id', 'reviews.numHelpful', 'reviews.rating', 'reviews.sourceURLs', 'reviews.text', 'reviews.title', 'reviews.userCity', 'reviews.userProvince', 'reviews.username', 'sizes', 'sourceURLs', 'upc', 'weight']
number of reviews: (3076, 32)
unique items: 358
unique reviewers: 1466
Sparsity: 0.586%
beers with fewer than 2 reviews: 8.94%
users with fewer than 2 reviews: 26.13%

id completeness: 100.0%
asins completeness: 43.27%
brand completeness: 97.887%
categories completeness: 100.0%
dateAdded completeness: 100.0%
dateUpdated completeness: 100.0%
descriptions completeness: 92.425%
dimension completeness: 44.278%
ean completeness: 56.242%
flavors completeness: 8.81%
keys completeness: 100.0%
manufacturer comp

## 1.3 Beers and Breweries reviews
Pros
+ Many reviews

Mediums
* Temporal context data

Cons
- Feature data
- Implicit data
- Density

In [34]:
beers = pd.read_csv('../data-beers-breweries-reviews/beers.csv')
breweries = pd.read_csv('../data-beers-breweries-reviews/breweries.csv')
reviews = pd.read_csv('../data-beers-breweries-reviews/reviews.csv')
# reviews.head()

In [37]:
print(f"columns: {list(reviews.columns)}")
print(f"number of reviews: {reviews.shape}")
print(f"unique items: {len(pd.unique(reviews.beer_id))}")
print(f"unique reviewers: {len(pd.unique(reviews.username))}")
print(f"Sparsity: {round(100*reviews.shape[0] / (len(pd.unique(reviews.beer_id)) * len(pd.unique(reviews.username))), 3)}%")
n = 2
b_df = make_df_from_count(reviews.beer_id,'beer_id')
print(f"items with fewer than {n} reviews: {round(100*len(b_df[b_df['beer_id_count'] < n])/len(b_df), 2)}%")
u_df = make_df_from_count(reviews['username'], 'username')
print(f"users with fewer than {n} reviews: {round(100*len(u_df[u_df['username_count'] < n])/len(u_df), 2)}%\n")

for column in list(reviews.columns):
    print(f"{column} completeness: {round(100*(1-(reviews[column].isna().sum()/reviews.shape[0])),3)}%")

columns: ['beer_id', 'username', 'date', 'text', 'look', 'smell', 'taste', 'feel', 'overall', 'score']
number of reviews: (9073128, 10)
unique items: 309542
unique reviewers: 164935
Sparsity: 0.018%
beers with fewer than 2 reviews: 33.23%
users with fewer than 2 reviews: 31.64%

beer_id completeness: 100.0%
username completeness: 99.958%
date completeness: 100.0%
text completeness: 100.0%
look completeness: 58.228%
smell completeness: 58.228%
taste completeness: 58.228%
feel completeness: 58.228%
overall completeness: 58.228%
score completeness: 100.0%


## 1.4 Recipes reviews
Has had some preprocessing done already, PP_user stores details of which techniques a user has tried, which recipes they've tried and the ratings they gave.

Pros
+ Many reviews
+ Feature data

Mediums
* Temporal context data

Cons
- Implicit data
- Density

In [38]:
recipes = pd.read_csv('../data-recipes/RAW_recipes.csv')
recipe_reviews = pd.read_csv('../data-recipes/RAW_interactions.csv')
# pp_recipes = pd.read_csv('../data-recipes/PP_recipes.csv')
# pp_users = pd.read_csv('../data-recipes/PP_users.csv')

In [39]:
# recipes.head()
# recipe_reviews.head()

In [42]:
print(f"columns: {list(recipe_reviews.columns)}")
print(f"number of reviews: {recipe_reviews.shape}")
print(f"unique items: {len(pd.unique(recipe_reviews.recipe_id))}")
print(f"unique reviewers: {len(pd.unique(recipe_reviews.user_id))}")
print(f"Sparsity: {round(100*recipe_reviews.shape[0] / (len(pd.unique(recipe_reviews.recipe_id)) * len(pd.unique(recipe_reviews.user_id))), 3)}%")
n = 2
b_df = make_df_from_count(recipe_reviews.recipe_id,'recipe_id')
print(f"items with fewer than {n} reviews: {round(100*len(b_df[b_df['recipe_id_count'] < n])/len(b_df), 2)}%")
u_df = make_df_from_count(recipe_reviews['user_id'], 'user_id')
print(f"users with fewer than {n} reviews: {round(100*len(u_df[u_df['user_id_count'] < n])/len(u_df), 2)}%\n")

for column in list(recipes.columns):
    print(f"{column} completeness: {round(100*(1-(recipes[column].isna().sum()/recipes.shape[0])),3)}%")

columns: ['user_id', 'recipe_id', 'date', 'rating', 'review']
number of reviews: (1132367, 5)
unique items: 231637
unique reviewers: 226570
Sparsity: 0.002%
beers with fewer than 2 reviews: 39.7%
users with fewer than 2 reviews: 73.38%

name completeness: 100.0%
id completeness: 100.0%
minutes completeness: 100.0%
contributor_id completeness: 100.0%
submitted completeness: 100.0%
tags completeness: 100.0%
nutrition completeness: 100.0%
n_steps completeness: 100.0%
steps completeness: 100.0%
description completeness: 97.851%
ingredients completeness: 100.0%
n_ingredients completeness: 100.0%


## 1.5 Data wine reviews
Pros
+ Density
+ Many reviews
+ Only 20 users
+ Feature data

Cons
- Contextual data
- Implicit data

In [46]:
wine130 = pd.read_csv('../data-wine-reviews/winemag-data-130k-v2.csv')

In [52]:
# wine130.head()
# wine130[wine130['taster_twitter_handle'] == '@kerinokeefe']
# wine130[wine130['title'] == 'COS 2013 Frappato (Sicilia)']

In [53]:
print(f"columns: {list(wine130.columns)}")
print(f"number of reviews: {wine130.shape}")
print(f"unique items: {len(pd.unique(wine130.title))}")
print(f"unique reviewers: {len(pd.unique(wine130.taster_name))}")
print(f"Sparsity: {round(100*wine130.shape[0] / (len(pd.unique(wine130.title)) * len(pd.unique(wine130.taster_name))),3)}%")
n = 2
b_df = make_df_from_count(wine130.title,'title')
print(f"items with fewer than {n} reviews: {round(100*len(b_df[b_df['title_count'] < n])/len(b_df), 2)}%")
u_df = make_df_from_count(wine130['taster_name'], 'taster_name')
print(f"users with fewer than {n} reviews: {round(100*len(u_df[u_df['taster_name_count'] < n])/len(u_df), 2)}%\n")

for column in list(wine130.columns):
    print(f"{column} completeness: {round(100*(1-(wine130[column].isna().sum()/wine130.shape[0])),3)}%")

columns: ['Unnamed: 0', 'country', 'description', 'designation', 'points', 'price', 'province', 'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title', 'variety', 'winery']
number of reviews: (129971, 14)
unique items: 118840
unique reviewers: 20
Sparsity: 5.468%
items with fewer than 2 reviews: 90.97%
users with fewer than 2 reviews: 0.0%

Unnamed: 0 completeness: 100.0%
country completeness: 99.952%
description completeness: 100.0%
designation completeness: 71.174%
points completeness: 100.0%
price completeness: 93.078%
province completeness: 99.952%
region_1 completeness: 83.653%
region_2 completeness: 38.863%
taster_name completeness: 79.808%
taster_twitter_handle completeness: 75.985%
title completeness: 100.0%
variety completeness: 99.999%
winery completeness: 100.0%


## 1.6 Yelp reviews
Because there are so many i've used ```$ head -1500000 yelp_academic_dataset_review.json > review.json``` to cut down the memory requirements. I then run make_valid to tidy this up.

Pros
+ Many reviews
+ Feature data

Mediums
* Context data (temporal)

Cons
- Implicit data
- A bit sparse


In [16]:
# yelp_business = pd.read_json('../data-yelp/yelp_dataset/yelp_academic_dataset_business.json')
# yelp_user = pd.read_json('data-yelp/yelp_dataset/yelp_academic_dataset_user.json')
# yelp_reviews = pd.read_json('data-yelp/yelp_dataset/yelp_academic_dataset_review.json')

In [9]:
def make_valid(json_file):
    with open(json_file) as f:
        strs = f.readlines()
    strs = [s[:-1]+",\n" for s in strs]
    strs[-1] = strs[-1][:-2]
    with open(json_file, 'w') as f:
        f.write('[\n')
        f.writelines(strs)
        f.write('\n]')

# make_valid('../data-yelp/yelp_dataset/review.json')
# make_valid('../data-yelp/yelp_dataset/business.json')

In [12]:
yelp_reviews = pd.read_json('../data-yelp/yelp_dataset/review.json')
yelp_business = pd.read_json('../data-yelp/yelp_dataset/business.json')

In [13]:
yelp_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,lWC-xP3rd6obsecCYsGZRg,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,4,3,1,1,Apparently Prides Osteria had a rough summer a...,2014-10-11 03:34:02
1,8bFej1QE5LXp4O05qjGqXA,YoVfDbnISlW0f7abNQACIg,RA4V8pr014UyUbDvI-LW2A,4,1,0,0,This store is pretty good. Not as great as Wal...,2015-07-03 20:38:25
2,NDhkzczKjLshODbqDoNLSg,eC5evKn1TWDyHCyQAwguUw,_sS2LBIGNT5NQb6PD1Vtjw,5,0,0,0,I called WVM on the recommendation of a couple...,2013-05-28 20:38:06
3,T5fAqjjFooT4V0OeZyuk1w,SFQ1jcnGguO0LYWnbbftAA,0AzLzHfOJgL7ROwhdww2ew,2,1,1,1,I've stayed at many Marriott and Renaissance M...,2010-01-08 02:29:15
4,sjm_uUcQVxab_EeLCqsYLg,0kA0PAJ8QFMeveQWHFqz2A,8zehGz9jnxPqXtOc7KaJxA,4,0,0,0,The food is always great here. The service fro...,2011-07-28 18:05:01


In [14]:
print(f"columns: {list(yelp_reviews.columns)}")
print(f"number of reviews: {yelp_reviews.shape[0]}")
print(f"unique items: {len(pd.unique(yelp_reviews.business_id))}")
print(f"unique reviewers: {len(pd.unique(yelp_reviews.user_id))}")
print(f"Sparsity: {round(100*yelp_reviews.shape[0] / (len(pd.unique(yelp_reviews.business_id)) * len(pd.unique(yelp_reviews.user_id))),3)}%")
n = 2
b_df = make_df_from_count(yelp_reviews.business_id,'business_id')
print(f"items with fewer than {n} reviews: {round(100*len(b_df[b_df['business_id_count'] < n])/len(b_df), 2)}%")
u_df = make_df_from_count(yelp_reviews['user_id'], 'user_id')
print(f"users with fewer than {n} reviews: {round(100*len(u_df[u_df['user_id_count'] < n])/len(u_df), 2)}%\n")

for column in list(yelp_business.columns):
    print(f"{column} completeness: {round(100*(1-(yelp_business[column].isna().sum()/yelp_business.shape[0])),3)}%")

columns: ['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date']
number of reviews: 1000000
unique items: 28085
unique reviewers: 551746
Sparsity: 0.006%
items with fewer than 2 reviews: 9.13%
users with fewer than 2 reviews: 71.97%

business_id completeness: 100.0%
name completeness: 100.0%
address completeness: 100.0%
city completeness: 100.0%
state completeness: 100.0%
postal_code completeness: 100.0%
latitude completeness: 100.0%
longitude completeness: 100.0%
stars completeness: 100.0%
review_count completeness: 100.0%
is_open completeness: 100.0%
attributes completeness: 90.664%
categories completeness: 99.928%
hours completeness: 82.974%
