# 02 - Data Sampling

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[4]").appName("main").getOrCreate()
from pyspark.sql.functions import col

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

sns.set(style='whitegrid')
% matplotlib inline

In [2]:
data_path = os.path.join('..','..','data')
data_file = os.path.join(data_path, 'reviews.json')
# Read the reviews file itself rather than the whole data directory:
# pointing spark.read.json at a directory loads every file inside it, and this
# notebook later writes its CSV samples into data_path.
sdf = spark.read.json(data_file)
sdf.printSchema()

root
 |-- asin: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



I changed the sampling methodology a bit. It turns out we can't filter movies and users just once, because removing users changes the number of reviews per movie, and vice versa. So we have to filter iteratively until convergence. I initially wrote make_sample for Spark dataframes, but running it locally turned out to be quite slow, so I wrote an equivalent implementation for pandas, which was lightning fast! For data sampling I therefore did everything in pandas, but left the Spark code here anyway.

In [99]:
# spark version
def make_sample_sdf(in_df, in_prop, in_threshold=5, in_seed=1):
    """Sample a Spark DataFrame of reviews and filter it down to a dense core.

    A random sample (without replacement) of ``in_df`` is drawn, then items
    ('asin') and users ('reviewerID') with fewer than ``in_threshold`` reviews
    are removed repeatedly. Removing users changes the per-item counts and vice
    versa, so the filter is re-applied until BOTH the minimum reviews per item
    and the minimum reviews per user reach the threshold, or until no rows are
    left. Progress is printed at every iteration.

    Parameters
    ----------
    in_df : pyspark.sql.DataFrame
        Reviews; must contain the columns 'asin' and 'reviewerID'.
    in_prop : float
        Fraction of rows to sample, passed to ``DataFrame.sample``.
    in_threshold : int, optional
        Minimum number of reviews required per item and per user.
    in_seed : int, optional
        Seed for the random sample.

    Returns
    -------
    pyspark.sql.DataFrame
        The filtered sample, with the same columns (and order) as ``in_df``.
        The returned frame is cached.
    """
    # Cache the working sample: it is counted and grouped several times per
    # iteration, and each uncached action would replay the full lineage.
    sample_df = in_df.sample(False, in_prop, in_seed).cache()
    n_samples = sample_df.count()
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Number of initial samples: {}'.format(n_samples))

    n_iterations = 1
    converged = False
    while not converged and n_samples > 0:
        print('Iteration: {}'.format(n_iterations))

        sample_item_counts = sample_df.groupby('asin').count().cache()
        items_geq = sample_item_counts.filter(sample_item_counts['count'] >= in_threshold)
        print('\tNumber of items with at least {} reviews: {}'.format(in_threshold, items_geq.count()))
        min_reviews_per_movie = sample_item_counts.agg({'count': 'min'}).collect()[0][0]
        print('\tMin reviews per movie: {}'.format(min_reviews_per_movie))

        sample_user_counts = sample_df.groupby('reviewerID').count().cache()
        users_geq = sample_user_counts.filter(sample_user_counts['count'] >= in_threshold)
        print('\tNumber of users with at least {} reviews: {}'.format(in_threshold, users_geq.count()))
        min_reviews_per_user = sample_user_counts.agg({'count': 'min'}).collect()[0][0]
        print('\tMin reviews per user: {}'.format(min_reviews_per_user))

        # Converged only when BOTH minima meet the threshold; in that case the
        # filter below removes nothing.
        converged = (min_reviews_per_movie >= in_threshold
                     and min_reviews_per_user >= in_threshold)

        # Left-semi joins keep the rows whose key appears in the surviving set
        # without collecting the id lists to the driver. The final select
        # restores the original column order, which a join on a column name
        # would otherwise change.
        filtered_df = (sample_df
                       .join(items_geq.select('asin'), on='asin', how='left_semi')
                       .join(users_geq.select('reviewerID'), on='reviewerID', how='left_semi')
                       .select(*in_df.columns)
                       .cache())
        n_samples = filtered_df.count()
        print('\tNumber of final samples: {}'.format(n_samples))

        # Release the caches from this iteration; only the new sample is kept.
        sample_item_counts.unpersist()
        sample_user_counts.unpersist()
        sample_df.unpersist()
        sample_df = filtered_df

        n_iterations += 1

    # Density is measured on the sample that is actually returned; an empty
    # sample reports 0.0 instead of dividing by zero.
    n_items = sample_df.select('asin').distinct().count()
    n_users = sample_df.select('reviewerID').distinct().count()
    if n_items and n_users:
        density = n_samples / float(n_items * n_users)
    else:
        density = 0.0
    print('Density: {}'.format(density))
    return sample_df

# pandas version
def make_sample_pdf(in_df, in_prop, in_threshold=5, in_seed=1):
    """Sample a pandas DataFrame of reviews and filter it down to a dense core.

    A random sample of ``in_df`` is drawn, then items ('asin') and users
    ('reviewerID') with fewer than ``in_threshold`` reviews are removed
    repeatedly. Removing users changes the per-item counts and vice versa, so
    the filter is re-applied until BOTH the minimum reviews per item and the
    minimum reviews per user reach the threshold, or until no rows are left.
    Progress is printed at every iteration.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Reviews; must contain the columns 'asin' and 'reviewerID'.
    in_prop : float
        Fraction of rows to sample (``frac`` of ``DataFrame.sample``).
    in_threshold : int, optional
        Minimum number of reviews required per item and per user.
    in_seed : int, optional
        ``random_state`` used for the sample.

    Returns
    -------
    pandas.DataFrame
        The filtered sample (possibly empty), with the columns of ``in_df``.
    """
    sample_df = in_df.sample(frac=in_prop, random_state=in_seed)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Number of initial samples: {}'.format(sample_df.shape[0]))

    n_iterations = 1
    converged = False
    while not converged and len(sample_df) > 0:
        print('Iteration: {}'.format(n_iterations))

        # size() counts rows per group, so a review with a missing rating is
        # still counted and no particular value column is required.
        sample_item_counts = sample_df.groupby('asin').size()
        items_geq = sample_item_counts[sample_item_counts >= in_threshold]
        print('\tNumber of items with at least {} reviews: {}'.format(in_threshold, items_geq.shape[0]))
        min_reviews_per_movie = sample_item_counts.min()
        print('\tMin reviews per movie: {}'.format(min_reviews_per_movie))

        sample_user_counts = sample_df.groupby('reviewerID').size()
        users_geq = sample_user_counts[sample_user_counts >= in_threshold]
        print('\tNumber of users with at least {} reviews: {}'.format(in_threshold, users_geq.shape[0]))
        min_reviews_per_user = sample_user_counts.min()
        print('\tMin reviews per user: {}'.format(min_reviews_per_user))

        # Converged only when BOTH minima meet the threshold; in that case the
        # filter below removes nothing.
        converged = (min_reviews_per_movie >= in_threshold
                     and min_reviews_per_user >= in_threshold)

        keep_rows = (sample_df['asin'].isin(items_geq.index)
                     & sample_df['reviewerID'].isin(users_geq.index))
        sample_df = sample_df[keep_rows]
        print('\tNumber of final samples: {}'.format(sample_df.shape[0]))

        n_iterations += 1

    # Density is measured on the sample that is actually returned; an empty
    # sample reports 0.0 instead of dividing by zero.
    n_items = sample_df['asin'].nunique()
    n_users = sample_df['reviewerID'].nunique()
    if n_items and n_users:
        density = sample_df.shape[0] / float(n_items * n_users)
    else:
        density = 0.0
    print('Density: {}'.format(density))
    return sample_df

In [8]:
# Bring only the columns needed for sampling into a pandas DataFrame.
pdf = sdf.select('asin', 'reviewerID', 'overall', 'reviewTime').toPandas()

# Creating samples
The problem with using smaller samples of data while maintaining the density of the original data is simply that the original data is too sparse. This puts a lower bound on how small we can make our sampled data, so the solution was to make the original dataset denser (i.e. require that each movie has at least $k$ reviews and each user has made at least $k$ reviews, for $k \geq 5$). In the make_sample function, I call $k$ the threshold. Basically, if we first decide what proportion of the data we want to sample, we can then adjust the threshold to control the density of the sample. The parameters I chose and the resulting densities were:

|Dataset|% Sampled|$k$|Density|
|---|---|---|---|
|pdf_sample_100|100|20|0.008|
|pdf_sample_50|50|13|0.007|
|pdf_sample_25|25|9|0.007|
|pdf_sample_10|10|5|0.005|


In [100]:
# Full dataset, keeping items and users with at least 20 reviews.
pdf_sample_100 = make_sample_pdf(pdf, in_prop=1.0, in_threshold=20)

Number of initial samples: 1697533
Iteration: 1
	Number of items with at least 20 reviews: 17798
	Min reviews per movie: 5
	Number of users with at least 20 reviews: 14176
	Min reviews per user: 5
	Number of final samples: 673342
Iteration: 2
	Number of items with at least 20 reviews: 9477
	Min reviews per movie: 1
	Number of users with at least 20 reviews: 10909
	Min reviews per user: 1
	Number of final samples: 533762
Iteration: 3
	Number of items with at least 20 reviews: 8624
	Min reviews per movie: 8
	Number of users with at least 20 reviews: 8515
	Min reviews per user: 1
	Number of final samples: 482382
Iteration: 4
	Number of items with at least 20 reviews: 7777
	Min reviews per movie: 10
	Number of users with at least 20 reviews: 8173
	Min reviews per user: 14
	Number of final samples: 461668
Iteration: 5
	Number of items with at least 20 reviews: 7637
	Min reviews per movie: 15
	Number of users with at least 20 reviews: 7738
	Min reviews per user: 5
	Number of final samples: 4

In [101]:
# 50% sample, keeping items and users with at least 13 reviews.
pdf_sample_50 = make_sample_pdf(pdf, in_prop=0.5, in_threshold=13)

Number of initial samples: 848767
Iteration: 1
	Number of items with at least 13 reviews: 14416
	Min reviews per movie: 1
	Number of users with at least 13 reviews: 10344
	Min reviews per user: 1
	Number of final samples: 289873
Iteration: 2
	Number of items with at least 13 reviews: 6870
	Min reviews per movie: 1
	Number of users with at least 13 reviews: 7396
	Min reviews per user: 1
	Number of final samples: 215070
Iteration: 3
	Number of items with at least 13 reviews: 6107
	Min reviews per movie: 5
	Number of users with at least 13 reviews: 5495
	Min reviews per user: 1
	Number of final samples: 189091
Iteration: 4
	Number of items with at least 13 reviews: 5441
	Min reviews per movie: 6
	Number of users with at least 13 reviews: 5206
	Min reviews per user: 9
	Number of final samples: 178625
Iteration: 5
	Number of items with at least 13 reviews: 5342
	Min reviews per movie: 9
	Number of users with at least 13 reviews: 4917
	Min reviews per user: 6
	Number of final samples: 174154

In [102]:
# 25% sample, keeping items and users with at least 9 reviews.
pdf_sample_25 = make_sample_pdf(pdf, in_prop=0.25, in_threshold=9)

Number of initial samples: 424383
Iteration: 1
	Number of items with at least 9 reviews: 11096
	Min reviews per movie: 1
	Number of users with at least 9 reviews: 7005
	Min reviews per user: 1
	Number of final samples: 119752
Iteration: 2
	Number of items with at least 9 reviews: 4589
	Min reviews per movie: 1
	Number of users with at least 9 reviews: 4604
	Min reviews per user: 1
	Number of final samples: 81005
Iteration: 3
	Number of items with at least 9 reviews: 3952
	Min reviews per movie: 3
	Number of users with at least 9 reviews: 3194
	Min reviews per user: 1
	Number of final samples: 67979
Iteration: 4
	Number of items with at least 9 reviews: 3416
	Min reviews per movie: 3
	Number of users with at least 9 reviews: 2974
	Min reviews per user: 5
	Number of final samples: 62490
Iteration: 5
	Number of items with at least 9 reviews: 3325
	Min reviews per movie: 6
	Number of users with at least 9 reviews: 2721
	Min reviews per user: 4
	Number of final samples: 59862
Iteration: 6
	

In [103]:
# 10% sample, keeping items and users with at least 5 reviews.
pdf_sample_10 = make_sample_pdf(pdf, in_prop=0.10, in_threshold=5)

Number of initial samples: 169753
Iteration: 1
	Number of items with at least 5 reviews: 8830
	Min reviews per movie: 1
	Number of users with at least 5 reviews: 5379
	Min reviews per user: 1
	Number of final samples: 42870
Iteration: 2
	Number of items with at least 5 reviews: 3288
	Min reviews per movie: 1
	Number of users with at least 5 reviews: 3289
	Min reviews per user: 1
	Number of final samples: 26855
Iteration: 3
	Number of items with at least 5 reviews: 2791
	Min reviews per movie: 1
	Number of users with at least 5 reviews: 2119
	Min reviews per user: 1
	Number of final samples: 21653
Iteration: 4
	Number of items with at least 5 reviews: 2286
	Min reviews per movie: 1
	Number of users with at least 5 reviews: 1952
	Min reviews per user: 2
	Number of final samples: 19249
Iteration: 5
	Number of items with at least 5 reviews: 2194
	Min reviews per movie: 3
	Number of users with at least 5 reviews: 1753
	Min reviews per user: 2
	Number of final samples: 18156
Iteration: 6
	Nu

Finally, all samples are saved as CSV files so that they can be easily converted back to spark dataframes in the next notebook.

In [104]:
# Write every sample to its own CSV file inside data_path.
for csv_name, sample_frame in [
    ('reviews_sample_10.csv', pdf_sample_10),
    ('reviews_sample_25.csv', pdf_sample_25),
    ('reviews_sample_50.csv', pdf_sample_50),
    ('reviews_sample_100.csv', pdf_sample_100),
]:
    sample_frame.to_csv(os.path.join(data_path, csv_name))