In [1]:
import pandas as pd

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
spark = SparkSession.builder.getOrCreate()

In [18]:
DATA_FOLDER = 'data/'
RATINGS_KINDLE = 'ratings_Kindle_Store.csv'
REVIEWS_KINDLE = 'Kindle_Store_5.json'
RATINGS_BOOKS = 'ratings_Books.csv'
REVIEWS_BOOKS = 'Books_5.json'

RATINGS_SCHEMA = StructType([
    StructField("User", StringType(), True),
    StructField("Asin", IntegerType(), True),
    StructField("Ratings", FloatType(), True),
    StructField("Timestamp", IntegerType(), True)])

RATINGS_USER = 0
RATINGS_ASIN = 1
RATINGS_RATINGS = 2
RATINGS_TIMESTAMP = 3

In [3]:
df_ratings_kindle = spark.read.csv(DATA_FOLDER + RATINGS_KINDLE, header = False, schema = RATINGS_SCHEMA)
df_ratings_kindle.head(5)

[Row(User=u'A2GZ9GFZV1LWB0', Asin=1603420304, Ratings=4.0, Timestamp=1405209600),
 Row(User=u'A1K7VSUDCVAPW8', Asin=1603420304, Ratings=3.0, Timestamp=1282176000),
 Row(User=u'A35J5XRE5ZT6H2', Asin=1603420304, Ratings=4.0, Timestamp=1365206400),
 Row(User=u'A3DGZNFSMNWSX5', Asin=1603420304, Ratings=4.0, Timestamp=1285632000),
 Row(User=u'A2CVDQ6H36L4VL', Asin=1603420304, Ratings=5.0, Timestamp=1342396800)]

In [4]:
df_reviews_kindle = spark.read.json(DATA_FOLDER + REVIEWS_KINDLE)
df_reviews_kindle.head(1)

[Row(asin=u'B000F83SZQ', helpful=[0, 0], overall=5.0, reviewText=u"I enjoy vintage books and movies so I enjoyed reading this book.  The plot was unusual.  Don't think killing someone in self-defense but leaving the scene and the body without notifying the police or hitting someone in the jaw to knock them out would wash today.Still it was a good read for me.", reviewTime=u'05 5, 2014', reviewerID=u'A1F6404F1VG29J', reviewerName=u'Avidreader', summary=u'Nice vintage story', unixReviewTime=1399248000)]

In [5]:
df_ratings_books = spark.read.csv(DATA_FOLDER + RATINGS_BOOKS, header = False, schema = RATINGS_SCHEMA)
df_ratings_books.head(5)

[Row(User=u'AH2L9G3DQHHAJ', Asin=116, Ratings=4.0, Timestamp=1019865600),
 Row(User=u'A2IIIDRK3PRRZY', Asin=116, Ratings=1.0, Timestamp=1395619200),
 Row(User=u'A1TADCM7YWPQ8M', Asin=868, Ratings=4.0, Timestamp=1031702400),
 Row(User=u'AWGH7V0BDOJKB', Asin=13714, Ratings=4.0, Timestamp=1383177600),
 Row(User=u'A3UTQPQPM4TQO0', Asin=13714, Ratings=5.0, Timestamp=1374883200)]

In [6]:
df_reviews_books = spark.read.json(DATA_FOLDER + REVIEWS_BOOKS)
df_reviews_books.head(1)

[Row(asin=u'000100039X', helpful=[0, 0], overall=5.0, reviewText=u'Spiritually and mentally inspiring! A book that allows you to question your morals and will help you discover who you really are!', reviewTime=u'12 16, 2012', reviewerID=u'A10000012B7CGYKOMPQ4L', reviewerName=u'Adam', summary=u'Wonderful!', unixReviewTime=1355616000)]

## Basic statistics
Compute basic statistics on both datasets to verify if any difference exists between the two (very different means of the ratings would mean that one is generally prefered to the other, and very different variances would mean that some opinions on one of the support is not as unilateral as the other).

In [46]:
# Averages

# Books
books_ratings = df_ratings_books.select('Ratings').rdd.map(lambda x : x[0]).persist()
books_nb_sample = books_ratings.count()
books_average = books_ratings.mean()
books_variance = books_ratings.variance()
books_ratings.unpersist()

# Kindle
kindle_ratings = df_ratings_kindle.select('Ratings').rdd.map(lambda x : x[0]).persist()
kindle_nb_sample = kindle_ratings.count()
kindle_average = kindle_ratings.mean()
kindle_variance = kindle_ratings.variance()
kindle_ratings.unpersist()

# Print results
print("-- Books -- Count : {}, Mean : {}, Variance : {}".format(books_nb_sample, books_average, books_variance))
print("-- Kindle -- Count : {}, Mean : {}, Variance : {}".format(kindle_nb_sample, kindle_average, kindle_variance))

-- Books -- Count : 22507155, Mean : 4.29575892644, Variance : 1.23544712428
-- Kindle -- Count : 3205467, Mean : 4.23210689737, Variance : 1.28548661246


### Result
As expected, there is no differences in the basic statistics. The means are similar as well as the variances. This means that the two supports are appreciated the same way.