In [1]:
import os
import sys
spark_path = os.environ['SPARK_HOME']
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.10.9-src.zip")

import findspark
findspark.init()
import pyspark

In [2]:
number_cores = 6
memory_gb = 16
conf = (pyspark.SparkConf().setMaster('local[{}]'.format(number_cores)).set('spark.driver.memory', '{}g'.format(memory_gb)))
sc = pyspark.SparkContext(conf=conf)

In [3]:
ls -lh /users/trush/CSC496/DataParallelComputing/data/ml-latest

total 1.2G
-rw-r--r-- 1 trush PDC-edu-Lab 396M Sep 20 08:05 genome-scores.csv
-rw-r--r-- 1 trush PDC-edu-Lab  18K Sep 20 08:03 genome-tags.csv
-rw-r--r-- 1 trush PDC-edu-Lab 1.3M Sep 20 08:03 links.csv
-rw-r--r-- 1 trush PDC-edu-Lab 2.8M Sep 20 08:03 movies.csv
-rw-r--r-- 1 trush PDC-edu-Lab 725M Sep 20 08:06 ratings.csv
-rw-r--r-- 1 trush PDC-edu-Lab 9.6K Sep 20 08:03 README.txt
-rw-r--r-- 1 trush PDC-edu-Lab  38M Sep 20 08:03 tags.csv


In [4]:
!ls

data  spark-2.ipynb


In [5]:
ratings = sc.textFile("/users/trush/CSC496/DataParallelComputing/data/ml-latest/ratings.csv")

In [6]:
ratings.take(5)

['userId,movieId,rating,timestamp',
 '1,307,3.5,1256677221',
 '1,481,3.5,1256677456',
 '1,1091,1.5,1256677471',
 '1,1257,4.5,1256677460']

In [7]:
ratings.cache()

/users/trush/CSC496/DataParallelComputing/data/ml-latest/ratings.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [8]:
%%time
ratings.count()

CPU times: user 32.3 ms, sys: 3.95 ms, total: 36.3 ms
Wall time: 14.2 s


27753445

In [9]:
%%time
ratings.count()

CPU times: user 17.6 ms, sys: 753 µs, total: 18.3 ms
Wall time: 8.14 s


27753445

In [10]:
%%time
ratings.count()

CPU times: user 10.4 ms, sys: 7.91 ms, total: 18.3 ms
Wall time: 8.07 s


27753445

# Average ratings over the years of every movie

In [11]:
ratingHeader = ratings.first()
print(ratingHeader)

userId,movieId,rating,timestamp


In [12]:
ratingsOnly = ratings.filter(lambda x : x != ratingHeader)
ratingsOnly.take(5)

['1,307,3.5,1256677221',
 '1,481,3.5,1256677456',
 '1,1091,1.5,1256677471',
 '1,1257,4.5,1256677460',
 '1,1449,4.5,1256677264']

In [13]:
s = '1,307,3.5,1256677221'
s.split(",")[2]

'3.5'

In [14]:
movieRatings = ratingsOnly.map(lambda line : (line.split(",")[1], float(line.split(",")[2])))
movieRatings.take(5)

[('307', 3.5), ('481', 3.5), ('1091', 1.5), ('1257', 4.5), ('1449', 4.5)]

## Possible approaches in aggregating data
 - groupByKey and mapValues
 - reduceByKey and countByKey

_groupByKey_ and _mapValues_

In [15]:
groupByKeyRatings = movieRatings.groupByKey()
#groupByKeyRatings
groupByKeyRatings.take(5)

[('3826', <pyspark.resultiterable.ResultIterable at 0x7f9213bac160>),
 ('104', <pyspark.resultiterable.ResultIterable at 0x7f9213bace48>),
 ('153', <pyspark.resultiterable.ResultIterable at 0x7f9213bac198>),
 ('165', <pyspark.resultiterable.ResultIterable at 0x7f9213bace10>),
 ('181', <pyspark.resultiterable.ResultIterable at 0x7f9213bac2b0>)]

In [16]:
mapValuesToListRatings = groupByKeyRatings.mapValues(list)
#mapValuesToListRatings
mapValuesToListRatings.take(5)

[('3826',
  [2.0,
   3.0,
   3.0,
   3.0,
   4.0,
   3.0,
   1.0,
   3.0,
   3.0,
   1.0,
   2.0,
   2.0,
   3.0,
   2.0,
   3.0,
   3.0,
   2.0,
   2.0,
   2.0,
   0.5,
   3.0,
   3.0,
   1.0,
   1.5,
   2.0,
   4.0,
   2.0,
   3.0,
   1.0,
   4.0,
   3.5,
   2.0,
   2.5,
   2.0,
   3.5,
   2.0,
   2.5,
   2.0,
   3.5,
   3.0,
   0.5,
   1.5,
   1.5,
   1.0,
   1.0,
   2.5,
   1.5,
   1.0,
   4.0,
   0.5,
   2.0,
   2.5,
   3.5,
   2.0,
   0.5,
   1.0,
   2.0,
   2.0,
   2.0,
   2.5,
   2.0,
   2.0,
   3.0,
   3.0,
   3.5,
   3.5,
   2.0,
   3.0,
   0.5,
   2.0,
   1.0,
   2.0,
   2.5,
   2.5,
   1.0,
   1.5,
   3.0,
   3.0,
   3.0,
   2.0,
   2.5,
   3.0,
   3.5,
   2.0,
   4.0,
   0.5,
   4.0,
   2.5,
   3.0,
   4.5,
   4.0,
   2.5,
   2.0,
   3.0,
   2.5,
   3.0,
   2.5,
   2.0,
   2.5,
   2.0,
   2.0,
   2.0,
   5.0,
   4.0,
   3.0,
   2.0,
   1.0,
   3.0,
   2.5,
   1.0,
   1.5,
   2.5,
   1.5,
   1.0,
   3.0,
   3.0,
   4.0,
   3.0,
   3.0,
   3.0,
   2.5,
   4.5,
   2.5,
   3.0

In [17]:
avgRatings01 = mapValuesToListRatings.mapValues(lambda V: sum(V) / float(len(V)))
#avgRatings01
avgRatings01.take(5)

[('3826', 2.5461339626882444),
 ('104', 3.3942778801636178),
 ('153', 2.9020622558025204),
 ('165', 3.5077587042963825),
 ('181', 1.9860046651116294)]

In [18]:
test = [2.0, 4.0, 3.0]
sum(test) / len(test)

3.0

_reduceByKey_ and _countByKey_

In [19]:
countsByKey = movieRatings.countByKey()
countsByKey

defaultdict(int,
            {'307': 7958,
             '481': 6037,
             '1091': 6138,
             '1257': 5902,
             '1449': 6867,
             '1590': 8511,
             '1591': 6508,
             '2134': 7020,
             '2478': 7797,
             '2840': 6047,
             '2986': 6060,
             '3020': 7783,
             '3424': 7265,
             '3698': 8269,
             '3826': 8898,
             '3893': 5259,
             '170': 9574,
             '849': 9878,
             '1186': 8643,
             '1235': 9937,
             '1244': 10249,
             '1296': 7470,
             '1663': 9581,
             '1962': 10219,
             '2108': 7729,
             '2243': 6398,
             '2352': 7517,
             '2707': 8198,
             '2746': 10142,
             '2915': 9590,
             '3363': 9001,
             '640': 3393,
             '828': 1736,
             '960': 173,
             '1221': 38875,
             '1321': 9175,
             '1

In [20]:
def sumValues(x, y):
    return (x + y)

In [21]:
sumRatings = movieRatings.reduceByKey(sumValues)
sumRatings.take(10)

[('3826', 22655.5),
 ('104', 75512.5),
 ('153', 112156.0),
 ('165', 144674.0),
 ('181', 5960.0),
 ('253', 109720.5),
 ('423', 7391.5),
 ('494', 48935.5),
 ('762', 28038.0),
 ('1396', 50437.5)]

In [22]:
avgRatings02 = sumRatings.map(lambda x: (x[0], x[1]/countsByKey.get(x[0])))
avgRatings02.take(10)

[('3826', 2.5461339626882444),
 ('104', 3.3942778801636178),
 ('153', 2.9020622558025204),
 ('165', 3.5077587042963825),
 ('181', 1.9860046651116294),
 ('253', 3.5052233084147977),
 ('423', 3.008343508343508),
 ('494', 3.3739313292884723),
 ('762', 2.414987080103359),
 ('1396', 3.6338256484149856)]

In [23]:
movies = sc.textFile("/users/trush/CSC496/DataParallelComputing/data/ml-latest/movies.csv")
movies.take(5)

['movieId,title,genres',
 '1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy',
 '2,Jumanji (1995),Adventure|Children|Fantasy',
 '3,Grumpier Old Men (1995),Comedy|Romance',
 '4,Waiting to Exhale (1995),Comedy|Drama|Romance']

In [24]:
movieHeader = movies.first()
movieInfo = movies.filter(lambda x: x != movieHeader)
movieInfo.take(5)

['1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy',
 '2,Jumanji (1995),Adventure|Children|Fantasy',
 '3,Grumpier Old Men (1995),Comedy|Romance',
 '4,Waiting to Exhale (1995),Comedy|Drama|Romance',
 '5,Father of the Bride Part II (1995),Comedy']

In [25]:
# rsplit handles movies with commas in the title
movieTitles = movieInfo.map(lambda line: (line.split(",",1)[0], line.rsplit(",",1)[0].split(",",1)[1]))
movieTitles.take(5)

[('1', 'Toy Story (1995)'),
 ('2', 'Jumanji (1995)'),
 ('3', 'Grumpier Old Men (1995)'),
 ('4', 'Waiting to Exhale (1995)'),
 ('5', 'Father of the Bride Part II (1995)')]

In [26]:
s = '1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy'
print(s.split(","))
print(s.rsplit(",",1)[0])
print(s.rsplit(",",1)[0].split(",",1)[1])

['1', 'Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy']
1,Toy Story (1995)
Toy Story (1995)


In [None]:
augmentedRatings = avgRatings01.join(movieTitles)
augmentedRatings.take(5)

In [None]:
augmentedRatings.takeOrdered(50, key = lambda x: -x[1][0])

## Challenge:
   report the average ratings for movies with ratings higher than 3.75 and the number of ratings total more than 100

In [None]:
# Number of Ratings per title
# countsByKey

# Average Rating per title
# avgRatings02

# Calculate avg after filter

avgRatings03 = sumRatings.map(lambda x: (x[0], x[1]/countsByKey.get(x[0]), countsByKey.get(x[0]))).filter(lambda x: x[1] > 3.75).filter(lambda x: x[2] > 100)
avgRatings03.take(5)