In [1]:
from pyspark import SparkContext, SparkConf
# Application configuration: only the application name ("Movies") is set here.
conf = SparkConf().setAppName("Movies")
# getOrCreate returns the already-running SparkContext when one exists, so
# re-executing this cell in a live kernel no longer fails with
# "Cannot run multiple SparkContexts at once". When a context already exists,
# it is reused as-is and `conf` is not re-applied.
sc = SparkContext.getOrCreate(conf)

In [2]:
# Load each CSV file as an RDD of raw text lines.
# NOTE: despite the df_ prefix these are RDDs of strings, not DataFrames.
df_users, df_zipcode, df_rating = (
    sc.textFile(csv_path)
    for csv_path in (
        './moviedata/users.csv',
        './moviedata/zipcodes.csv',
        './moviedata/rating.csv',
    )
)

In [3]:
# Break every users.csv line into its comma-separated fields
df_users_columns = df_users.map(lambda line: line.split(","))
# Pair field 0 (user id) with field 4 (zipcode), both coerced to str
userid_zipcode = df_users_columns.map(
    lambda fields: (str(fields[0]), str(fields[4]))
)

In [4]:
# Preview five (user id, zipcode) pairs; the zipcode keeps the double quotes from the CSV
userid_zipcode.take(5)

[('780', '"94560"'),
 ('781', '"48825"'),
 ('783', '"77081"'),
 ('784', '"91040"'),
 ('785', '"23322"')]

In [5]:
# Build one (user id, 1) record per distinct user id found in rating.csv.
# Every record is emitted with the value 1 and the reducer also returns 1, so
# the value is 1 whether a user rated once or many times. reduceByKey does not
# call the reducer for a key that has a single record, so any other emitted
# placeholder would survive for single-rating users and inflate the per-zipcode
# sums computed from these values later in the notebook.
rated_users = (
    df_rating
    .map(lambda line: line.split(","))
    .map(lambda fields: (str(fields[0]), 1))
    .reduceByKey(lambda _left, _right: 1)
)

In [6]:
def mapreduce_join_util(x):
    """Cross-join the tagged values collected for one key.

    x is a (key, tagged_values) pair in which every tagged value is a
    (source_tag, value) tuple. Values tagged 1 form the left side; values
    carrying any other tag form the right side.

    Returns a list with one (left_value, right_value) tuple for every
    left/right combination. The key itself is not part of the result, and
    the list is empty when either side has no values.
    """
    left_values, right_values = [], []
    for source_tag, value in x[1]:
        # Route each value to the side its tag identifies
        bucket = left_values if source_tag == 1 else right_values
        bucket.append(value)
    return [(left, right) for left in left_values for right in right_values]
    

In [7]:
def mapreduce_join(rdd1, rdd2):
    """Join two pair RDDs on their keys, map-reduce style.

    Each (key, value) record is rewritten as (key, [(tag, value)]) where the
    tag is 1 for records from rdd1 and 2 for records from rdd2. The two
    tagged RDDs are unioned, the one-element lists are concatenated per key,
    and mapreduce_join_util expands every key's list into pairs.

    Returns an RDD of (rdd1_value, rdd2_value) pairs, one for each
    combination of records that share a key. The key is not kept.
    """
    left_tagged = rdd1.map(lambda record: (record[0], [(1, record[1])]))
    right_tagged = rdd2.map(lambda record: (record[0], [(2, record[1])]))
    all_tagged = left_tagged.union(right_tagged)
    # List concatenation gathers every tagged value that shares a key
    grouped_by_key = all_tagged.reduceByKey(lambda acc, more: acc + more)
    return grouped_by_key.flatMap(mapreduce_join_util)

In [8]:
# Join (user id, zipcode) with the rated-user records on user id. The result
# holds one (zipcode, per-user value) pair for each user that appears in both.
zipcode_part_usercnt = mapreduce_join(userid_zipcode, rated_users)

In [9]:
# Sample of the joined (zipcode, per-user value) pairs
zipcode_part_usercnt.take(5)

[('"2215"', 1), ('"61401"', 1), ('"85719"', 1), ('"1970"', 1), ('"17870"', 1)]

In [10]:
# Total the joined per-user values for each zipcode to get its number of rating users
zipcode_usercount = zipcode_part_usercnt.reduceByKey(
    lambda running_total, count: running_total + count
)

In [11]:
# Sample of the (zipcode, number of rating users) totals
zipcode_usercount.take(5)

[('"55414"', 9), ('"97302"', 1), ('"54901"', 1), ('"6260"', 1), ('"53711"', 2)]

In [12]:
# Break every zipcodes.csv line into its comma-separated fields
df_zipcode_columns = df_zipcode.map(lambda line: line.split(","))
# Build (zipcode, city) key-value pairs: field 0 is the zipcode, field 2 is the city
zipcode_city = df_zipcode_columns.map(
    lambda fields: (str(fields[0]), str(fields[2]))
)

In [13]:
# Sample of the (zipcode, city) lookup pairs
zipcode_city.take(4)

[('"2574"', '"WEST FALMOUTH"'),
 ('"1886"', '"WESTFORD"'),
 ('"1472"', '"WEST GROTON"'),
 ('"2671"', '"WEST HARWICH"')]

In [14]:
# Join (zipcode, city) with (zipcode, user count) on zipcode. The result holds
# one (city, user count) pair for each matching pair of records.
city_part_usercount = mapreduce_join(zipcode_city, zipcode_usercount)

In [15]:
# Sample of the (city, user count) pairs before they are summed per city
city_part_usercount.take(5)

[('"BROOKLYN"', 1),
 ('"MOUNT CLEMENS"', 1),
 ('"SAN ANTONIO"', 1),
 ('"SAN FRANCISCO"', 1),
 ('"SEATTLE"', 1)]

In [16]:
# Add up the user counts of all records that share a city name
city_usercount = city_part_usercount.reduceByKey(
    lambda running_total, count: running_total + count
)

In [17]:
# Final result: fetch every (city, user count) pair and display the full list
city_usercount.collect()

[('"WASILLA"', 1),
 ('"WALLINGFORD"', 1),
 ('"EL CAJON"', 1),
 ('"APEX"', 1),
 ('"LYNNFIELD"', 1),
 ('"TUCSON"', 4),
 ('"CUPERTINO"', 1),
 ('"FAIRFAX"', 1),
 ('"PARK FOREST"', 1),
 ('"VENTURA"', 1),
 ('"YPSILANTI"', 2),
 ('"RESEDA"', 1),
 ('"SARATOGA SPRINGS"', 1),
 ('"VALLEJO"', 1),
 ('"GILBERT"', 1),
 ('"NORTH CANTON"', 1),
 ('"SAINT PETERSBURG"', 1),
 ('"SCOTTSDALE"', 2),
 ('"WINTER PARK"', 1),
 ('"MARSHALL"', 1),
 ('"LITTLETON"', 3),
 ('"RIDGECREST"', 2),
 ('"WISCONSIN RAPIDS"', 1),
 ('"BEVERLY"', 1),
 ('"SALT LAKE CITY"', 4),
 ('"AUSTIN"', 8),
 ('"AVERA"', 1),
 ('"SUNLAND"', 1),
 ('"DELRAY BEACH"', 1),
 ('"ROSELLE PARK"', 1),
 ('"GOLETA"', 1),
 ('"SUNDERLAND"', 1),
 ('"ASHLAND"', 1),
 ('"PHOENIX"', 2),
 ('"DELAVAN"', 1),
 ('"GUILFORD"', 1),
 ('"RISING SUN"', 1),
 ('"ACTON"', 2),
 ('"DANA POINT"', 2),
 ('"TUCKAHOE"', 1),
 ('"COLUMBUS"', 9),
 ('"COLDWATER"', 1),
 ('"KIRKLAND"', 1),
 ('"UNION"', 1),
 ('"SAN RAMON"', 1),
 ('"OLYMPIA"', 1),
 ('"CROFTON"', 2),
 ('"EDEN PRAIRIE"', 2),
 (