In [1]:
import findspark
from ml_utils import *
findspark.init()
import sys
from pyspark import SparkContext

In [2]:
# Create the SparkContext used by every RDD operation in this notebook.
sc = SparkContext(appName="Top Dislike Growth")


# A relative path is used to specify the output directory.
# A relative path is always resolved against your home directory in HDFS: /user/<yourUserName>
# NOTE(review): output_path is not referenced by any cell visible in this notebook.
output_path = "topDislikes"

# Load the input CSV as an RDD of raw text lines; the header line is still part of it here.
videos = sc.textFile("AllVideos_short.csv")


In [40]:
def id_and_country(line):
    """Project one comma-separated record onto ``(video_id, country)``.

    The record is split on every comma; column 0 is returned as the video id
    and column 11 as the country, both with surrounding whitespace removed.

    Raises:
        IndexError: if the record has fewer than 12 comma-separated columns.
    """
    columns = [column.strip() for column in line.strip().split(",")]
    return columns[0], columns[11]

def id_and_likes(line):
    """Project one comma-separated record onto ``(video_id, likes)``.

    Column 0 is the video id and column 6 the likes count. The likes value is
    returned as the stripped string found in the record, not as a number.

    Raises:
        IndexError: if the record has fewer than 7 comma-separated columns.
    """
    record = line.strip().split(",")
    return record[0].strip(), record[6].strip()

def id_and_dislikes(line):
    """Project one comma-separated record onto ``(video_id, dislikes)``.

    Column 0 is the video id and column 7 the dislikes count. The dislikes
    value is returned as the stripped string found in the record.

    Raises:
        IndexError: if the record has fewer than 8 comma-separated columns.
    """
    cells = line.strip().split(",")
    vid, dislike_count = (cells[position].strip() for position in (0, 7))
    return vid, dislike_count

def id_and_date(line):
    """Project one comma-separated record onto ``(video_id, trending_date)``.

    Column 0 is the video id and column 1 the trending date; both are returned
    as stripped strings exactly as they appear in the record.

    Raises:
        IndexError: if the record contains no comma (i.e. has a single column).
    """
    tokens = line.strip().split(",")
    return tuple(token.strip() for token in (tokens[0], tokens[1]))

def id_and_category(line):
    """Project one comma-separated record onto ``(video_id, category)``.

    Column 0 is the video id and column 3 the category; both are returned with
    surrounding whitespace removed.

    Raises:
        IndexError: if the record has fewer than 4 comma-separated columns.
    """
    pieces = line.strip().split(",")
    vid = pieces[0].strip()
    return vid, pieces[3].strip()

def key_and_country(line):
    """Turn one comma-separated record into a colon-joined ``(key, value)`` pair.

    The key is ``"video_id:category:country"`` (columns 0, 3 and 11) and the
    value is ``"trending_date:likes:dislikes"`` (columns 1, 6 and 7). Every
    column is stripped of surrounding whitespace before being joined.

    Raises:
        IndexError: if the record has fewer than 12 comma-separated columns.
    """
    cols = [col.strip() for col in line.strip().split(",")]
    key = ":".join((cols[0], cols[3], cols[11]))
    value = ":".join((cols[1], cols[6], cols[7]))
    return key, value

In [45]:
# Drop the CSV header: read the first line of the RDD and filter out every line equal to it.
# NOTE(review): this cell is not idempotent. `videos` is rebound to the filtered RDD, so
# re-running the cell takes first() of already-filtered data and would treat a data row as
# the header. Run it once per kernel session.
header = videos.first()
videos=videos.filter(lambda line:line!=header)

# Key every remaining row as ("video_id:category:country", "trending_date:likes:dislikes").
vv=videos.map(key_and_country)

In [46]:
# Preview the first ten keyed records. take(10) fetches only what is needed, whereas
# collect()[0:10] ships the entire RDD to the driver before slicing; the rows shown are the same.
vv.take(10)

[('SbOwzAl9ZfQ:Entertainment:MX', '17.14.11:4182:361'),
 ('klOV6Xh-DnI:People & Blogs:MX', '17.14.11:271:174'),
 ('6L2ZF7Qzsbk:News & Politics:MX', '17.14.11:10105:266'),
 ('hcY52MFWMDM:News & Politics:MX', '17.14.11:378:171'),
 ('_OXDcGPVAa4:Howto & Style:MX', '17.14.11:57781:681'),
 ('Q9kK6NWZR1U:Music:MX', '17.14.11:506:67'),
 ('c9VTD3n_IDs:People & Blogs:MX', '17.14.11:2277:69'),
 ('XzULSsZYMRc:News & Politics:MX', '17.14.11:7745:659'),
 ('uijjYNtl_UM:Entertainment:MX', '17.14.11:20155:912'),
 ('cOJ68MQm2ac:Entertainment:MX', '17.14.11:83582:2194')]

In [62]:
def get_growth_rate(input):
    """Score the dislike growth between a video's two earliest trending records.

    Args:
        input: iterable of ``"trending_date:likes:dislikes"`` strings, as built by
            ``key_and_country`` (e.g. ``"17.14.11:4182:361"``). The parameter name
            shadows the ``input`` builtin; it is kept so existing callers still work.

    Returns:
        ``(likes - dislikes)`` of the earliest record minus ``(likes - dislikes)`` of
        the second-earliest record, which equals the increase in dislikes minus the
        increase in likes between them. Returns 0 when fewer than two records exist.

    Raises:
        ValueError: if likes or dislikes of one of the two records used is not an integer.
        IndexError: if one of the two records used has fewer than three ``:`` fields.
    """
    def chronological_key(record):
        # The sample data shows dates such as "17.14.11", i.e. yy.dd.mm, so plain
        # string comparison would order by day before month.
        year, day, month = record.split(":")[0].split(".")
        return int(year), int(month), int(day)

    records = list(input)
    if len(records) < 2:
        return 0

    try:
        # groupByKey() does not promise any ordering of the grouped values, so put the
        # records in date order before picking the first two. sorted() is stable, so
        # records sharing a date keep their arrival order.
        records = sorted(records, key=chronological_key)
    except ValueError:
        # A date that does not match yy.dd.mm: keep arrival order (the previous
        # behaviour) rather than failing the whole job.
        pass

    balances = []
    for record in records[:2]:
        fields = record.split(":")
        balances.append(int(fields[1]) - int(fields[2]))
    return balances[0] - balances[1]
    
    

In [63]:
# Gather all "trending_date:likes:dislikes" values per "video_id:category:country" key,
# then reduce each group to a single integer with get_growth_rate.
rdd=vv.groupByKey().mapValues(get_growth_rate)

In [64]:
# Bring every (key, growth) pair back to the driver and display the full list.
# NOTE(review): this prints one row per key; consider a bounded preview for larger inputs.
rdd.collect()

[('6L2ZF7Qzsbk:News & Politics:MX', -7952),
 ('c9VTD3n_IDs:People & Blogs:MX', -1463),
 ('uijjYNtl_UM:Entertainment:MX', 0),
 ('kTT472QeJGg:People & Blogs:MX', 0),
 ('7jmJtdqI6YE:Entertainment:MX', -116),
 ('M16Usa8oqDg:News & Politics:MX', -144),
 ('7D-swI6qNLM:Entertainment:MX', 0),
 ('WflHonz04Uc:Entertainment:MX', 0),
 ('d1oYTRYmNHs:Comedy:MX', 0),
 ('Wi3nAYFigpQ:Autos & Vehicles:MX', 0),
 ('3RITF3vm-KE:Entertainment:MX', 0),
 ('1i71glG1P5s:News & Politics:MX', 0),
 ('6YHkNTRMUL8:News & Politics:MX', 0),
 ('1ZAPwfrtAFY:Entertainment:MX', 0),
 ('O1v0F5-Xv68:Entertainment:MX', 0),
 ('RkTi5_hu-kY:Entertainment:MX', 0),
 ('b_U9i0vSZnQ:Entertainment:MX', 0),
 ('GZGdFRttWDI:Film & Animation:MX', -9088),
 ('lHJLYWwzphw:News & Politics:MX', 0),
 ('cgYhR-3QfhM:People & Blogs:MX', 0),
 ('O_jDNNEvvJo:People & Blogs:MX', 0),
 ('p-EupRWaX9E:Entertainment:MX', 0),
 ('FhuxIRzs8LE:Entertainment:MX', 0),
 ('zexzfD25jWU:Sports:MX', 0),
 ('so0ccfs-psI:Music:MX', 0),
 ('tU3b7GVb65E:Entertainment:MX', 

In [4]:
# Build one (video_id, <column>) pair RDD per column of interest from the raw text lines.
# NOTE(review): the saved preview of idNcountry further down starts with
# ('video_id', 'country'), so this cell was evidently executed while `videos` still
# contained the header line; the result depends on whether the header filter ran first.
idNcountry = videos.map(id_and_country)
idNlikes = videos.map(id_and_likes)
idNdislikes = videos.map(id_and_dislikes)
idNdates = videos.map(id_and_date)
idNcategory = videos.map(id_and_category)

In [6]:
# Join likes and dislikes on video_id, giving (video_id, (likes, dislikes)) elements.
# NOTE(review): join pairs every likes row with every dislikes row that shares an id, so
# an id present on several input rows yields several combinations — confirm this is intended.
likesNdislikes = idNlikes.join(idNdislikes)

In [19]:
# Preview the first ten (video_id, country) pairs. take(10) avoids pulling the whole RDD
# to the driver, which collect()[:10] does before slicing; the rows shown are the same.
idNcountry.take(10)

[('video_id', 'country'),
 ('SbOwzAl9ZfQ', 'MX'),
 ('klOV6Xh-DnI', 'MX'),
 ('6L2ZF7Qzsbk', 'MX'),
 ('hcY52MFWMDM', 'MX'),
 ('_OXDcGPVAa4', 'MX'),
 ('Q9kK6NWZR1U', 'MX'),
 ('c9VTD3n_IDs', 'MX'),
 ('XzULSsZYMRc', 'MX'),
 ('uijjYNtl_UM', 'MX')]

In [13]:
# Shut down the SparkContext created at the top of the notebook.
sc.stop()