In [1]:
import findspark
from ml_utils import *
findspark.init()
import sys
from pyspark import SparkContext

In [2]:
# Create the Spark driver context for this notebook. Only one SparkContext can
# be active at a time, so call sc.stop() before re-running this cell.
sc = SparkContext(appName="Top Dislike Growth")


# Output directory, given as a relative path.
# On HDFS a relative path is resolved against your home directory: /user/<yourUserName>
# NOTE(review): output_path is not referenced by any later cell shown in this
# notebook -- confirm whether the results were meant to be saved there.
output_path = "topDislikes"

# Lazily read the input CSV as an RDD of raw text lines (header row included).
videos = sc.textFile("AllVideos_short.csv")


In [3]:
def id_and_country(line):
    """Return ``(video_id, country)`` parsed from one comma-separated row.

    The row is trimmed and split on ","; field 0 is taken as the video id
    and field 11 as the country. Both values are returned as
    whitespace-stripped strings.

    Raises:
        IndexError: if the row has fewer than 12 comma-separated fields.
    """
    fields = line.strip().split(",")
    return fields[0].strip(), fields[11].strip()

def id_and_likes(line):
    """Return ``(video_id, likes)`` parsed from one comma-separated row.

    Field 0 is the video id and field 6 the like count. The count is
    returned as a stripped string, not converted to a number.

    Raises:
        IndexError: if the row has fewer than 7 comma-separated fields.
    """
    columns = line.strip().split(",")
    vid = columns[0].strip()
    like_count = columns[6].strip()
    return (vid, like_count)

def id_and_dislikes(line):
    """Return ``(video_id, dislikes)`` parsed from one comma-separated row.

    Field 0 is the video id and field 7 the dislike count. The count is
    returned as a stripped string, not converted to a number.

    Raises:
        IndexError: if the row has fewer than 8 comma-separated fields.
    """
    cells = line.strip().split(",")
    # Pull the two wanted columns in one pass, trimming each.
    vid, dislike_count = (cells[idx].strip() for idx in (0, 7))
    return vid, dislike_count

def id_and_date(line):
    """Return ``(video_id, trending_date)`` parsed from one comma-separated row.

    Field 0 is the video id and field 1 the trending date. The date is
    returned as a stripped string exactly as it appears in the row.

    Raises:
        IndexError: if the row has no comma (fewer than 2 fields).
    """
    pieces = line.strip().split(",")
    return (pieces[0].strip(), pieces[1].strip())

def id_and_category(line):
    """Return ``(video_id, category)`` parsed from one comma-separated row.

    Field 0 is the video id and field 3 the category name. Both are
    returned as whitespace-stripped strings.

    Raises:
        IndexError: if the row has fewer than 4 comma-separated fields.
    """
    tokens = line.strip().split(",")
    vid = tokens[0].strip()
    return vid, tokens[3].strip()

def key_and_country(line):
    """Turn one comma-separated row into a ``(key, value)`` pair for grouping.

    * key   -- ``"<video_id>:<category>:<country>"`` (fields 0, 3 and 11)
    * value -- ``"<trending_date>:<likes>:<dislikes>"`` (fields 1, 6 and 7)

    Every field is whitespace-stripped; numbers stay as strings.

    Raises:
        IndexError: if the row has fewer than 12 comma-separated fields.
    """
    cols = line.strip().split(",")

    def field(position):
        # Trimmed text of the column at `position`.
        return cols[position].strip()

    group_key = ":".join((field(0), field(3), field(11)))
    payload = ":".join((field(1), field(6), field(7)))
    return group_key, payload

In [4]:
# Drop the CSV header row. The filtered RDD is bound to a NEW name on purpose:
# rebinding `videos` to its own filtered version made this cell non-idempotent
# (a second run would treat the first data row as the header and discard it).
header = videos.first()
video_rows = videos.filter(lambda line: line != header)

# Pair RDD of ("video_id:category:country", "trending_date:likes:dislikes").
vv = video_rows.map(key_and_country)

In [5]:
# Preview the first 10 pairs. take(10) returns the same leading elements as
# collect()[0:10] without shipping the whole RDD to the driver first.
vv.take(10)

[('SbOwzAl9ZfQ:Entertainment:MX', '17.14.11:4182:361'),
 ('klOV6Xh-DnI:People & Blogs:MX', '17.14.11:271:174'),
 ('6L2ZF7Qzsbk:News & Politics:MX', '17.14.11:10105:266'),
 ('hcY52MFWMDM:News & Politics:MX', '17.14.11:378:171'),
 ('_OXDcGPVAa4:Howto & Style:MX', '17.14.11:57781:681'),
 ('Q9kK6NWZR1U:Music:MX', '17.14.11:506:67'),
 ('c9VTD3n_IDs:People & Blogs:MX', '17.14.11:2277:69'),
 ('XzULSsZYMRc:News & Politics:MX', '17.14.11:7745:659'),
 ('uijjYNtl_UM:Entertainment:MX', '17.14.11:20155:912'),
 ('cOJ68MQm2ac:Entertainment:MX', '17.14.11:83582:2194')]

In [6]:
def get_growth_rate(input):
    """Compute the dislike-growth figure between a video's first two trending records.

    Args:
        input: iterable of ``"<trending_date>:<likes>:<dislikes>"`` strings,
            all belonging to one grouping key. Trending dates are expected
            in ``YY.DD.MM`` form (e.g. ``"17.14.11"``).

    Returns:
        ``(likes - dislikes)`` of the earliest record minus
        ``(likes - dislikes)`` of the next one, i.e. how much more dislikes
        grew than likes between the two. Returns 0 when fewer than two
        records are present.

    Raises:
        ValueError / IndexError: if a like or dislike field in the two
            records used is missing or not an integer.

    Records are sorted chronologically before the first two are picked,
    because grouped values are not guaranteed to arrive in date order. The
    sort is stable, so input that is already chronological gives the same
    result as before. If any date cannot be parsed, the arrival order is
    kept unchanged.
    """
    def _chronological_key(record):
        # 'YY.DD.MM' -> (year, month, day) so tuples compare in calendar order.
        yy, dd, mm = record.split(":")[0].split(".")
        return int(yy), int(mm), int(dd)

    records = list(input)
    if len(records) < 2:
        return 0

    try:
        records = sorted(records, key=_chronological_key)
    except ValueError:
        # Unparseable date somewhere: fall back to the order received.
        pass

    net_likes = []
    for record in records[:2]:
        fields = record.split(":")
        net_likes.append(int(fields[1]) - int(fields[2]))
    return net_likes[0] - net_likes[1]
    
    

In [69]:
# Gather all "date:likes:dislikes" values that share a video/category/country
# key, then reduce each group to a single number with get_growth_rate.
# Note: groupByKey does not promise any particular order of values in a group.
rdd=vv.groupByKey().mapValues(get_growth_rate)

In [81]:
# Preview 10 (key, growth) pairs. take(10) avoids materialising the entire
# RDD on the driver just to slice off the first few elements.
rdd.take(10)

[('6L2ZF7Qzsbk:News & Politics:MX', -7952),
 ('c9VTD3n_IDs:People & Blogs:MX', -1463),
 ('uijjYNtl_UM:Entertainment:MX', 0),
 ('kTT472QeJGg:People & Blogs:MX', 0),
 ('7jmJtdqI6YE:Entertainment:MX', -116),
 ('M16Usa8oqDg:News & Politics:MX', -144),
 ('7D-swI6qNLM:Entertainment:MX', 0),
 ('WflHonz04Uc:Entertainment:MX', 0),
 ('d1oYTRYmNHs:Comedy:MX', 0),
 ('Wi3nAYFigpQ:Autos & Vehicles:MX', 0)]

In [70]:
# Rank the (key, growth) pairs by growth figure, largest first.
aggregated_result = rdd.sortBy(lambda pair: pair[1], ascending=False)

In [78]:
def order_items(line):
    """Reshape a ``(composite_key, rate)`` pair into the final output layout.

    Args:
        line: two-element sequence whose first item is a
            ``"<video_id>:<category>:<country>"`` string and whose second
            item is the growth figure.

    Returns:
        ``(video_id, "<rate>, <category>, <country>")``.

    Raises:
        IndexError: if the composite key has fewer than three
            ":"-separated parts.
    """
    rate = line[1]
    pieces = line[0].split(":")
    vid, category, country = pieces[0], pieces[1], pieces[2]
    return vid, ", ".join([str(rate), category, country])

In [79]:
# Convert each ranked ("video_id:category:country", rate) pair into
# (video_id, "rate, category, country"); map() keeps the sorted order.
result = aggregated_result.map(order_items)

In [73]:
# Show the full ranking (sorted descending by growth figure).
# collect() returns every element to the driver, so the output can be very
# long; use take(n) instead when only the top n rows are needed.
aggregated_result.collect()

[('BEePFpC9qG8:Film & Animation:DE', 366556),
 ('RmZ3DPJQo2k:Music:KR', 334594),
 ('1Aoc-cd9eYs:Entertainment:GB', 192222),
 ('QwZT7T-TXT0:Entertainment:US', 189608),
 ('QwZT7T-TXT0:Entertainment:GB', 189605),
 ('PfLCyR6Efvw:Music:GB', 106418),
 ('ZGEoqPpJQLE:Music:DE', 98934),
 ('ZGEoqPpJQLE:Music:RU', 98930),
 ('84LBjXaeKk4:Entertainment:FR', 93961),
 ('84LBjXaeKk4:Entertainment:RU', 93961),
 ('84LBjXaeKk4:Entertainment:DE', 93959),
 ('aqoPG-fYcXs:Entertainment:DE', 63985),
 ('5d0lsUiM0zo:People & Blogs:MX', 63912),
 ('Cd0EbzegNH0:Nonprofits & Activism:RU', 58201),
 ('8fVXPaVabxM:People & Blogs:RU', 57465),
 ('8I_NkJ8VTEI:People & Blogs:US', 39219),
 ('pOHQdIDds6s:People & Blogs:DE', 37480),
 ('yof_rbxu8Nc:News & Politics:RU', 36309),
 ('V5cOvyDpWfM:Sports:CA', 35994),
 ('V5cOvyDpWfM:Sports:GB', 35993),
 ('V5cOvyDpWfM:Sports:FR', 33596),
 ('V5cOvyDpWfM:Sports:DE', 33595),
 ('B0ry4CFZwfQ:People & Blogs:RU', 33502),
 ('RA6PXJIlCIk:People & Blogs:RU', 33362),
 ('uJb1zN2pGw4:Nonprofits &

In [80]:
# Show the final (video_id, "rate, category, country") rows in ranked order.
# collect() returns every element to the driver; prefer take(n) for a preview.
result.collect()

[('BEePFpC9qG8', '366556, Film & Animation, DE'),
 ('RmZ3DPJQo2k', '334594, Music, KR'),
 ('1Aoc-cd9eYs', '192222, Entertainment, GB'),
 ('QwZT7T-TXT0', '189608, Entertainment, US'),
 ('QwZT7T-TXT0', '189605, Entertainment, GB'),
 ('PfLCyR6Efvw', '106418, Music, GB'),
 ('ZGEoqPpJQLE', '98934, Music, DE'),
 ('ZGEoqPpJQLE', '98930, Music, RU'),
 ('84LBjXaeKk4', '93961, Entertainment, FR'),
 ('84LBjXaeKk4', '93961, Entertainment, RU'),
 ('84LBjXaeKk4', '93959, Entertainment, DE'),
 ('aqoPG-fYcXs', '63985, Entertainment, DE'),
 ('5d0lsUiM0zo', '63912, People & Blogs, MX'),
 ('Cd0EbzegNH0', '58201, Nonprofits & Activism, RU'),
 ('8fVXPaVabxM', '57465, People & Blogs, RU'),
 ('8I_NkJ8VTEI', '39219, People & Blogs, US'),
 ('pOHQdIDds6s', '37480, People & Blogs, DE'),
 ('yof_rbxu8Nc', '36309, News & Politics, RU'),
 ('V5cOvyDpWfM', '35994, Sports, CA'),
 ('V5cOvyDpWfM', '35993, Sports, GB'),
 ('V5cOvyDpWfM', '33596, Sports, FR'),
 ('V5cOvyDpWfM', '33595, Sports, DE'),
 ('B0ry4CFZwfQ', '33502, 

In [13]:
# Shut down the SparkContext created at the top of the notebook and release
# its resources. A new context must be created before running Spark cells again.
sc.stop()