In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) the Spark session for this notebook.
# "local[1]" runs Spark in-process on a single worker thread.
spark = (
    SparkSession.builder
    .appName("YoutubeAnalysis")
    .master("local[1]")
    .getOrCreate()
)

25/07/05 14:28:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/05 14:28:56 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/07/05 14:28:56 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/07/05 14:28:56 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [3]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [11]:
# Column layout of one record in the YouTube dataset.
# Field names and numeric types mirror the columns that `final_df` exposes,
# so the schema and the parsed DataFrame describe the same thing.
youtube_sch = StructType([
    StructField("Video_id", StringType(), True),
    StructField("uploader", StringType(), True),
    StructField("Interval", IntegerType(), True),
    StructField("Category", StringType(), True),
    StructField("Length", IntegerType(), True),
    StructField("no_of_views", IntegerType(), True),
    # A bare DecimalType() means decimal(10, 0), which would drop the
    # fractional part of a rating such as 4.96; ratings need two decimals.
    StructField("Rating", DecimalType(3, 2), True),
    StructField("no_of_ratings", IntegerType(), True),
    StructField("no_of_comments", IntegerType(), True),
    # NOTE(review): `final_df` carries the related ids as an array of strings;
    # this stays a plain string so the schema remains usable with a
    # delimited-file reader. Confirm which representation is wanted.
    StructField("Related_video_id", StringType(), True),
])

In [14]:
# Load the raw file as plain text: one row per input line, held in a single
# string column named "value".
youtube_txt = spark.read.format("text").load("certification/youtube/youtubedata.txt")

In [41]:
# Break every raw line into an array of fields on runs of tab characters.
# NOTE(review): "\t+" treats consecutive tabs as one separator, so an empty
# field in the middle of a line would shift the following columns left -
# confirm the data never contains empty fields.
split_df = youtube_txt.select(
    split(youtube_txt["value"], "\t+").alias("fields")
)

In [45]:
# Turn the positional `fields` array into named, typed columns.
# The first nine array entries are scalar attributes (listed below in array
# order, with the type to cast to, or None to keep the string as is);
# everything from the tenth entry onwards is collected into an array of
# related video ids.
final_df = split_df.select(
    *(
        (
            split_df["fields"][position].cast(target_type)
            if target_type is not None
            else split_df["fields"][position]
        ).alias(column_name)
        for position, (column_name, target_type) in enumerate([
            ("Video_id", None),
            ("uploader", None),
            ("Interval", "int"),
            ("Category", None),
            ("Length", "int"),
            ("no_of_views", "int"),
            ("Rating", "decimal(3,2)"),
            ("no_of_ratings", "int"),
            ("no_of_comments", "int"),
        ])
    ),
    # slice() is 1-based: start at the 10th element and take up to 5000 ids.
    slice(split_df["fields"], 10, 5000).alias("Related_video_id"),
)

In [46]:
# Quick sanity check of the parsed columns: print the first four rows.
final_df.show(n=4)

+-----------+--------------------+--------+--------------+------+-----------+------+-------------+--------------+--------------------+
|   Video_id|            uploader|Interval|      Category|Length|no_of_views|Rating|no_of_ratings|no_of_comments|    Related_video_id|
+-----------+--------------------+--------+--------------+------+-----------+------+-------------+--------------+--------------------+
|QuRYeRnAuXM|EvilSquirrelPictures|    1135|Pets & Animals|   252|       1075|  4.96|           46|            86|[gFa1YMEJFag, nRc...|
|3TYqkBJ9YRk|              hggh22|    1135|        Comedy|   169|        228|  5.00|            5|             3|[QuRYeRnAuXM, gFa...|
|rSJ8QZWBegU|             TimeGem|    1135| Entertainment|    95|        356|  4.31|           13|             1|[QuRYeRnAuXM, gFa...|
|nRcovJn9xHg|          wooochacha|    1135| Entertainment|   118|       1115|  2.23|           57|            73|[QuRYeRnAuXM, gFa...|
+-----------+--------------------+--------+------------

### A. Find the top 5 categories by number of videos uploaded.

In [48]:
# Count the videos in each category and print the five largest categories.
(
    final_df
    .groupBy("Category")
    .agg(count("Category").alias("Count"))
    .orderBy(desc("Count"))
    .show(5)
)



+---------------+-----+
|       Category|Count|
+---------------+-----+
|  Entertainment|  908|
|          Music|  862|
|         Comedy|  414|
| People & Blogs|  398|
|News & Politics|  333|
+---------------+-----+
only showing top 5 rows



                                                                                

### B. Find out the top 10 rated videos.

In [63]:
# Rank videos by rating, highest first, and break ties by the number of
# ratings received so that well-established scores come before scores that
# rest on only a few votes.
# Ordering by Rating (instead of filtering on Rating == 5.00) still returns
# ten rows when fewer than ten videos hold a perfect score, and avoids
# comparing a decimal column against a float literal.
(
    final_df
    .orderBy(col("Rating").desc(), col("no_of_ratings").desc())
    .show(10)
)

+-----------+-----------------+--------+----------------+------+-----------+------+-------------+--------------+--------------------+
|   Video_id|         uploader|Interval|        Category|Length|no_of_views|Rating|no_of_ratings|no_of_comments|    Related_video_id|
+-----------+-----------------+--------+----------------+------+-----------+------+-------------+--------------+--------------------+
|Dl4km0v-P60|     all4tubekids|    1126| Travel & Events|   321|       8811|  5.00|          129|            63|[lj0UGs-0y6I, nbW...|
|nPEOr55j0Fs|         kat12923|    1010|           Music|   194|       5915|  5.00|           84|            24|[yx_Zs8CZPZQ, E86...|
|KQweSiiviVQ|          somedia|    1127|           Music|   200|       5422|  5.00|           65|            47|[aoDBacpCX34, xoD...|
|voGD_rriZPA|     janyanjanyan|    1030| News & Politics|   536|       1654|  5.00|           55|            17|                  []|
|jIuCA4RRtXE|          somedia|    1126|           Music|   20

### C. Find the 10 most viewed videos.

In [64]:
# Print the ten videos with the highest view counts, most viewed first.
final_df.sort(desc("no_of_views")).show(10)

+-----------+---------------+--------+----------------+------+-----------+------+-------------+--------------+--------------------+
|   Video_id|       uploader|Interval|        Category|Length|no_of_views|Rating|no_of_ratings|no_of_comments|    Related_video_id|
+-----------+---------------+--------+----------------+------+-----------+------+-------------+--------------+--------------------+
|12Z3J1uzd0Q|        kaejane|     404|Film & Animation|   615|   65341925|  3.03|         9189|          5508|[innfyQZHPpo, -_C...|
|4DC4Rb9quKk|   ChrisBrownTV|     933|           Music|   265|   33754615|  4.84|        73257|         50036|[OqumjziPTzk, 1Hc...|
|LU8DDYz68kM|       Jason275|     807|  Pets & Animals|   503|   27721690|  4.88|        58850|         24004|[VryQDsx5Ad8, _Bt...|
|kHmvkRoEowc|itschriscrocker|     937|   Entertainment|   131|   18235463|  2.42|       122129|        259683|[h863nXDqCM0, hZA...|
|Md6rURKhZmA|    TPainVideos|     857|           Music|   240|   18141492|  