In [1]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.functions import col,max,count,sum,mean,stddev_pop,hour,countDistinct,expr,stddev,window,column
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import Row
import math

In [2]:
# Spark configuration: application name and a single-threaded local master.
conf = SparkConf().setAppName('myapp').setMaster('local')

# Get the active SparkContext or create one from `conf`.
# getOrCreate is called on the class: instantiating SparkContext() first would
# start a default-configured context, and getOrCreate would then just return
# it, silently ignoring `conf`. It also fails on re-run while a context is
# still active. Called this way the cell can be re-executed without sc.stop().
sc = SparkContext.getOrCreate(conf=conf)
sc.setLogLevel("OFF")

# SparkSession bound to the context above, obtained through the builder
# so an existing session is reused instead of constructing a new one.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

### Select only the following columns into a new dataframe: "video_id", "trending_date", "title", "views".

In [3]:
# Load the CSV; the first row supplies the column names and Spark infers
# each column's type from the data.
dataframe = spark.read.csv('Video_Statistics.csv', header=True, inferSchema=True)

# Keep only the four columns of interest. (The selection was previously
# performed twice with equivalent results; one select is enough.)
videoStats = dataframe.select("video_id", "trending_date", "title", "views")
videoStats.show()

+--------+-------------+--------------------+-----+
|video_id|trending_date|               title|views|
+--------+-------------+--------------------+-----+
|       1|            4|  Goodbye First Love|  560|
|       2|            7|       Club Sandwich|  451|
|       3|           10|      Situation, The|  773|
|       4|            5|          Mine Games|  165|
|       5|            4|     Laws of Gravity|  559|
|       6|            3|Official Story, T...|  430|
|       7|           12|          Restaurant|  940|
|       8|            2|          Dead Birds|  613|
|       9|            2|         Signal, The|  759|
|      10|           11|You Were Never Lo...|  108|
|      11|            1|        Wholly Moses|  254|
|      12|            8|       Firehouse Dog|  316|
|      13|            2|        House of Wax|  858|
|      14|           11|Half Moon (a.k.a....|  934|
|      15|            4|        Spider-Man 3|  921|
|      16|           10|         Wall Street|  826|
|      17|  

### Now that we have selected the columns we are interested in, create a new column in the "videoStats" dataframe named "new" that contains the views divided by 100.

In [4]:
# Append "New_Views" = views / 100.
# withColumn names the result directly instead of renaming Spark's
# auto-generated "(Views / 100)" label (the rename is a silent no-op if that
# label ever differs), and it overwrites an existing "New_Views" column, so
# re-running this cell does not produce duplicate columns.
videoStats = videoStats.withColumn("New_Views", col("views") / 100)
videoStats.show()

+--------+-------------+--------------------+-----+---------+
|video_id|trending_date|               title|views|New_Views|
+--------+-------------+--------------------+-----+---------+
|       1|            4|  Goodbye First Love|  560|      5.6|
|       2|            7|       Club Sandwich|  451|     4.51|
|       3|           10|      Situation, The|  773|     7.73|
|       4|            5|          Mine Games|  165|     1.65|
|       5|            4|     Laws of Gravity|  559|     5.59|
|       6|            3|Official Story, T...|  430|      4.3|
|       7|           12|          Restaurant|  940|      9.4|
|       8|            2|          Dead Birds|  613|     6.13|
|       9|            2|         Signal, The|  759|     7.59|
|      10|           11|You Were Never Lo...|  108|     1.08|
|      11|            1|        Wholly Moses|  254|     2.54|
|      12|            8|       Firehouse Dog|  316|     3.16|
|      13|            2|        House of Wax|  858|     8.58|
|      1

### Please write the line of code to calculate the mean of the views

In [5]:
# Average of the "views" column, displayed as a one-row frame.
videoStats.agg(mean("views").alias("Mean_Views")).show()

+----------+
|Mean_Views|
+----------+
|     553.4|
+----------+



### Please write the line of code to calculate the population standard deviation of the views

In [6]:
# Population standard deviation of "views" (stddev_pop rather than stddev).
videoStats.agg(
    stddev_pop("views").alias("Standard Deviation Population")
).show()

+-----------------------------+
|Standard Deviation Population|
+-----------------------------+
|            276.5580228451165|
+-----------------------------+



### Please write the line of code to find the maximum of the views

In [7]:
# Maximum, total, and average of "views" in a single one-row summary.
# Note: max and sum here are the pyspark.sql.functions versions imported in
# the first cell, not the Python builtins.
videoStats.agg(
    max("views").alias("Max Views"),
    sum("views").alias("Sum_Of_Views"),
    mean("views").alias("Mean_Views"),
).show()


+---------+------------+----------+
|Max Views|Sum_Of_Views|Mean_Views|
+---------+------------+----------+
|      940|       11068|     553.4|
+---------+------------+----------+



### Group the dataframe by the "trending_date" column and aggregate by the mean of the "views" column. The result should be put in a new dataframe named "videoStatGroup"

In [8]:
# Average "views" for each distinct trending_date value.
videoStatGroup = videoStats.groupBy("trending_date").agg(mean("views"))
videoStatGroup.show()

+-------------+-----------------+
|trending_date|       avg(views)|
+-------------+-----------------+
|           12|            940.0|
|            1|            254.0|
|            3|            582.5|
|            5|            165.0|
|            4|            638.0|
|            8|            316.0|
|            7|            316.0|
|           10|            799.5|
|           11|            405.0|
|            2|743.3333333333334|
+-------------+-----------------+

