In [None]:
# Bootstrap step: findspark.init() is run before any `pyspark` import in the
# cells below so that the pyspark package can be found from this kernel.
import findspark
findspark.init()

In [None]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Cluster and resource settings, built as one fluent chain (each setter
# returns the same SparkConf instance).
# For a single-machine run, use .setMaster("local[2]") instead of the
# standalone-cluster URL below.
config = (
    SparkConf()
    .setMaster("spark://192.168.11.71:7077")
    .setAppName("SparkDataFrameHdfs")
    .set("spark.executor.memory", "4g")
    .set("spark.executor.cores", 4)
    .set("spark.cores.max", 4)
    .set("spark.driver.memory", "4g")
)

# Reuse an existing session when one is already running, otherwise create it
# with the configuration above.
session_builder = SparkSession.builder.config(conf=config)
spark = session_builder.getOrCreate()


In [None]:
from pyspark.sql.types import StructType, LongType,StringType, IntegerType, DoubleType

# Explicit schemas used when reading movies.csv and ratings.csv.
# The chains are wrapped in parentheses instead of being continued with
# backslashes: a backslash after the final .add(...) only parses while a blank
# line happens to follow it, which makes the cell fragile to edit.

# movies.csv: movieId, title, genres (every field nullable)
movieSchema = (
    StructType()
    .add("movieId", IntegerType(), True)
    .add("title", StringType(), True)
    .add("genres", StringType(), True)
)

# ratings.csv: userId, movieId, rating, timestamp (every field nullable);
# the timestamp is kept as a raw string here.
ratingSchema = (
    StructType()
    .add("userId", IntegerType(), True)
    .add("movieId", IntegerType(), True)
    .add("rating", DoubleType(), True)
    .add("timestamp", StringType(), True)
)

In [None]:
# Load both MovieLens CSV files from HDFS. The first line of each file is a
# header row, and the column types come from the schemas declared earlier
# rather than from inference.
movieDf = (
    spark.read
    .format("csv")
    .schema(movieSchema)
    .option("header", True)
    .load("hdfs://192.168.93.128:9000/ml-latest-small/movies.csv")
)

ratingDf = (
    spark.read
    .format("csv")
    .schema(ratingSchema)
    .option("header", True)
    .load("hdfs://192.168.93.128:9000/ml-latest-small/ratings.csv")
)

In [None]:
# Preview the first two rows of each freshly loaded DataFrame
for loadedDf in (movieDf, ratingDf):
    loadedDf.show(2)

In [None]:
# ratingDf has four columns; project just movieId and rating into a new DataFrame
movieRatingPairsDf = ratingDf.select(ratingDf["movieId"], ratingDf["rating"])
movieRatingPairsDf.show(2)

In [None]:
# count() is an action: it runs a job and returns the number of rows
totalRatingRows = ratingDf.count()
print("Count ", totalRatingRows)

In [None]:
# Column names as a plain Python list
ratingColumnNames = ratingDf.columns
print("Columns", ratingColumnNames)

# Full schema object (a StructType describing every field)
ratingStructType = ratingDf.schema
print(ratingStructType)

In [None]:
# select() returns a NEW DataFrame holding only the requested columns;
# ratingDf itself is left unchanged.
projectedRatingDf = ratingDf.select("movieId", "rating")

# The projected schema should list exactly movieId and rating
projectedRatingDf.printSchema()
projectedRatingDf.show(2)


In [None]:
# collect() is an action that brings EVERY row of the DataFrame back to the
# driver as a Python list of Row objects. Echoing that whole list floods the
# notebook output, so report how many rows came back and display only the
# first few.
movieRows = movieDf.collect()
print("Collected rows:", len(movieRows))
movieRows[:5]

In [None]:
# take(n) returns only the first n rows to the driver as a list of Row objects
firstTwoMovieRows = movieDf.take(2)
firstTwoMovieRows

In [None]:
# Before: the original columns
ratingDf.show(1)

# withColumn() derives a new column from existing data and returns a new
# DataFrame; here every rating is shifted up by 0.2.
adjustedRatingExpr = ratingDf["rating"] + 0.2
ratingAdjustedDf = ratingDf.withColumn("rating_adjusted", adjustedRatingExpr)

# After: the same row with the extra rating_adjusted column
ratingAdjustedDf.show(1)

In [None]:
# Before the rename
ratingDf.show(1)

# withColumnRenamed(existing_name, new_name) returns a new DataFrame in which
# the "rating" column is called "ratings"
renamedRatingDf = ratingDf.withColumnRenamed(existing="rating", new="ratings")

# After the rename
renamedRatingDf.show(1)

In [None]:
# Variants of select()

# 1) "*" keeps every column
allColumnsDf = ratingDf.select("*")
allColumnsDf.show(1)

# 2) a subset of columns, referenced by name
movieAndRatingDf = ratingDf.select("movieId", "rating")
movieAndRatingDf.show(1)

# 3) a column object plus a derived expression; .alias() gives the derived
#    column its name
adjustedRating = (ratingDf.rating + 0.2).alias("rating_adjusted")
userAdjustedRatingDf = ratingDf.select(ratingDf.userId, adjustedRating)
userAdjustedRatingDf.show(1)

In [None]:
# Row predicates: filter() and where() are aliases of each other, so both
# calls below yield the same rows (ratings strictly greater than 4).
ratedAboveFour = ratingDf.rating > 4

viaFilterDf = ratingDf.filter(ratedAboveFour)
viaFilterDf.show(3)

viaWhereDf = ratingDf.where(ratedAboveFour)
viaWhereDf.show(3)

In [None]:
# Combine predicates with & (logical AND). Each comparison needs its own
# parentheses because & binds tighter than >= and <= in Python.
atLeastThree = (ratingDf.rating >= 3)
atMostFour = (ratingDf.rating <= 4)

midRangeRatingDf = ratingDf.filter(atLeastThree & atMostFour)
midRangeRatingDf.show(4)

In [None]:
from pyspark.sql.functions import col

# Same 3..4 range filter, but the column is referenced by name through col()
# instead of as an attribute of ratingDf.
ratingCol = col("rating")
midRangeByColDf = ratingDf.filter((ratingCol >= 3) & (ratingCol <= 4))
midRangeByColDf.show(4)

In [None]:
from pyspark.sql.functions import col, asc, desc

# Sorting by a column name uses ascending order by default
defaultOrderDf = ratingDf.sort("rating")
defaultOrderDf.show(5)

# Ascending order requested explicitly
ascendingDf = ratingDf.sort(asc("rating"))
ascendingDf.show(5)

# Descending order
descendingDf = ratingDf.sort(desc("rating"))
descendingDf.show(5)

In [None]:
# Aggregation: number of ratings per movie
from pyspark.sql.functions import col, desc, avg, count

# Popularity here means "rated by many users"; the average rating is not
# taken into account. Only movies with at least 100 ratings are kept, ordered
# from most-rated to least-rated.
# The aggregate is named with .alias() rather than renaming Spark's
# auto-generated "count(userId)" column afterwards, so the pipeline does not
# depend on that generated name.
mostPopularDf = (
    ratingDf
    .groupBy("movieId")
    .agg(count("userId").alias("total_ratings"))
    .filter(col("total_ratings") >= 100)
    .sort(desc("total_ratings"))
)

mostPopularDf.show(200)

In [None]:
# Reminder of the movie columns (movieId, title, genres) before movieDf is
# joined with the popularity result on movieId.
movieDf.show(2)

In [None]:
# Attach the movie title to each popular movie by joining with movieDf.
# Both sides carry a movieId column, so the select refers to it through
# mostPopularDf to stay unambiguous.
sameMovieId = movieDf.movieId == mostPopularDf.movieId

mostPopularMoviesDf = (
    mostPopularDf
    .join(movieDf, on=sameMovieId, how="inner")
    .select(mostPopularDf.movieId, "title", "total_ratings")
)

mostPopularMoviesDf.show(5)

In [None]:
# Two aggregates per movie: how many ratings it received and their average
from pyspark.sql.functions import col, desc, avg, count

votesPerMovie = count("userId").alias("total_ratings")
meanRating = avg("rating").alias("avg_rating")

# Keep movies that have at least 100 ratings AND an average rating of 3 or
# more, ordered from most-rated to least-rated.
popularAndWellRated = (col("total_ratings") >= 100) & (col("avg_rating") >= 3)

mostPopularDf = (
    ratingDf
    .groupBy("movieId")
    .agg(votesPerMovie, meanRating)
    .filter(popularAndWellRated)
    .sort(desc("total_ratings"))
)

mostPopularDf.show(200)

In [None]:
# Attach the movie title to the count/average result by joining with movieDf.
# movieId exists on both sides, so it is selected through mostPopularDf.
sameMovieId = movieDf.movieId == mostPopularDf.movieId

mostPopularMoviesDf = (
    mostPopularDf
    .join(movieDf, on=sameMovieId, how="inner")
    .select(mostPopularDf.movieId, "title", "total_ratings", "avg_rating")
)

mostPopularMoviesDf.show(5)

In [None]:
# Persist the result as CSV under an HDFS directory; 'overwrite' replaces
# whatever a previous run left at that path.
#
# NOTE(review): the steps below describe writing to a local Windows folder
# and do not apply to the HDFS target used here:
#   1. create a folder named "output" in c:
#   2. open a command prompt
#   3. grant permissions:  winutils.exe chmod -R 777 c:\output
topMoviesWriter = mostPopularMoviesDf.write.mode('overwrite')
topMoviesWriter.csv("hdfs://192.168.93.128:9000/output/top-moives")

# Alternative kept for reference: convert to pandas and write one local file
#mostPopularMoviesDf.toPandas()\
#                    .to_csv("c:/movies.csv")

In [None]:
# repartition(n) redistributes the rows into n partitions and always performs
# a full shuffle. It is the right tool for INCREASING the partition count.
# Using it to shrink the data down to one partition, as done here, pays for
# that shuffle, so this is shown as the less preferred way to reduce
# partitions.
singlePartitionDf = mostPopularMoviesDf.repartition(1)

singlePartitionDf.write.mode('overwrite').csv(
    "hdfs://192.168.93.128:9000/output/top-movies-one-file"
)


In [None]:
# coalesce helps to reparitions tooo
# should be used to reduce the partitions, should not be used to increase the parititions
# coalesce shall possibly reduce the shuffling, but cannot stop shuffling 
mostPopularMoviesDf.coalesce(1).write.mode('overwrite')\
                         .csv("hdfs://192.168.93.128:9000/output/top-movies-coalesce")

In [None]:
# Schema of the exported top-movies CSV: no header row, columns in the order
# they were selected before writing (movieId, title, total_ratings, avg_rating).
# total_ratings is produced by count(), an integral value, so it is read back
# as LongType rather than DoubleType.
topMovieSchema = (
    StructType()
    .add("movieId", IntegerType(), True)
    .add("title", StringType(), True)
    .add("total_ratings", LongType(), True)
    .add("avg_rating", DoubleType(), True)
)

# Spark can load a whole output directory: pointing it at the folder reads
# the partition files written inside it.
# NOTE: the directory name "top-moives" is spelled exactly as in the cell
# that writes it; change both together if the name is ever corrected.
topMovies = (
    spark.read
    .format("csv")
    .option("header", False)
    .schema(topMovieSchema)
    .load("hdfs://192.168.93.128:9000/output/top-moives")
)

topMovies.show()
