In [1]:
# Spark SQL and the DataFrame API are two interfaces to the same engine;
# this notebook exercises the SQL interface.
# findspark.init() is called before the pyspark imports in the next cell
# (presumably it makes the local Spark installation's pyspark importable -- TODO confirm).
import findspark
findspark.init()

In [2]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Settings for the remote Spark master and the resources this application asks for.
# For a quick local run, swap the master URL for "local[2]".
config = (
    SparkConf()
    .setMaster("spark://192.168.11.71:7077")
    .setAppName("SparkDataFrameHdfs")
    .set("spark.executor.memory", "4g")
    .set("spark.executor.cores", 4)
    .set("spark.cores.max", 4)
    .set("spark.driver.memory", "4g")
)

# Session entry point used by every later cell; built from the config above.
spark = SparkSession.builder.config(conf=config).getOrCreate()

In [3]:
from pyspark.sql.types import StructType, LongType, StringType, IntegerType, DoubleType

# Explicit schemas applied when the CSV files are read.
# Each chain is parenthesised instead of using backslash continuations, so no
# chain ends on a dangling "\" that could swallow whatever line comes next.

# Columns of movies.csv; every column is nullable.
movieSchema = (
    StructType()
    .add("movieId", IntegerType(), True)
    .add("title", StringType(), True)
    .add("genres", StringType(), True)
)

# Columns of ratings.csv; every column is nullable.
# timestamp is read as a plain string, not converted to a numeric or date type.
ratingSchema = (
    StructType()
    .add("userId", IntegerType(), True)
    .add("movieId", IntegerType(), True)
    .add("rating", DoubleType(), True)
    .add("timestamp", StringType(), True)
)

In [4]:
# Read the movies and ratings CSV files from HDFS.
# header=True: the first line of each file holds column names, not data.
# The explicit schemas fix the column types instead of leaving them to the reader.
movieDf = spark.read.load(
    "hdfs://192.168.93.128:9000/ml-latest-small/movies.csv",
    format="csv",
    schema=movieSchema,
    header=True,
)

ratingDf = spark.read.load(
    "hdfs://192.168.93.128:9000/ml-latest-small/ratings.csv",
    format="csv",
    schema=ratingSchema,
    header=True,
)

In [5]:
# Print the first two rows of each DataFrame to check that the files
# were read with the expected columns.
movieDf.show(2)
ratingDf.show(2)

+-------+----------------+--------------------+
|movieId|           title|              genres|
+-------+----------------+--------------------+
|      1|Toy Story (1995)|Adventure|Animati...|
|      2|  Jumanji (1995)|Adventure|Childre...|
+-------+----------------+--------------------+
only showing top 2 rows

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
+------+-------+------+---------+
only showing top 2 rows



In [8]:
# Spark SQL
# To query a DataFrame with SQL, it must first be registered as a view.
# "movies" and "ratings" are temp views: they are created within this
# application's Spark session. createOrReplace means re-running this cell
# simply replaces the existing views.
movieDf.createOrReplaceTempView ("movies")
ratingDf.createOrReplaceTempView("ratings")

In [14]:
# The same operation can be written in two flavours. Illustration only
# (there is no "Quality" column in this data set):
#   df.filter("Quality is NOT NULL")          # SQL flavour
#   df.filter(col("Quality").isNotNull())     # Python (Column API) flavour
# Below, the SQL flavour: the result of spark.sql is kept in df and then
# inspected with show and printSchema.
df = spark.sql("select * from movies")
df.show(2)
df.printSchema()
# DataFrame-API counterpart of the query above:
#df.select("*").show(2)



+-------+----------------+--------------------+
|movieId|           title|              genres|
+-------+----------------+--------------------+
|      1|Toy Story (1995)|Adventure|Animati...|
|      2|  Jumanji (1995)|Adventure|Childre...|
+-------+----------------+--------------------+
only showing top 2 rows

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+-------+----------------+--------------------+
|movieId|           title|              genres|
+-------+----------------+--------------------+
|      1|Toy Story (1995)|Adventure|Animati...|
|      2|  Jumanji (1995)|Adventure|Childre...|
+-------+----------------+--------------------+
only showing top 2 rows



In [17]:
# All columns of the ratings view.
spark.sql("select * from ratings").show(3)
# Only the two named columns.
spark.sql("select movieId, rating from ratings").show(3)
# "as id" renames movieId. The unaliased expression gets a generated column
# name, shown as "(rating + CAST(0.3 AS DOUBLE))" in the output.
spark.sql("select movieId as id, rating + .3 from ratings").show(3)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
+------+-------+------+---------+
only showing top 3 rows

+-------+------+
|movieId|rating|
+-------+------+
|      1|   4.0|
|      3|   4.0|
|      6|   4.0|
+-------+------+
only showing top 3 rows

+---+------------------------------+
| id|(rating + CAST(0.3 AS DOUBLE))|
+---+------------------------------+
|  1|                           4.3|
|  3|                           4.3|
|  6|                           4.3|
+---+------------------------------+
only showing top 3 rows



In [19]:
# Same query as before, written as a multi-line string, with the computed
# column aliased back to "rating" so the output header stays readable.
spark.sql("""select 
                    movieId as id, 
                    rating + .3 as rating 
                    from ratings
          """).show(3)

+---+------+
| id|rating|
+---+------+
|  1|   4.3|
|  3|   4.3|
|  6|   4.3|
+---+------+
only showing top 3 rows



In [21]:
# Baseline for the string-function cells that follow: the unmodified title column.
spark.sql("select movieId, title from movies").show(2)

+-------+----------------+
|movieId|           title|
+-------+----------------+
|      1|Toy Story (1995)|
|      2|  Jumanji (1995)|
+-------+----------------+
only showing top 2 rows



In [23]:
# upper(): convert the title to upper case; the alias keeps the column name "title".
spark.sql("select movieId, upper(title) as title from movies").show(2)

+-------+----------------+
|movieId|           title|
+-------+----------------+
|      1|TOY STORY (1995)|
|      2|  JUMANJI (1995)|
+-------+----------------+
only showing top 2 rows



In [24]:
# lower(): convert the title to lower case; the alias keeps the column name "title".
spark.sql("select movieId, lower(title) as title from movies").show(2)

+-------+----------------+
|movieId|           title|
+-------+----------------+
|      1|toy story (1995)|
|      2|  jumanji (1995)|
+-------+----------------+
only showing top 2 rows



In [25]:
# Functions can be nested: lower-case the title first, then initcap() capitalises
# the first letter of each word (e.g. "toy story (1995)" -> "Toy Story (1995)").
spark.sql("select movieId, initcap(lower(title)) as title from movies").show(2)

+-------+----------------+
|movieId|           title|
+-------+----------------+
|      1|Toy Story (1995)|
|      2|  Jumanji (1995)|
+-------+----------------+
only showing top 2 rows



In [35]:
# substr(str, pos, len): take len characters starting at position pos.
# Positions are 1-based: substr("Toy Story (1995)", 5, 3) gives "Sto".
# NOTE(review): the substring is aliased "title", so the result has two columns
# with the same name; a distinct alias would make the output unambiguous.
spark.sql("select movieId, title, substr(title,5, 3) as title from movies").show(2)

+-------+----------------+-----+
|movieId|           title|title|
+-------+----------------+-----+
|      1|Toy Story (1995)|  Sto|
|      2|  Jumanji (1995)|  nji|
+-------+----------------+-----+
only showing top 2 rows



In [33]:
# left(str, n)  - pick the first n characters from the left
# right(str, n) - pick the last n characters from the right
spark.sql("select movieId, left(title,4) as title from movies").show(2)
spark.sql("select movieId, right(title,4) as title from movies").show(2)

+-------+-----+
|movieId|title|
+-------+-----+
|      1| Toy |
|      2| Juma|
+-------+-----+
only showing top 2 rows

+-------+-----+
|movieId|title|
+-------+-----+
|      1| 995)|
|      2| 995)|
+-------+-----+
only showing top 2 rows



In [37]:
# Number of ratings per movie. The query has no ORDER BY, so which two groups
# show(2) prints is not fixed by the query itself.
spark.sql("select movieId, count(userId) from ratings group by movieId").show(2)

+-------+-------------+
|movieId|count(userId)|
+-------+-------------+
|   1580|          165|
|   2366|           25|
+-------+-------------+
only showing top 2 rows



In [39]:
# Keep only movies with at least 100 ratings. HAVING filters the groups after
# aggregation and refers to the total_ratings alias from the select list.
# show(100) prints up to 100 rows.
spark.sql("""select movieId, count(userId) as total_ratings from ratings 
                group by movieId 
                having total_ratings >= 100 """).show(100)

+-------+-------------+
|movieId|total_ratings|
+-------+-------------+
|   1580|          165|
|   1721|          140|
|    858|          192|
|   1270|          171|
|   1265|          143|
|    588|          183|
|    296|          307|
|  68954|          105|
|  58559|          149|
|    593|          279|
|    597|          135|
|   1198|          200|
|     34|          128|
|   8961|          125|
|   4896|          107|
|   2683|          121|
|    587|          115|
|   1210|          196|
|    253|          109|
|    329|          108|
|    223|          104|
|   1208|          107|
|   1291|          140|
|    924|          109|
|    367|          157|
|    539|          106|
|   1197|          142|
|   2628|          140|
|    111|          104|
|     47|          203|
|  33794|          116|
|   1206|          120|
|    185|          112|
|   7361|          131|
|    912|          100|
|   2329|          129|
|   4973|          120|
|      1|          215|
|   3147|       

In [40]:
# Two aggregates per movie (rating count and average rating), filtered on both:
# at least 100 ratings and an average of 4 or higher.
spark.sql("""select movieId, count(userId) as total_ratings, avg(rating) as avg_rating from ratings 
                group by movieId 
                having total_ratings >= 100 and avg_rating >= 4 """).show(100)

+-------+-------------+------------------+
|movieId|total_ratings|        avg_rating|
+-------+-------------+------------------+
|    858|          192|         4.2890625|
|   1270|          171| 4.038011695906433|
|    296|          307| 4.197068403908795|
|  68954|          105| 4.004761904761905|
|  58559|          149| 4.238255033557047|
|    593|          279| 4.161290322580645|
|   1198|          200|            4.2075|
|   1210|          196| 4.137755102040816|
|   1208|          107| 4.219626168224299|
|   1291|          140| 4.046428571428572|
|   1197|          142| 4.232394366197183|
|    111|          104| 4.105769230769231|
|   7361|          131|4.1603053435114505|
|    912|          100|              4.24|
|   2329|          129| 4.217054263565892|
|   4973|          120| 4.183333333333334|
|   3147|          111| 4.148648648648648|
|   1089|          131| 4.202290076335878|
|   4993|          198| 4.106060606060606|
|   4995|          123|               4.0|
|  79132|  

In [44]:
# "group by 1" groups by position: 1 refers to the first column of the
# select list, counted from the left, which is movieId here.
spark.sql("""select movieId, count(userId) as total_ratings, avg(rating) as avg_rating from ratings 
                group by 1 
                having total_ratings >= 100 and avg_rating >= 4 """).show(2)

+-------+-------------+-----------------+
|movieId|total_ratings|       avg_rating|
+-------+-------------+-----------------+
|    858|          192|        4.2890625|
|   1270|          171|4.038011695906433|
+-------+-------------+-----------------+
only showing top 2 rows

