In [0]:
from pyspark.sql import SparkSession

In [0]:
spark = SparkSession.builder.appName('movieAnlytics').master('local').config('spark.ui.port','12443').enableHiveSupport().getOrCreate()

spark

##Analytical Queries solved using RDD

1. what are the top 10 most viewed movies?
2. What are the distinct list of genre available?
3. How many movies for each genre?
4. How many movies are starting with number or letter (Example:starting with 1/2/3../A/B/C..Z)?
5. List the latest released movies



In [0]:
top_10 =  sc.textFile('dbfs:/FileStore/tables/ratings.dat').map(lambda x:x.split('::')).map(lambda x:(x[1],1)).map(lambda x:(int(x[0]),1)).reduceByKey(lambda x,y:x+y).sortBy(lambda x:x[1],ascending=False)

In [0]:
movie_title = sc.textFile('dbfs:/FileStore/tables/movies.dat').map(lambda x:x.split('::')).map(lambda x:(int(x[0]),x[1]))

all_top_movies = movie_title.join(top_10)
all_top_movies.sortBy(lambda x:x[1][1], ascending=False).take(10)

Out[4]: [(2858, ('American Beauty (1999)', 3428)),
 (260, ('Star Wars: Episode IV - A New Hope (1977)', 2991)),
 (1196, ('Star Wars: Episode V - The Empire Strikes Back (1980)', 2990)),
 (1210, ('Star Wars: Episode VI - Return of the Jedi (1983)', 2883)),
 (480, ('Jurassic Park (1993)', 2672)),
 (2028, ('Saving Private Ryan (1998)', 2653)),
 (589, ('Terminator 2: Judgment Day (1991)', 2649)),
 (2571, ('Matrix\t The (1999)', 2590)),
 (1270, ('Back to the Future (1985)', 2583)),
 (593, ('Silence of the Lambs\t The (1991)', 2578))]

What are the distinct list of genres available?

In [0]:
genres = sc.textFile("dbfs:/FileStore/tables/movies.dat").map(lambda x:x.split("::")[2].split("|")).flatMap(lambda x:x)
genre_len = len(set(genres.collect()))
set(genres.collect())

Out[5]: {'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

How Many movies for each genres?

In [0]:
no_of_movies_each_genre = genres.map(lambda x:(x,1)).reduceByKey(lambda x,y:x+y).toDF(schema="Genre string, No int")

In [0]:
display(no_of_movies_each_genre)

Genre,No
Children's,251
Fantasy,68
Romance,471
Drama,1603
Action,503
Thriller,492
Horror,343
Sci-Fi,276
Documentary,127
Musical,114


Databricks visualization. Run in Databricks to view.

In [0]:
import re

pattern_n = re.compile('^\d+')

def movie_name_(str):
    if len(pattern_n.findall(str))>0:
        return ('startsWithDigit',1)
    else:
        return ('startsWithLetter',1)

How many movies are starting with   numbers or letters (Example: Starting with 1/2/3../A/B/C..Z)?

In [0]:
movie_name = sc.textFile('dbfs:/FileStore/tables/movies.dat').map(lambda x:x.split("::")[1]).map(lambda x : movie_name_(x)).reduceByKey(lambda x,y:x+y).toDF()

In [0]:
display(movie_name)

_1,_2
startsWithLetter,3853
startsWithDigit,30


List the latest released movie

In [0]:
latest = sc.textFile("dbfs:/FileStore/tables/movies.dat").map(lambda x:x.split("::")).map(lambda x:x[1]).map(lambda x:(x,int(re.findall('\d+',x.split()[-1])[0])))

display(latest.sortBy(lambda x:x[1],ascending=False).toDF())

_1,_2
Supernova (2000),2000
Down to You (2000),2000
Isn't She Great? (2000),2000
Scream 3 (2000),2000
Gun Shy (2000),2000
Beach The (2000),2000
Snow Day (2000),2000
Tigger Movie The (2000),2000
Trois (2000),2000
Boiler Room (2000),2000


##Using spark sql to do the following tasks
1. create table for movies.dat, users.dat and ratings.dat: Saving Tables from Spark SQL
2. Find the list of oldest released movies
3. How many movies released each year?
4. How many number of movies are there for each rating?
5. How many users have rated each movie?

In [0]:
spark.sql('drop table if exists movies3_')
spark.sql('create database if not exists movies3_')

Out[49]: DataFrame[]

In [0]:
spark.sql("use movies3_")
spark.sql('''
create table ratings  (user_id string, movie_id string, ratings string,time_stamp string)
row format 
      delimited fields terminated by '::'
      stored as textfile
      
''')


spark.sql('''
          create table movies (movie_id string, title string, genre string)
          row format
          delimited fields terminated by '::'
          stored as textfile


 ''')


#we have user_id, gender, age, Occupation, zip_code

spark.sql('''
    create table users (user_id string, gender string, age string, occupation string, zip_code string)
    row format
    delimited fields terminated by '::'
    stored as textfile


''')

Out[50]: DataFrame[]

In [0]:
spark.read.csv("dbfs:/FileStore/tables/movies.dat", sep="::").write.insertInto('movies')
spark.read.csv("dbfs:/FileStore/tables/users.dat", sep="::").write.insertInto('users')
spark.read.csv("dbfs:/FileStore/tables/ratings.dat", sep="::").write.insertInto('ratings')

list of oldest released movies

In [0]:
from pyspark.sql.functions import pandas_udf, PandasUDFType,udf,to_date,col,date_format
@udf('string')
def getting_Year(str1):
    return str1.split()[-1]


moviesM = spark.read.table("movies")
print(moviesM.count())

display(moviesM.select(date_format(to_date(getting_Year(col('title')),'(yyyy)'),'yyyy').alias('year'),'title').filter(col('year') > 1).orderBy(col('year')))

3883


year,title
1919,Male and Female (1919)
1919,Daddy Long Legs (1919)
1920,Saphead The (1920)
1920,Dog's Life A (1920)
1921,Kid The (1921)
1922,Nosferatu (Nosferatu eine Symphonie des Grauens) (1922)
1922,Tess of the Storm Country (1922)
1923,Always Tell Your Wife (1923)
1923,Three Ages The (1923)
1923,Woman of Paris A (1923)


How many movies are relesed each year

In [0]:
display(moviesM.select(date_format(to_date(getting_Year(col('title')),'(yyyy)'),'yyyy').alias('year')).groupBy(col('year')).count())

year,count
1953.0,14
1957.0,20
1987.0,64
1956.0,18
1936.0,8
1958.0,22
1943.0,10
1972.0,22
1931.0,7
1988.0,58


In [0]:
ratings = spark.read.csv('dbfs:/FileStore/tables/ratings.dat',sep='::').toDF('user_id','movie_id','ratings','timestamp')
ratings.createOrReplaceTempView('ratings_')
spark.sql('select count(avg_ratings) as no_of_ratings,avg_ratings from (select round(avg(ratings)) as avg_ratings, movie_id from ratings_ group by movie_id) group by avg_ratings').show()

+-------------+-----------+
|no_of_ratings|avg_ratings|
+-------------+-----------+
|           43|        1.0|
|         1445|        4.0|
|         1730|        3.0|
|          459|        2.0|
|           29|        5.0|
+-------------+-----------+



no. of ratings for each movie

In [0]:
movies = spark.read.csv('dbfs:/FileStore/tables/movies.dat',sep='::').toDF('movie_id','title','genre')
movies.createOrReplaceTempView('movies_')
display(spark.sql('''
select b.title, a.no_of_ratings from (select movie_id, count(movie_id) as no_of_ratings from ratings_ group by movie_id) as a left join movies_ as b on a.movie_id  = b.movie_id
'''))

title,no_of_ratings
Antz (1998),645
Platoon (1986),1143
Pulp Fiction (1994),2171
Nutty Professor The (1963),222
Fast Times at Ridgemont High (1982),886
Live Nude Girls (1995),54
Popeye (1980),471
Mrs. Winterbourne (1996),121
Joe's Apartment (1996),131
NeverEnding Story II: The Next Chapter The (1990),177


##Spark Data frame 
1. Prepare Movies data: Extracting the year and genre from the text
2. Prepare users data: Loadig a double delimited csv file
3. Prepare Rating data: Programmatically specifying a schema for the data frame 
4. Import Data from URL:Scala
5. Save table without defining DDL in hive
6. Broadcast Variable example
7. Accumulator example
 

In [0]:
movies1 = spark.read.csv('dbfs:/FileStore/tables/movies.dat',sep='::').toDF('movie_id','title','genre')
from pyspark.sql.functions import array
@udf
def get_array(str1):
    return str1.split('|')
display(movies1.select(date_format(to_date(getting_Year(col('title')),'(yyyy)'),'yyyy').alias('year'),col('title').alias('title'),'movie_id',get_array(col('genre')).alias('genre')))

year,title,movie_id,genre
1995.0,Toy Story (1995),1,"[Animation, Children's, Comedy]"
1995.0,Jumanji (1995),2,"[Adventure, Children's, Fantasy]"
1995.0,Grumpier Old Men (1995),3,"[Comedy, Romance]"
1995.0,Waiting to Exhale (1995),4,"[Comedy, Drama]"
1995.0,Father of the Bride Part II (1995),5,[Comedy]
1995.0,Heat (1995),6,"[Action, Crime, Thriller]"
1995.0,Sabrina (1995),7,"[Comedy, Romance]"
1995.0,Tom and Huck (1995),8,"[Adventure, Children's]"
1995.0,Sudden Death (1995),9,[Action]
1995.0,GoldenEye (1995),10,"[Action, Adventure, Thriller]"


Prepare users data: Loadig a double delimited csv file

In [0]:
from pyspark.sql.types import StructType,StructField,IntegerType,StringType

#user_id,gender, age,occupation,zip
schema = StructType([StructField('user_id',IntegerType(),True),StructField('gender',StringType(),True),StructField('age',IntegerType(),True),StructField('occupation',IntegerType(),True),StructField('zip_code',IntegerType(),True)])
users1 = spark.read.csv('dbfs:/FileStore/tables/users.dat',sep="::",schema=schema)
display(users1)

user_id,gender,age,occupation,zip_code
1,F,1,10,48067.0
2,M,56,16,70072.0
3,M,25,15,55117.0
4,M,45,7,2460.0
5,M,25,20,55455.0
6,F,50,9,55117.0
7,M,35,1,6810.0
8,M,25,12,11413.0
9,M,25,17,61614.0
10,F,35,1,95370.0


Prepare Rating data: Programmatically specifying a schema for the data frame

In [0]:
ratings_schema = StructType([StructField('user_id',IntegerType(),True),StructField('movie_id',IntegerType(),True),StructField('ratings',IntegerType(),True),StructField('timestamp',StringType(),True)])

#user_id, movie_id,ratings,timestamp

In [0]:
ratings_ = spark.read.csv('dbfs:/FileStore/tables/ratings.dat',sep="::",schema=ratings_schema)
#import data from url
ratings_.createOrReplaceTempView('ratings_1')

loading data from url

In [0]:
url = "https://raw.githubusercontent.com/Thomas-George-T/Movies-Analytics-in-Spark-and-Scala/master/Movielens/users.dat"

from pyspark import SparkFiles
spark.sparkContext.addFile(url)

df = spark.read.csv("file://"+SparkFiles.get("users.dat"),sep='::').show()

+---+---+---+---+-----+
|_c0|_c1|_c2|_c3|  _c4|
+---+---+---+---+-----+
|  1|  F|  1| 10|48067|
|  2|  M| 56| 16|70072|
|  3|  M| 25| 15|55117|
|  4|  M| 45|  7|02460|
|  5|  M| 25| 20|55455|
|  6|  F| 50|  9|55117|
|  7|  M| 35|  1|06810|
|  8|  M| 25| 12|11413|
|  9|  M| 25| 17|61614|
| 10|  F| 35|  1|95370|
| 11|  F| 25|  1|04093|
| 12|  M| 25| 12|32793|
| 13|  M| 45|  1|93304|
| 14|  M| 35|  0|60126|
| 15|  M| 25|  7|22903|
| 16|  F| 35|  0|20670|
| 17|  M| 50|  1|95350|
| 18|  F| 18|  3|95825|
| 19|  M|  1| 10|48073|
| 20|  M| 25| 14|55113|
+---+---+---+---+-----+
only showing top 20 rows



We have our ratings parquet
1. let list all the tables and current database
2. spark.catalog.createTable(tableName='ratings_parquet',path='/user/itv003220/ratings_parquet'schema=schema,source='parquet')


In [0]:
spark.catalog.currentDatabase()

Out[61]: 'movies3_'

In [0]:
spark.catalog.listTables()

Out[62]: [Table(name='movies', catalog='spark_catalog', namespace=['movies3_'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='ratings', catalog='spark_catalog', namespace=['movies3_'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='users', catalog='spark_catalog', namespace=['movies3_'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='movies_', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='ratings_', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='ratings_1', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

In [0]:
spark.sql("drop table if exists ddl_table")
spark.catalog.createTable(tableName='ddl_table',schema=ratings_schema,source='parquet')

Out[63]: DataFrame[user_id: int, movie_id: int, ratings: int, timestamp: string]

In [0]:
spark.catalog.listTables()

#we created table in spark without using hive ddl

Out[64]: [Table(name='ddl_table', catalog='spark_catalog', namespace=['movies3_'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='movies', catalog='spark_catalog', namespace=['movies3_'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='ratings', catalog='spark_catalog', namespace=['movies3_'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='users', catalog='spark_catalog', namespace=['movies3_'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='movies_', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='ratings_', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='ratings_1', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

In [0]:
spark.sql("select * from ddl_table")

Out[65]: DataFrame[user_id: int, movie_id: int, ratings: int, timestamp: string]

In [0]:
ratings_.write.insertInto('ddl_table')
spark.sql("select * from ddl_table").show()

+-------+--------+-------+---------+
|user_id|movie_id|ratings|timestamp|
+-------+--------+-------+---------+
|      1|    1193|      5|978300760|
|      1|     661|      3|978302109|
|      1|     914|      3|978301968|
|      1|    3408|      4|978300275|
|      1|    2355|      5|978824291|
|      1|    1197|      3|978302268|
|      1|    1287|      5|978302039|
|      1|    2804|      5|978300719|
|      1|     594|      4|978302268|
|      1|     919|      4|978301368|
|      1|     595|      5|978824268|
|      1|     938|      4|978301752|
|      1|    2398|      4|978302281|
|      1|    2918|      4|978302124|
|      1|    1035|      5|978301753|
|      1|    2791|      4|978302188|
|      1|    2687|      3|978824268|
|      1|    2018|      4|978301777|
|      1|    3105|      5|978301713|
|      1|    2797|      4|978302039|
+-------+--------+-------+---------+
only showing top 20 rows



##Example of broadcast variable 
##let see the occupation mapping in read.me in ml-1m
##we will use the occupation mapping to explain broadcast variables

In [0]:
dic = {
    0: "other",
    1: "academic/educator",
    2: "artist",
    3: "clearical/admin",
    4: "collegue/Hrad Student",
    5: "customer Service",
    6: "doctor/healthcare",
    7: "executive/managerial",
    8: "farmer",
    9: "homemaker",
    10: "K-12 Student",
    11: "lawyer",
    12: "programmer",
    13: "retired",
    14: "sales/marketing",
    15: "scientist",
    16: "self-employeed",
    17: "technician/engineer",
    18: "tradesman/craftman",
    19: "unemployeed",
    20: "writer"   
    
}

In [0]:
broad_cast = sc.broadcast(dic)

In [0]:
@udf
def get_occu(int1):
    return broad_cast.value[int1]
    

In [0]:
users_ = spark.read.csv("dbfs:/FileStore/tables/users.dat", sep="::",schema=schema)
users_accu = users_.select(get_occu(col("occupation")).alias('actual_occupation'),col("user_id"))

users_accu.show()

#we use borad cast var s to map occupation keys to actual strings as in read.me avaialable

+--------------------+-------+
|   actual_occupation|user_id|
+--------------------+-------+
|        K-12 Student|      1|
|      self-employeed|      2|
|           scientist|      3|
|executive/managerial|      4|
|              writer|      5|
|           homemaker|      6|
|   academic/educator|      7|
|          programmer|      8|
| technician/engineer|      9|
|   academic/educator|     10|
|   academic/educator|     11|
|          programmer|     12|
|   academic/educator|     13|
|               other|     14|
|executive/managerial|     15|
|               other|     16|
|   academic/educator|     17|
|     clearical/admin|     18|
|        K-12 Student|     19|
|     sales/marketing|     20|
+--------------------+-------+
only showing top 20 rows



In [0]:
acc = sc.accumulator(0)
#Accumulators are the shared variables across the cluster
#lets how many programmers are there in occupation with the help of accumulator
#getting no. of partitions in accumulator

In [0]:
users_accu.printSchema()

root
 |-- actual_occupation: string (nullable = true)
 |-- user_id: integer (nullable = true)



In [0]:
users_accu.rdd.glom().foreach(lambda x:acc.add(1))

In [0]:
acc.value
accu_pro = sc.accumulator(0)
#here is another example

In [0]:
@udf
def getting_progr(str1):
    if str1=="programmer":
        accu_pro.add(1)
        return True
    else:
        return False
    
    

In [0]:
users_accu.filter(getting_progr(col('actual_occupation'))==True).show()

+-----------------+-------+
|actual_occupation|user_id|
+-----------------+-------+
|       programmer|      8|
|       programmer|     12|
|       programmer|     43|
|       programmer|     49|
|       programmer|     55|
|       programmer|     65|
|       programmer|    104|
|       programmer|    105|
|       programmer|    108|
|       programmer|    113|
|       programmer|    155|
|       programmer|    180|
|       programmer|    195|
|       programmer|    198|
|       programmer|    205|
|       programmer|    207|
|       programmer|    220|
|       programmer|    252|
|       programmer|    267|
|       programmer|    268|
+-----------------+-------+
only showing top 20 rows



In [0]:
accu_pro.value

#we are going to verify wheather if our accumul value is correct or not by running below command and checking 
#acc_var counts total number of users who are programmers in occupation

Out[77]: 0

In [0]:
users_accu.filter(col('actual_occupation') == 'programmer').count()

#lets verify it

Out[78]: 388

In [0]:
#thankyou
#lets do it by cmd using cmd /shell
