In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

In [2]:
# Create (or reuse) the Spark session that every read below goes through.
spark = (
    SparkSession.builder
    .appName("Data Wrangling")
    .getOrCreate()
)

In [3]:
# Move the kernel into the dataset folder and list its contents.
# NOTE(review): the reads in the next cell still use "movielens/..." paths;
# presumably those are resolved by Spark rather than against the kernel's new
# working directory -- confirm. Re-running this cell will fail, because the
# listing shows no nested "movielens" folder to cd into.
%cd movielens
%ls

E:\Business Intelligence\PySpark 3 - Practice\movielens
 Volume in drive E is Study
 Volume Serial Number is 0D72-0390

 Directory of E:\Business Intelligence\PySpark 3 - Practice\movielens

10/04/2020  12:48 AM    <DIR>          .
10/04/2020  12:48 AM    <DIR>          ..
09/26/2018  03:50 PM           197,979 links.csv
09/26/2018  03:49 PM           494,431 movies.csv
09/26/2018  03:49 PM         2,483,723 ratings.csv
09/26/2018  03:50 PM             8,342 README.txt
09/26/2018  03:49 PM           118,660 tags.csv
               5 File(s)      3,303,135 bytes
               2 Dir(s)  61,938,475,008 bytes free


In [4]:
# Load the user ratings and turn the epoch-seconds column into a proper
# timestamp column.
ratings = (
    spark.read.csv(
        path="movielens/ratings.csv",
        sep=",",
        quote='"',
        header=True,
        schema="userId INT, movieId INT, rating DOUBLE, timestamp INT",
    )
    # Cast the epoch seconds straight to TIMESTAMP. Going through
    # from_unixtime() first renders a zone-less local-time string, which
    # cannot be parsed back unambiguously for the repeated hour at the end
    # of daylight saving time.
    .withColumn("timestamp", f.col("timestamp").cast("timestamp"))
)
# Movie catalogue with movieID, title and the raw genres string.
movies = (
    spark.read
    .schema("movieID INT, title STRING, genres STRING")
    .options(sep=",", quote='"', header=True)
    .csv("movielens/movies.csv")
)

# One row per (movie, genre): split the pipe-delimited genres string and
# explode the resulting array.
movie_genre = (
    movies
    # Raw string: the pattern is a regex, and "\|" in a plain literal is an
    # invalid escape sequence that newer Python versions warn about.
    .withColumn("genre", f.explode(f.split("genres", r"\|")))
    .select("movieID", "title", "genre")
)

# Every distinct genre label that occurs in the exploded table.
available_genre = movie_genre.select("genre").dropDuplicates()
# Rows whose genre is the "(no genres listed)" placeholder.
movies_with_no_genre = movie_genre.filter(
    movie_genre["genre"] == "(no genres listed)"
)

# Cross-reference table from movieId to the imdbId / tmdbId columns.
# imdbId is read as a string, the other two columns as integers.
links = (
    spark.read
    .format("csv")
    .options(sep=",", quote='"', header=True)
    .schema("movieId INT, imdbId STRING, tmdbId INT")
    .load("movielens/links.csv")
)

# Load the user-supplied tags and turn the epoch-seconds column into a proper
# timestamp column.
tags = (
    spark.read.csv(
        path="movielens/tags.csv",
        sep=",",
        quote='"',
        header=True,
        schema="userId INT, movieId INT, tag STRING, timestamp INT",
    )
    # Cast the epoch seconds straight to TIMESTAMP. Going through
    # from_unixtime() first renders a zone-less local-time string, which
    # cannot be parsed back unambiguously for the repeated hour at the end
    # of daylight saving time.
    .withColumn("timestamp", f.col("timestamp").cast("timestamp"))
)

# Attach tags to every movie. The left join keeps movies that have no tag
# rows, and the tag's timestamp is exposed as timestamp_tag.
opinions = (
    movies
    .join(tags, on="movieId", how="left")
    .select(
        "userId",
        "movieID",
        "title",
        "tag",
        f.col("timestamp").alias("timestamp_tag"),
    )
)

# Combine each tag row with the rating that has the same movie and user.
# The inner join drops tag rows that have no matching rating.
opinions_ext = (
    opinions
    .join(ratings, on=["movieID", "userId"], how="inner")
    .select(
        [
            "userId",
            "movieID",
            "title",
            "tag",
            "rating",
            "timestamp_tag",
            "timestamp",
        ]
    )
)