# [databricks](https://community.cloud.databricks.com/)


In [None]:
"""
DBFS - Databricks File System
An abstraction over HDFS, S3, and Azure Blob Storage.

Create a mount over an S3 bucket, then access its files via dbfs:/mnt/bucketname.
Accessing dbfs:/mnt/bucketname links through to the S3 bucket.

Results can be written back to S3 locations as CSV/JSON/Parquet.
"""

In [None]:
"""
CAN BE MOVED TO A SEPARATE FILE (mount-s3.ipynb)
"""

# S3 bucket that gets mounted into DBFS by the mount cell further down.
bucket_name = "bond-s3-forspark"

# AWS credentials used to build the s3a:// mount URI.
# These are placeholders: replace them locally and never commit real keys.
access_key, secret_key = "YOUR_ACCESS_KEY", "YOUR_SECRET_KEY"

In [None]:
# Print the built-in help text for the dbutils object (class docs, methods, handler attributes).
help(dbutils)

```python
Help on DBUtils in module dbutils object:

class DBUtils(builtins.object)
 |  DBUtils(py_shell, entry_point)
 |  
 |  This class provides dbutils functionality for python notebooks, just like dbutils_v1.scala does
 |  it for Scala. For each of the calls here, we do two things: check whether the passed types are
 |  correct, and if so make a corresponding call to FSUtils object in Scala. For ls and mounts we do
 |  one extra thing - instead of returning result directly, we create a PythonSchemaSeq from it
 |  first. This is done to enable further operations with the result (e.g. call display function
 |  on it, or perform list operations on it)
 |  
 |  Methods defined here:
 |  
 |  __call__(self)
 |      Call self as a function.
 |  
 |  __getattr__(self, item)
 |  
 |  __getstate__(self)
 |  
 |  __init__(self, py_shell, entry_point)
 |      :param py_shell: the PythonShell object
 |      :return:
 |  
 |  __repr__(self)
 |      Return repr(self).
 |  
 |  help(self, method_name='')
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  CredentialsHandler = <class 'dbutils.DBUtils.CredentialsHandler'>
 |  
 |  
 |  FSHandler = <class 'dbutils.DBUtils.FSHandler'>
 |  
 |  
 |  LibraryHandler = <class 'dbutils.DBUtils.LibraryHandler'>
 |  
 |  
 |  NotebookHandler = <class 'dbutils.DBUtils.NotebookHandler'>
 |  
 |  
 |  PreviewHandler = <class 'dbutils.DBUtils.PreviewHandler'>
 |      NoOp Preview Handler which is necessary at this moment, as we want Secret module to be under
 |      preview module. i.e. dbutils.preview.secret
 |      
 |      TODO(kevin) After secret module moves out of preview phase, remove this Hanlder and make
 |                  other corresponding changes to directly call secret module.
 |  
 |  SecretsHandler = <class 'dbutils.DBUtils.SecretsHandler'>

```

In [None]:
"""
CAN BE MOVED TO A SEPARATE FILE (mount-s3.ipynb)
"""

from urllib.parse import quote

# Percent-encode the whole secret key so that every URI-reserved character
# (e.g. "/" and "+"), not only "/", is safe inside the s3a:// URI built below.
encoded_secret_key = quote(secret_key, safe="")
aws_bucket_name = bucket_name
mount_name = bucket_name
mount_point = "/mnt/%s" % mount_name

# Mounting creates a shortcut /mnt/<name> in DBFS that points at the S3 bucket, so the
# S3 files are reachable either as /mnt/<name> or as dbfs:/mnt/<name>.
# Re-run this cell whenever the mount is missing (e.g. after the cluster was terminated).
# The guard makes the cell safe to re-run: mounting a path that is already mounted
# raises an error, so the mount is only created when it is not listed yet.
if not any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
    dbutils.fs.mount(
        "s3a://%s:%s@%s" % (access_key, encoded_secret_key, aws_bucket_name),
        mount_point,
    )

In [None]:
# List the top level of the mounted bucket, addressed by its plain /mnt path.
mount_root = "/mnt/%s" % mount_name
root_entries = dbutils.fs.ls(mount_root)
display(root_entries)

```bash
dbfs:/mnt/bond-s3-forspark/movielens/
movielens/
0
```

In [None]:
" Same as above "
# Same listing as the previous cell, this time addressing the mount with the dbfs: scheme.
dbfs_mount_root = "dbfs:///mnt/%s" % mount_name
dbfs_root_entries = dbutils.fs.ls(dbfs_mount_root)
display(dbfs_root_entries)

In [None]:
# List the contents of the movielens folder inside the mounted bucket.
movielens_dir = "dbfs:///mnt/%s/movielens" % mount_name
movielens_entries = dbutils.fs.ls(movielens_dir)
display(movielens_entries)

In [None]:
from pyspark.sql.types import StructType, LongType, StringType, IntegerType, DoubleType

# Explicit schema for movies.csv; every column is declared nullable.
movieSchema = StructType()
for field_name, field_type in (
    ("movieId", IntegerType()),
    ("title", StringType()),
    ("genres", StringType()),
):
    # StructType.add appends the field to the schema in place.
    movieSchema.add(field_name, field_type, True)

# Explicit schema for ratings.csv; every column is declared nullable.
# NOTE(review): "timestamp" is read as a string while LongType is imported but unused;
# confirm whether a numeric type was intended.
ratingSchema = StructType()
for field_name, field_type in (
    ("userId", IntegerType()),
    ("movieId", IntegerType()),
    ("rating", DoubleType()),
    ("timestamp", StringType()),
):
    ratingSchema.add(field_name, field_type, True)

In [None]:
# Locations of the MovieLens CSV files inside the mounted bucket.
moviePath = "dbfs:///mnt/%s/movielens/movies.csv" % mount_name
ratingPath = "dbfs:///mnt/%s/movielens/ratings.csv" % mount_name

# Both files are read with their first line treated as a header and with the
# explicit schemas defined earlier, so no schema inference is needed.
movieDf = (
    spark.read.format("csv")
    .option("header", True)
    .schema(movieSchema)
    .load(moviePath)
)

# Fixed: a stray "b" after .load(ratingPath) made this cell a syntax error.
ratingDf = (
    spark.read.format("csv")
    .option("header", True)
    .schema(ratingSchema)
    .load(ratingPath)
)

In [None]:
# Quick sanity check: print the first two rows of each loaded DataFrame.
for frame in (movieDf, ratingDf):
    frame.show(2)

In [None]:
from pyspark.sql.functions import col, desc, avg, count

# Per-movie statistics: number of ratings received and the average rating.
rating_stats = ratingDf.groupBy("movieId").agg(
    count("userId").alias("total_ratings"),
    avg("rating").alias("avg_rating"),
)

# Popularity is ranked by how many users rated a movie, not by its average rating.
# A movie only qualifies with at least 100 ratings and an average rating of 3 or more.
is_popular = (col("total_ratings") >= 100) & (col("avg_rating") >= 3)

mostPopularDf = rating_stats.filter(is_popular).sort(desc("total_ratings"))

mostPopularDf.show(10)

In [None]:
# Join the popular-movie statistics with movieDf to pick up each movie's title.
# A join does not guarantee that the earlier sort on total_ratings survives, so the
# result is ordered again. Otherwise show(10), and any output written from this
# DataFrame, would not reliably list the most-rated movies first.
mostPopularMoviesDf = (
    mostPopularDf
    .join(movieDf, movieDf.movieId == mostPopularDf.movieId)
    # movieId exists on both sides of the join; take it from mostPopularDf explicitly.
    .select(mostPopularDf.movieId, "title", "total_ratings", "avg_rating")
    .orderBy("total_ratings", ascending=False)
)
mostPopularMoviesDf.show(10)

In [None]:
# Reduce the result to a single partition before it is written out by the cells below.
cachedDf = mostPopularMoviesDf.coalesce(1)
# Mark the DataFrame for caching; it is reused by the CSV, JSON and Parquet writes that follow.
cachedDf.cache()

In [None]:
# Save the popular-movies result to the mounted bucket as CSV, replacing any earlier output.
popular_movies_path = "dbfs:///mnt/%s/movielens-results/csv/popular-movies.csv" % mount_name
csv_writer = cachedDf.write.mode('overwrite')
csv_writer.csv(popular_movies_path)

In [None]:
# Save the popular-movies result to the mounted bucket as JSON, replacing any earlier output.
popular_movies_path = "dbfs:///mnt/%s/movielens-results/json/popular-movies.json" % mount_name
json_writer = cachedDf.write.mode('overwrite')
json_writer.json(popular_movies_path)

In [None]:
# Save the popular-movies result to the mounted bucket as Parquet, replacing any earlier output.
popular_movies_path = "dbfs:///mnt/%s/movielens-results/parquet/popular-movies.parquet" % mount_name
parquet_writer = cachedDf.write.mode('overwrite')
parquet_writer.parquet(popular_movies_path)

In [None]:
# Not working yet - possibly because the XML writer format (com.databricks.spark.xml) is not available on this cluster.
"""
popular_movies_path = "dbfs:///mnt/%s/movielens-results/xml/popular-movies.xml" % mount_name
cachedDf.write.mode('overwrite')\
              .format("com.databricks.spark.xml")\
              .option("rootTag", "movies")\
              .option("rowTag", "movie")\
              .save(popular_movies_path)
"""