In [None]:
%scala
// Azure Blob Storage coordinates of the dataset.
val storage_container = "data"
val storage_account_name = "<NAME>"
// NOTE(review): placeholder value; a real key should presumably be pulled from a
// secret store rather than written into the notebook source.
val storage_account_access_key = "<KEY>"

// Root wasbs:// URL of the container; the input globs below are built from it.
val storage_base_url = {
  val blobHost = s"${storage_account_name}.blob.core.windows.net"
  s"wasbs://${storage_container}@${blobHost}"
}

// Extension of the raw input files.
val file_type = "txt"

// Glob patterns selecting the raw files under data/train and data/test.
val (train_data_url, test_data_url) = {
  def rawFilesGlob(subset: String): String =
    s"${storage_base_url}/data/${subset}/**/*.${file_type}"
  (rawFilesGlob("train"), rawFilesGlob("test"))
}

In [None]:
%scala
import org.apache.spark.sql.functions.{col, when, split, size, lower, regexp_replace, input_file_name}
import org.apache.spark.sql.SparkSession

// Session used by every cell below. The storage account key is registered under the
// account-specific `fs.azure.account.key.*` setting (presumably what lets the
// wasbs:// paths be read and written).
val spark = {
  val accountKeySetting = s"fs.azure.account.key.${storage_account_name}.blob.core.windows.net"
  SparkSession
    .builder()
    .appName("Pre-Processing-1")
    .config(accountKeySetting, storage_account_access_key)
    .getOrCreate()
}
// SparkContext backing the session.
val sc = spark.sparkContext

In [None]:
%scala
import org.apache.spark.sql.DataFrame

/**
 * Reads the text files matched by `dataUrl` (one row per line) and derives the
 * columns of the output from each line and from the path of the file it came from.
 *
 * The last path segment is expected to look like `<id>_<rating>.<ext>` and the
 * segment before it (the parent directory) to be `pos` or `neg`.
 *
 * Resulting columns, in order:
 *  - `id`      (int)    first `_`-separated piece of the file name, before the first `.`
 *  - `rating`  (int)    second `_`-separated piece of the file name
 *  - `type`    (int)    1 for a `pos` parent directory, 0 for `neg`, -1 otherwise
 *  - `comment` (string) the line, lower-cased, with `<...>` tags removed
 *
 * @param dataUrl path or glob of the text files to load
 * @return the frame described above, collapsed to one partition and marked for caching
 */
def loadLabeledComments(dataUrl: String): DataFrame = {
  // Path segments of the source file, e.g. [..., "pos", "12_8.txt"].
  val pathParts   = split(input_file_name(), "/")
  val segmentCount = size(pathParts)
  val fileName    = pathParts(segmentCount - 1)
  val parentDir   = pathParts(segmentCount - 2)

  // "<id>_<rating>.<ext>" -> ["<id>", "<rating>"]; computed once and reused for both columns.
  val idAndRating = split(split(fileName, "\\.")(0), "_")

  // Numeric label taken from the parent directory name; -1 marks anything unexpected.
  val label = when(parentDir === "pos", 1)
    .when(parentDir === "neg", 0)
    .otherwise(-1)

  spark.read
    .text(dataUrl)
    .select(
      idAndRating(0).cast("int").as("id"),
      idAndRating(1).cast("int").as("rating"),
      label.as("type"),
      regexp_replace(lower(col("value")), "<[^>]*>", "").as("comment")
    )
    .repartition(1)
    .cache()
}

val trainDF = loadLabeledComments(train_data_url)

val testDF = loadLabeledComments(test_data_url)

In [None]:
%scala
// Print the first 10 rows of the training frame as a quick sanity check.
trainDF.show(numRows = 10)

In [None]:
%scala
// Print the first 10 rows of the test frame as a quick sanity check.
testDF.show(numRows = 10)

In [None]:
%scala
// Export both frames as CSV (with a header row) to
//   <storage_base_url>/data/train/train_data  and  <storage_base_url>/data/test/test_data,
// replacing whatever is already stored there.
// Uses the built-in CSV writer instead of the legacy "com.databricks.spark.csv" format
// name, and reuses storage_base_url instead of rebuilding the wasbs:// prefix by hand.
// NOTE(review): the Parquet cell further down writes to these same directories in
// "overwrite" mode, so running it replaces this CSV output — confirm whether the two
// formats are meant to share a location.
Seq(trainDF -> "train", testDF -> "test").foreach { case (frame, subset) =>
  frame.write
    .mode("overwrite")
    .option("header", "true")
    .csv(s"${storage_base_url}/data/${subset}/${subset}_data")
}

In [None]:
%scala
// Export both frames as Parquet to
//   <storage_base_url>/data/train/train_data  and  <storage_base_url>/data/test/test_data,
// replacing whatever is already stored there.
// NOTE(review): these are the same directories the CSV cell above writes to, so the
// CSV output is replaced when this cell runs — confirm that is intended.
Seq(trainDF -> "train", testDF -> "test").foreach { case (frame, subset) =>
  val targetDir = s"${storage_base_url}/data/${subset}/${subset}_data"
  frame.write
    .mode("overwrite")
    .format("parquet")
    .save(targetDir)
}