In [0]:
// Location of the raw CSV input read by the cells below.
val filePath: String = "bitcoin.csv"

In [1]:
// CSV reader settings: first line is a header, quoted fields may span
// several lines, column types are inferred, and `"` is the escape character.
val csvOptions = Map(
  "header" -> "true",
  "multiLine" -> "true",
  "inferSchema" -> "true",
  "escape" -> "\""
)

val rawDF = spark.read.options(csvOptions).csv(filePath)

z.show(rawDF)

### 1. Inferring column descriptions
First, I rename the columns to more readable names, since the dataset lacks column descriptions.

In [3]:
// Rename the "timestamp" column to "Index"; later cells order by this column.
val df1 =
  rawDF.withColumnRenamed("timestamp", "Index")

z.show(df1)

In [4]:
import org.apache.spark.sql.functions.{min, max}
import org.apache.spark.sql.Row

// Smallest and largest "Close" value over the whole DataFrame, destructured
// from the single aggregated row. The pattern requires both values to be Double.
val Row(minValue: Double, maxValue: Double) =
  df1.groupBy().agg(min("Close"), max("Close")).first()

From the range of the Close values, we can infer that the currency is the US dollar.

In [6]:
// Give the two volume columns shorter, identifier-friendly names.
val df2 = Seq(
  "Volume_(BTC)" -> "btc_volume",
  "Volume_(Currency)" -> "usd_volume"
).foldLeft(df1) { case (acc, (oldName, newName)) =>
  acc.withColumnRenamed(oldName, newName)
}

In [7]:
// Display the DataFrame with the renamed volume columns
// (z is presumably the Zeppelin context — confirm).
z.show(df2)

 

### 2. Datatype check

In [9]:
// Print the column names and inferred types for a manual datatype check.
df2.printSchema()

The data types are fine. The `inferSchema` option has already converted the string-typed timestamp to a timestamp data type.

### 3. Statistical check

In [12]:
// Display the summary statistics produced by describe() for df2.
z.show(df2.describe())


In [13]:
// Columns passed to the Imputer as both input and output columns.
val imputeCols: Array[String] = Array(
  "Open", "High", "Low", "Close",
  "btc_volume", "usd_volume",
  "Weighted_Price"
)

In [14]:
import org.apache.spark.ml.feature.Imputer

// Median imputation configured with identical input and output column
// names, i.e. the imputed columns keep their original names.
val imputer = new Imputer()
  .setInputCols(imputeCols)
  .setOutputCols(imputeCols)
  .setStrategy("median")

// Fit on df2 and apply the fitted model to the same DataFrame.
val imputedDF = imputer
  .fit(df2)
  .transform(df2)

// Show the post-imputation summary statistics, then the data itself.
z.show(imputedDF.describe())
z.show(imputedDF)

### 4. Getting rid of extreme or abnormal values

In [16]:
// Count rows whose weighted price is zero or negative.
imputedDF.where(col("Weighted_Price") <= 0).count()

In [17]:
// Relative change between Open and Close within each row: (Close - Open) / Open.
val valueDF = {
  val openPrice = col("Open")
  val closePrice = col("Close")
  imputedDF.withColumn("PctChange", (closePrice - openPrice) / openPrice)
}

In [18]:
// Display the rows with the added PctChange column, then their summary statistics.
z.show(valueDF)
z.show(valueDF.describe())

The percentage price change within one minute is reasonable.

The largest increase is at most 41% and the largest decrease is less than 100%. The standard deviation is 0.2%.

In [20]:
import org.apache.spark.sql.expressions.Window

// Window over all rows ordered by "Index".
// NOTE(review): no partitionBy is given, so Spark likely evaluates this
// window in a single partition — confirm this is acceptable for the data size.
val w = Window.orderBy(col("Index"))

In [21]:
// Row-over-row return of the closing price: (Close - previous Close) / previous Close.
// The previous close is taken with lag(…, 1) over the window `w`. The lag
// expression is built once and reused, and it refers to the column as "Close"
// (the original mixed "Close" and "close", which only resolves while
// spark.sql.caseSensitive is false).
// The first row in the window has no predecessor, so its "return" is null.
val df3 = {
  val prevClose = lag("Close", 1).over(w)
  imputedDF.withColumn("return", (col("Close") - prevClose) / prevClose)
}

In [22]:
// Display the DataFrame with the added "return" column.
z.show(df3)

In [23]:
// Display summary statistics for df3, including the new "return" column.
z.show(df3.describe())

In [24]:
// Drop every row that contains a null or NaN in any column
// (na.drop() with no arguments uses the "any" rule).
val df4 = df3.na.drop()

In [25]:
// Display the DataFrame after dropping rows with missing values.
z.show(df4)

In [26]:
// Per-row spread between High and Low, stored in a column named "Volatility".
val df5 = df4.withColumn("Volatility", col("High") - col("Low"))

z.show(df5)

In [27]:
// Display summary statistics for the final DataFrame, including "Volatility".
z.show(df5.describe())

In [28]:
// Destination of the cleaned dataset, written as Parquet.
val outputPath: String = "bitcoin-clean.parquet"

In [29]:
// Persist the cleaned DataFrame as Parquet, replacing any existing output at outputPath.
df5.write
  .mode("overwrite")
  .format("parquet")
  .save(outputPath)