## Installation in Databricks

On an existing cluster, add the following to the Advanced Options -> Spark tab:
``` 
  spark.kryoserializer.buffer.max 2000M
  spark.serializer org.apache.spark.serializer.KryoSerializer
```

In the Libraries tab of the cluster:
  * Install New -> PyPI -> spark-nlp==4.4.0 -> Install
  * Install New -> Maven -> Coordinates -> com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.0 -> Install
  
Source - https://github.com/JohnSnowLabs/spark-nlp#databricks-cluster

In [0]:
import sparknlp
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import VectorUDT
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, DoubleType, IntegerType
from sparknlp.annotator import *
from pyspark.ml.feature import VectorAssembler
from sparknlp.base import *
from pyspark.sql.functions import explode

In [0]:
# Read the training CSV, taking column names from its header row, and keep
# only 100 rows so the rest of the notebook runs quickly.
csv_reader = spark.read.format('csv').option('header', True)
data = csv_reader.load('dbfs:/FileStore/tables/train.csv').limit(100)

# The label column is cast to an integer type before it is used for training.
target_as_int = data["target"].cast(IntegerType())
data = data.withColumn("target", target_as_int)

data.printSchema()

root
 |-- qid: string (nullable = true)
 |-- question_text: string (nullable = true)
 |-- target: integer (nullable = true)



In [0]:
# Load the default pretrained Universal Sentence Encoder. The model is
# downloaded on first use, so this cell can take a while to finish.
useEmbeddings = (
    UniversalSentenceEncoder.pretrained()
    .setInputCols("document")
    .setOutputCol("use_embeddings")
)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ | ][OK!]


In [0]:
# First stage: read the "question_text" column and write a "document" column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("question_text")
    .setOutputCol("document")
)

# Last stage: convert the "use_embeddings" annotations into a
# "finished_use_embeddings" column, output as vectors, without cleaning up
# the annotation columns.
embeddings_finisher = (
    EmbeddingsFinisher()
    .setInputCols(["use_embeddings"])
    .setOutputCols(["finished_use_embeddings"])
    .setOutputAsVector(True)
    .setCleanAnnotations(False)
)

# Assemble the stages in order: document -> sentence embeddings -> finisher.
use_stages = [document_assembler, useEmbeddings, embeddings_finisher]
use_pipeline = Pipeline(stages=use_stages)

# Fit and apply the pipeline on the same DataFrame, then preview the result.
use_model = use_pipeline.fit(data)
use_df = use_model.transform(data)
use_df.select('finished_use_embeddings').show(3)

# "finished_use_embeddings" holds an array per row; explode it so that each
# element becomes its own row in a "features" column.
use_df = use_df.withColumn("features", explode(use_df["finished_use_embeddings"]))

+-----------------------+
|finished_use_embeddings|
+-----------------------+
|   [[-0.004166585393...|
|   [[0.0288510099053...|
|   [[-0.027575196698...|
+-----------------------+
only showing top 3 rows



In [0]:
# Split the featurised rows 70/30 into train and test sets with a fixed seed.
split_weights = [0.7, 0.3]
trainingData, testData = use_df.randomSplit(split_weights, seed=100)

# Logistic regression on the "features" column with "target" as the label.
lr = LogisticRegression(
    labelCol="target",
    featuresCol="features",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Train on the training split, then score the held-out split.
lrParisModel = lr.fit(trainingData)
predictions = lrParisModel.transform(testData)

# Show the true label next to the predicted label for the first 10 test rows.
predictions.select("target", "prediction").show(n=10, truncate=30)

+------+----------+
|target|prediction|
+------+----------+
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
+------+----------+
only showing top 10 rows

