## Installation in Databricks

On an existing cluster, add the following to the Advanced Options -> Spark tab:
``` 
  spark.kryoserializer.buffer.max 2000M
  spark.serializer org.apache.spark.serializer.KryoSerializer
```

In the Libraries tab of the cluster:
  * Install New -> PyPI -> spark-nlp==4.4.0 -> Install
  * Install New -> Maven -> Coordinates -> com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.0 -> Install
  
Source - https://github.com/JohnSnowLabs/spark-nlp#databricks-cluster

In [0]:
%pip install sparknlp

Python interpreter will be restarted.
Python interpreter will be restarted.


In [0]:
import sparknlp
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import VectorUDT
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, DoubleType, IntegerType
from sparknlp.annotator import *
from sparknlp.base import *

In [0]:
# Load at most 100 rows of the training CSV, using the file's first line as the header.
data = (
    spark.read.format('csv')
    .option('header', True)
    .load('dbfs:/FileStore/tables/train.csv')
    .limit(100)
)
# No schema is inferred, so the label arrives as a string; cast it to an integer.
data = data.withColumn("target", col("target").cast(IntegerType()))
data.printSchema()

root
 |-- qid: string (nullable = true)
 |-- question_text: string (nullable = true)
 |-- target: integer (nullable = true)



In [0]:
# Reads the "question_text" column and writes the "document" column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("question_text")
    .setOutputCol("document")
)

# Reads the "document" column and writes the "token" column.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)


In [0]:
# The pretrained model takes a long time to download, so please be patient.
embeddings = (
    WordEmbeddingsModel.pretrained("glove_840B_300", "xx")
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)

glove_840B_300 download started this may take some time.
Approximate size to download 2.3 GB
[ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ]

In [0]:
# Chain the three stages in order: document assembly -> tokenization -> word embeddings.
nlpPipeline = Pipeline(stages=[document_assembler, tokenizer, embeddings])

# Fit the pipeline on the data, then apply the fitted model to the same data.
nlp_model = nlpPipeline.fit(data)
df = nlp_model.transform(data)

In [0]:
def avg_vectors(word_vectors):
    """Return the element-wise mean of a document's word embeddings.

    Args:
        word_vectors: Sequence of items that each expose their embedding as
            ``item["embeddings"]`` (a sequence of numbers). All embeddings are
            expected to have the same length as the first one.

    Returns:
        A list of floats with one entry per embedding dimension, where each
        entry is the sum over all words divided by the number of words.
        Returns ``None`` when ``word_vectors`` is empty or ``None`` (there is
        nothing to average), instead of raising ``IndexError``.
    """
    if not word_vectors:
        return None

    word_count = len(word_vectors)
    dim = len(word_vectors[0]["embeddings"])

    # Start from floats so the result is always a list of floats.
    totals = [0.0] * dim
    for vec in word_vectors:
        for i, x in enumerate(vec["embeddings"]):
            totals[i] += x

    # Divide every dimension by the number of words, once, after summing.
    return [total / word_count for total in totals]

def dense_vector(vec):
    """Convert ``vec`` into a dense vector via ``Vectors.dense``."""
    as_dense = Vectors.dense(vec)
    return as_dense

In [0]:
# Wrap avg_vectors as a UDF that returns an array of doubles.
avg_vectors_udf = udf(avg_vectors, returnType=ArrayType(DoubleType()))
# Apply it to the "embeddings" column and store the result as "doc_vector".
doc_vector_col = avg_vectors_udf(col("embeddings"))
df_doc_vec = df.withColumn("doc_vector", doc_vector_col)

In [0]:
# Wrap dense_vector as a UDF returning an ML vector, then derive the "features" column
# from "doc_vector".
dense_vector_udf = udf(dense_vector, returnType=VectorUDT())
features_col = dense_vector_udf(col("doc_vector"))
training = df_doc_vec.withColumn("features", features_col)

In [0]:
# Logistic regression on the "features" column, with "target" as the label.
lr = LogisticRegression(
    labelCol="target",
    featuresCol="features",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lrParisModel = lr.fit(training)