## Installation in databricks

On an existing cluster, add the following to the Advanced Options -> Spark tab:
``` 
  spark.kryoserializer.buffer.max 2000M
  spark.serializer org.apache.spark.serializer.KryoSerializer
```

In Libraries tab inside the cluster:
  * Install New -> PyPI -> spark-nlp==4.4.0 -> Install
  * Install New -> Maven -> Coordinates -> com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.0 -> Install
  
Source - https://github.com/JohnSnowLabs/spark-nlp#databricks-cluster

In [0]:
%pip install sparknlp

Python interpreter will be restarted.
Python interpreter will be restarted.


In [0]:
import sparknlp
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import VectorUDT
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, DoubleType, IntegerType
from sparknlp.annotator import *
from sparknlp.base import *

In [0]:
sample_size=1000

data = spark.read.format('csv').option('header', True).load('dbfs:/FileStore/tables/train.csv').limit(sample_size)
data = data.withColumn("target", data["target"].cast(IntegerType()))
data.printSchema()
data.count()

root
 |-- qid: string (nullable = true)
 |-- question_text: string (nullable = true)
 |-- target: integer (nullable = true)

Out[2]: 1000

In [0]:
def stratified_split(data_df):

    zeros = data_df.select("*") \
        .where("target == '0'")
    ones = data_df.select("*") \
        .where("target == '1'")
    print("original : zeroes count & one count : ", zeros.count(), ones.count(), ones.count() / zeros.count())
    print('percentage of original insincere qs -> {}'.format(ones.count() * 100 / zeros.count()))
    # split datasets into training and testing
    train0, test0 = zeros.randomSplit([0.8, 0.2], seed=1234)
    train1, test1 = ones.randomSplit([0.8, 0.2], seed=1234)

    # stack datasets back together
    train = train0.union(train1)
    test = test0.union(test1)

    print('percentage of train insincere qs -> {}'.format(train1.count() * 100 / train0.count()))
    train.groupby("target").count().show()

    print('percentage of test insincere qs -> {}'.format(test1.count() * 100 / test0.count()))
    test.groupby("target").count().show()
    stratifiedTrainData = train.sample(False, (data_df.count()/10) / len(train.collect()))


    x_zeros = stratifiedTrainData.select("*") \
        .where("target == '0'")
    x_ones = stratifiedTrainData.select("*") \
        .where("target == '1'")

    print('percentage of stratified train insincere qs -> {}'.format(x_ones.count() * 100 / x_zeros.count()))
    train10000.select("*") \
        .where("target == '0'") \
        .show(5)

    train10000.select("*") \
        .where("target == '1'") \
        .show(5)

    return stratifiedTrainData, train, test

In [0]:
#### This function to be deleted" 
def load_data(data_df):

    

    train10000, train, test =stratified_split(data_df)

    #Just display for time being , delete in final version
    train10000.select("*") \
        .where("target == '0'") \
        .show(5)

    train10000.select("*") \
        .where("target == '1'") \
        .show(5)
    return train10000, train, test

In [0]:
#### This function to be deleted , not used any more
def process(data):

    #load_data(data)

    train10000, train, test = load_data()
    train_x = train10000.select("question_text")
    train_y = train10000.select("target")
    test_x = test.select("question_text")
    test_y = test.select("target")

    #predicted = word_embedding_model_logistic(train_x, train_y, test_x)
    #print ('prediction done')
    #print_performance_report(test_y, predicted, 'Logistic regression model With Glove embeddings')

In [0]:
train10000, train, test =stratified_split(data)

original : zeroes count & one count :  951 48 0.050473186119873815
percentage of original insincere qs -> 5.047318611987381


In [0]:
document_assembler = DocumentAssembler().setInputCol("question_text") \
    .setOutputCol("document")

tokenizer = Tokenizer().setInputCols(["document"]) \
    .setOutputCol("token")


In [0]:
#takes a long time to download, so please be patient
embeddings = WordEmbeddingsModel.pretrained("glove_840B_300", "xx") \
    .setInputCols("document", "token") \
    .setOutputCol("embeddings")

glove_840B_300 download started this may take some time.
Approximate size to download 2.3 GB
[ | ][OK!]


In [0]:
nlpPipeline = Pipeline(stages=[document_assembler,
                               tokenizer,
                               embeddings])

df = nlpPipeline.fit(data).transform(data)

In [0]:
df.show(10)

+--------------------+--------------------+------+--------------------+--------------------+--------------------+
|                 qid|       question_text|target|            document|               token|          embeddings|
+--------------------+--------------------+------+--------------------+--------------------+--------------------+
|00002165364db923c7e6|How did Quebec na...|     0|[{document, 0, 71...|[{token, 0, 2, Ho...|[{word_embeddings...|
|000032939017120e6e44|Do you have an ad...|     0|[{document, 0, 80...|[{token, 0, 1, Do...|[{word_embeddings...|
|0000412ca6e4628ce2cf|Why does velocity...|     0|[{document, 0, 66...|[{token, 0, 2, Wh...|[{word_embeddings...|
|000042bf85aa498cd78e|How did Otto von ...|     0|[{document, 0, 56...|[{token, 0, 2, Ho...|[{word_embeddings...|
|0000455dfa3e01eae3af|Can I convert mon...|     0|[{document, 0, 76...|[{token, 0, 2, Ca...|[{word_embeddings...|
|00004f9a462a357c33be|Is Gaza slowly be...|     0|[{document, 0, 71...|[{token, 0, 1, Is

In [0]:
display(df.count())

1000

In [0]:
def avg_vectors(word_vectors):
    length = len(word_vectors[0]["embeddings"])
    avg_vec = [0] * length
    for vec in word_vectors:
        for i, x in enumerate(vec["embeddings"]):
            avg_vec[i] += x
        avg_vec[i] = avg_vec[i] / length
    return avg_vec

def dense_vector(vec):
    return Vectors.dense(vec)

In [0]:
# create a udf
avg_vectors_udf = udf(avg_vectors, ArrayType(DoubleType()))
df_doc_vec = df.withColumn("doc_vector", avg_vectors_udf(col("embeddings")))

In [0]:
df_doc_vec.show(10)

+--------------------+--------------------+------+--------------------+--------------------+--------------------+--------------------+
|                 qid|       question_text|target|            document|               token|          embeddings|          doc_vector|
+--------------------+--------------------+------+--------------------+--------------------+--------------------+--------------------+
|00002165364db923c7e6|How did Quebec na...|     0|[{document, 0, 71...|[{token, 0, 2, Ho...|[{word_embeddings...|[0.38199613126926...|
|000032939017120e6e44|Do you have an ad...|     0|[{document, 0, 80...|[{token, 0, 1, Do...|[{word_embeddings...|[-0.7888190010562...|
|0000412ca6e4628ce2cf|Why does velocity...|     0|[{document, 0, 66...|[{token, 0, 2, Wh...|[{word_embeddings...|[0.04380194842815...|
|000042bf85aa498cd78e|How did Otto von ...|     0|[{document, 0, 56...|[{token, 0, 2, Ho...|[{word_embeddings...|[1.12683903053402...|
|0000455dfa3e01eae3af|Can I convert mon...|     0|[{doc

In [0]:
dense_vector_udf = udf(dense_vector, VectorUDT())
training = df_doc_vec.withColumn("features", dense_vector_udf(col("doc_vector")))

In [0]:
training.count()
#train, test = training.randomSplit([0.80, 0.20], seed=123)
x, train,test = stratified_split(training)
print('Train dataset count:', train.count())
print('Test dataset count:', test.count())

original : zeroes count & one count :  951 48 0.050473186119873815
percentage -  original insincere qs - 5.047318611987381
percentage -  train insincere qs - 5.019815059445178
+------+-----+
|target|count|
+------+-----+
|     0|  757|
|     1|   38|
+------+-----+

percentage -  test insincere qs - 5.154639175257732
+------+-----+
|target|count|
+------+-----+
|     0|  194|
|     1|   10|
+------+-----+

percentage - stratified train insincere qs - 3.8095238095238093
Train dataset count: 795
Test dataset count: 204


In [0]:
lr = LogisticRegression(labelCol="target", featuresCol="features", maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrParisModel = lr.fit(train)

In [0]:
#print(f"Training Area Under ROC: {lrParisModel.summary.areaUnderROC}")
print(f"Training Accuracy: {lrParisModel.summary.accuracy}")
pred = lrParisModel.evaluate(test)

#print(f"Test Area Under ROC: {pred.areaUnderROC}")
print(f"Test Accuracy: {pred.accuracy}")

Training Area Under ROC: 0.5
Training Accuracy: 0.9522012578616352
Test Accuracy: 0.9509803921568627


In [0]:
train_predictions = lrParisModel.transform(train)
test_predictions = lrParisModel.transform(test)

In [0]:
train_predictions
#display(test_predictions.select("prediction"))
#display(test_predictions)

Out[20]: DataFrame[qid: string, question_text: string, target: int, document: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, token: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, embeddings: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, doc_vector: array<double>, features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [0]:
display(test_predictions.select("prediction", "rawPrediction", "probability"), "ROC")

prediction,rawPrediction,probability
0.0,"Map(vectorType -> dense, length -> 2, values -> List(2.991777093711063, -2.991777093711063))","Map(vectorType -> dense, length -> 2, values -> List(0.9522012578616352, 0.04779874213836477))"
0.0,"Map(vectorType -> dense, length -> 2, values -> List(2.991777093711063, -2.991777093711063))","Map(vectorType -> dense, length -> 2, values -> List(0.9522012578616352, 0.04779874213836477))"
0.0,"Map(vectorType -> dense, length -> 2, values -> List(2.991777093711063, -2.991777093711063))","Map(vectorType -> dense, length -> 2, values -> List(0.9522012578616352, 0.04779874213836477))"
0.0,"Map(vectorType -> dense, length -> 2, values -> List(2.991777093711063, -2.991777093711063))","Map(vectorType -> dense, length -> 2, values -> List(0.9522012578616352, 0.04779874213836477))"
0.0,"Map(vectorType -> dense, length -> 2, values -> List(2.991777093711063, -2.991777093711063))","Map(vectorType -> dense, length -> 2, values -> List(0.9522012578616352, 0.04779874213836477))"
0.0,"Map(vectorType -> dense, length -> 2, values -> List(2.991777093711063, -2.991777093711063))","Map(vectorType -> dense, length -> 2, values -> List(0.9522012578616352, 0.04779874213836477))"
0.0,"Map(vectorType -> dense, length -> 2, values -> List(2.991777093711063, -2.991777093711063))","Map(vectorType -> dense, length -> 2, values -> List(0.9522012578616352, 0.04779874213836477))"
0.0,"Map(vectorType -> dense, length -> 2, values -> List(2.991777093711063, -2.991777093711063))","Map(vectorType -> dense, length -> 2, values -> List(0.9522012578616352, 0.04779874213836477))"
0.0,"Map(vectorType -> dense, length -> 2, values -> List(2.991777093711063, -2.991777093711063))","Map(vectorType -> dense, length -> 2, values -> List(0.9522012578616352, 0.04779874213836477))"
0.0,"Map(vectorType -> dense, length -> 2, values -> List(2.991777093711063, -2.991777093711063))","Map(vectorType -> dense, length -> 2, values -> List(0.9522012578616352, 0.04779874213836477))"


In [0]:
display(lrParisModel.stages[-1], test_predictions.drop("prediction", "rawPrediction", "probability"), "ROC")

[0;31m---------------------------------------------------------------------------[0m
[0;31mAttributeError[0m                            Traceback (most recent call last)
[0;32m<command-3555915791024754>[0m in [0;36m<cell line: 1>[0;34m()[0m
[0;32m----> 1[0;31m [0mdisplay[0m[0;34m([0m[0mlrParisModel[0m[0;34m.[0m[0mstages[0m[0;34m[[0m[0;34m-[0m[0;36m1[0m[0;34m][0m[0;34m,[0m [0mtest_predictions[0m[0;34m.[0m[0mdrop[0m[0;34m([0m[0;34m"prediction"[0m[0;34m,[0m [0;34m"rawPrediction"[0m[0;34m,[0m [0;34m"probability"[0m[0;34m)[0m[0;34m,[0m [0;34m"ROC"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;31mAttributeError[0m: 'LogisticRegressionModel' object has no attribute 'stages'

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
 
bcEvaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
print(f"Area under ROC curve: {bcEvaluator.evaluate(test_predictions)}")
 




In [0]:
#display(lrParisModel.stages[-1], test_predictions.drop("prediction", "rawPrediction", "probability"), "ROC")
display(test_predictions.drop("prediction", "rawPrediction", "probability"), "ROC")
 
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
 
bcEvaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
#print(f"Area under ROC curve: {bcEvaluator.evaluate(test_predictions)}")
 
mcEvaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print(f"Accuracy: {mcEvaluator.evaluate(test_predictions)}")

