## Installation in databricks

On an existing cluster, add the following to the Advanced Options -> Spark tab:
``` 
  spark.kryoserializer.buffer.max 2000M
  spark.serializer org.apache.spark.serializer.KryoSerializer
```

In Libraries tab inside the cluster:
  * Install New -> PyPI -> spark-nlp==4.4.0 -> Install
  * Install New -> Maven -> Coordinates -> com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.0 -> Install
  
Source - https://github.com/JohnSnowLabs/spark-nlp#databricks-cluster

In [0]:
%pip install sparknlp
%pip install mlflow
%pip install modelstore

Python interpreter will be restarted.
Python interpreter will be restarted.
Python interpreter will be restarted.
Python interpreter will be restarted.
Python interpreter will be restarted.
Python interpreter will be restarted.


In [0]:
import sparknlp
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import VectorUDT
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, DoubleType, IntegerType
from sparknlp.annotator import *
from sparknlp.base import *
import pickle

In [0]:
sample_size=1000

data = spark.read.format('csv').option('header', True).load('dbfs:/FileStore/tables/train.csv').limit(sample_size)
data = data.withColumn("target", data["target"].cast(IntegerType()))
data.printSchema()
data.count()

root
 |-- qid: string (nullable = true)
 |-- question_text: string (nullable = true)
 |-- target: integer (nullable = true)

Out[2]: 1000

In [0]:
def stratified_split(data_df):

    zeros = data_df.select("*") \
        .where("target == '0'")
    ones = data_df.select("*") \
        .where("target == '1'")
    print("original : zeroes count & one count : ", zeros.count(), ones.count(), ones.count() / zeros.count())
    print('percentage of original insincere qs -> {}'.format(ones.count() * 100 / zeros.count()))
    # split datasets into training and testing
    train0, test0 = zeros.randomSplit([0.8, 0.2], seed=1234)
    train1, test1 = ones.randomSplit([0.8, 0.2], seed=1234)

    # stack datasets back together
    train = train0.union(train1)
    test = test0.union(test1)

    print('percentage of train insincere qs -> {}'.format(train1.count() * 100 / train0.count()))
    train.groupby("target").count().show()

    print('percentage of test insincere qs -> {}'.format(test1.count() * 100 / test0.count()))
    test.groupby("target").count().show()
    stratifiedTrainData = train.sample(False, (data_df.count()/10) / len(train.collect()))


    x_zeros = stratifiedTrainData.select("*") \
        .where("target == '0'")
    x_ones = stratifiedTrainData.select("*") \
        .where("target == '1'")

    print('percentage of stratified train insincere qs -> {}'.format(x_ones.count() * 100 / x_zeros.count()))
    stratifiedTrainData.select("*") \
        .where("target == '0'") \
        .show(5)

    stratifiedTrainData.select("*") \
        .where("target == '1'") \
        .show(5)

    return stratifiedTrainData, train, test

In [0]:
#### This function to be deleted" 
def load_data(data_df):

    

    train10000, train, test =stratified_split(data_df)

    #Just display for time being , delete in final version
    train10000.select("*") \
        .where("target == '0'") \
        .show(5)

    train10000.select("*") \
        .where("target == '1'") \
        .show(5)
    return train10000, train, test

In [0]:
#### This function to be deleted , not used any more
def process(data):

    #load_data(data)

    train10000, train, test = load_data()
    train_x = train10000.select("question_text")
    train_y = train10000.select("target")
    test_x = test.select("question_text")
    test_y = test.select("target")

    #predicted = word_embedding_model_logistic(train_x, train_y, test_x)
    #print ('prediction done')
    #print_performance_report(test_y, predicted, 'Logistic regression model With Glove embeddings')

In [0]:
document_assembler = DocumentAssembler().setInputCol("question_text") \
    .setOutputCol("document")

tokenizer = Tokenizer().setInputCols(["document"]) \
    .setOutputCol("token")


In [0]:
#takes a long time to download, so please be patient
embeddings = WordEmbeddingsModel.pretrained("glove_840B_300", "xx") \
    .setInputCols("document", "token") \
    .setOutputCol("embeddings")

glove_840B_300 download started this may take some time.
Approximate size to download 2.3 GB
[ | ][OK!]


In [0]:
nlpPipeline = Pipeline(stages=[document_assembler,
                               tokenizer,
                               embeddings])

df = nlpPipeline.fit(data).transform(data)

In [0]:
df.show(10)

+--------------------+--------------------+------+--------------------+--------------------+--------------------+
|                 qid|       question_text|target|            document|               token|          embeddings|
+--------------------+--------------------+------+--------------------+--------------------+--------------------+
|00002165364db923c7e6|How did Quebec na...|     0|[{document, 0, 71...|[{token, 0, 2, Ho...|[{word_embeddings...|
|000032939017120e6e44|Do you have an ad...|     0|[{document, 0, 80...|[{token, 0, 1, Do...|[{word_embeddings...|
|0000412ca6e4628ce2cf|Why does velocity...|     0|[{document, 0, 66...|[{token, 0, 2, Wh...|[{word_embeddings...|
|000042bf85aa498cd78e|How did Otto von ...|     0|[{document, 0, 56...|[{token, 0, 2, Ho...|[{word_embeddings...|
|0000455dfa3e01eae3af|Can I convert mon...|     0|[{document, 0, 76...|[{token, 0, 2, Ca...|[{word_embeddings...|
|00004f9a462a357c33be|Is Gaza slowly be...|     0|[{document, 0, 71...|[{token, 0, 1, Is

In [0]:
display(df.count())

1000

In [0]:
def avg_vectors(word_vectors):
    length = len(word_vectors[0]["embeddings"])
    avg_vec = [0] * length
    for vec in word_vectors:
        for i, x in enumerate(vec["embeddings"]):
            avg_vec[i] += x
        avg_vec[i] = avg_vec[i] / length
    return avg_vec

def dense_vector(vec):
    return Vectors.dense(vec)

In [0]:
# create a udf
avg_vectors_udf = udf(avg_vectors, ArrayType(DoubleType()))
df_doc_vec = df.withColumn("doc_vector", avg_vectors_udf(col("embeddings")))

In [0]:
df_doc_vec.show(10)

+--------------------+--------------------+------+--------------------+--------------------+--------------------+--------------------+
|                 qid|       question_text|target|            document|               token|          embeddings|          doc_vector|
+--------------------+--------------------+------+--------------------+--------------------+--------------------+--------------------+
|00002165364db923c7e6|How did Quebec na...|     0|[{document, 0, 71...|[{token, 0, 2, Ho...|[{word_embeddings...|[0.38199613126926...|
|000032939017120e6e44|Do you have an ad...|     0|[{document, 0, 80...|[{token, 0, 1, Do...|[{word_embeddings...|[-0.7888190010562...|
|0000412ca6e4628ce2cf|Why does velocity...|     0|[{document, 0, 66...|[{token, 0, 2, Wh...|[{word_embeddings...|[0.04380194842815...|
|000042bf85aa498cd78e|How did Otto von ...|     0|[{document, 0, 56...|[{token, 0, 2, Ho...|[{word_embeddings...|[1.12683903053402...|
|0000455dfa3e01eae3af|Can I convert mon...|     0|[{doc

In [0]:
dense_vector_udf = udf(dense_vector, VectorUDT())
training = df_doc_vec.withColumn("features", dense_vector_udf(col("doc_vector")))

In [0]:
training.count()
#train, test = training.randomSplit([0.80, 0.20], seed=123)
x, train,test = stratified_split(training)
print('Train dataset count:', train.count())
print('Test dataset count:', test.count())

original : zeroes count & one count :  951 48 0.050473186119873815
percentage of original insincere qs -> 5.047318611987381
percentage of train insincere qs -> 5.019815059445178
+------+-----+
|target|count|
+------+-----+
|     0|  757|
|     1|   38|
+------+-----+

percentage of test insincere qs -> 5.154639175257732
+------+-----+
|target|count|
+------+-----+
|     0|  194|
|     1|   10|
+------+-----+

percentage of stratified train insincere qs -> 4.25531914893617
+--------------------+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 qid|       question_text|target|            document|               token|          embeddings|          doc_vector|            features|
+--------------------+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|00004f9a462a357c33be|Is Gaza slowly be...|     0|[{doc

In [0]:
lr = LogisticRegression(labelCol="target", featuresCol="features", maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(train)
lrPreds = lrModel.transform(test)
lrPreds.select( 'target', 'rawPrediction', 'prediction', 'probability').show(10)
print(lrPreds.select(["target", "prediction"]))


+------+--------------------+----------+--------------------+
|target|       rawPrediction|prediction|         probability|
+------+--------------------+----------+--------------------+
|     0|[2.99177709371106...|       0.0|[0.95220125786163...|
|     0|[2.99177709371106...|       0.0|[0.95220125786163...|
|     0|[2.99177709371106...|       0.0|[0.95220125786163...|
|     0|[2.99177709371106...|       0.0|[0.95220125786163...|
|     0|[2.99177709371106...|       0.0|[0.95220125786163...|
|     0|[2.99177709371106...|       0.0|[0.95220125786163...|
|     0|[2.99177709371106...|       0.0|[0.95220125786163...|
|     0|[2.99177709371106...|       0.0|[0.95220125786163...|
|     0|[2.99177709371106...|       0.0|[0.95220125786163...|
|     0|[2.99177709371106...|       0.0|[0.95220125786163...|
+------+--------------------+----------+--------------------+
only showing top 10 rows

DataFrame[target: int, prediction: double]


In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
lrEval = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol = 'target')
print('Test Area Under ROC', lrEval.evaluate(lrPreds))

display(lrPreds.count())
lrPreds.groupby("target").count().show()
lrPreds.groupby("prediction").count().show()
display(lrPreds.select(["target", "prediction"]))
evaluatorMulti = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction")

# Make predicitons
predictionAndTarget = lrPreds.select("target", "prediction")



Test Area Under ROC 0.5
204+------+-----+
|target|count|
+------+-----+
|     0|  194|
|     1|   10|
+------+-----+

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|  204|
+----------+-----+



target,prediction
0,0.0
0,0.0
0,0.0
0,0.0
0,0.0
0,0.0
0,0.0
0,0.0
0,0.0
0,0.0


In [0]:
acc = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "accuracy"})
f1 = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "f1"})
auc = evaluatorMulti.evaluate(predictionAndTarget)
print(f"Test Accuracy: {acc}, f1 score: {f1}, auc: {auc}")


Test Accuracy: 0.9509803921568627, f1 score: 0.927086412454429, auc: 0.927086412454429


In [0]:
def get_dir_content(ls_path):
    for dir_path in dbutils.fs.ls(ls_path):
        if dir_path.isFile():
            yield dir_path.path
        elif dir_path.isDir() and ls_path != dir_path.path:
            yield from get_dir_content(dir_path.path)
    


In [0]:
filename = "/dbfs/FileStore/model/ait613_team3.pkl"
lrModel.write().overwrite().save(filename)
list(get_dir_content('/dbfs/FileStore/model'))

Out[20]: ['dbfs:/dbfs/FileStore/model/ait613_team3.pkl/data/_SUCCESS',
 'dbfs:/dbfs/FileStore/model/ait613_team3.pkl/data/_committed_883891958068055883',
 'dbfs:/dbfs/FileStore/model/ait613_team3.pkl/data/_started_883891958068055883',
 'dbfs:/dbfs/FileStore/model/ait613_team3.pkl/data/part-00000-tid-883891958068055883-07c8e40d-dcb3-457d-9554-f53be470f3f6-1905-1-c000.snappy.parquet',
 'dbfs:/dbfs/FileStore/model/ait613_team3.pkl/metadata/_SUCCESS',
 'dbfs:/dbfs/FileStore/model/ait613_team3.pkl/metadata/part-00000',
 'dbfs:/dbfs/FileStore/model/ait613_team31.pkl/data/_SUCCESS',
 'dbfs:/dbfs/FileStore/model/ait613_team31.pkl/data/_committed_2987796682858164094',
 'dbfs:/dbfs/FileStore/model/ait613_team31.pkl/data/_started_2987796682858164094',
 'dbfs:/dbfs/FileStore/model/ait613_team31.pkl/data/part-00000-tid-2987796682858164094-b283534c-9ae1-449a-af65-b3e77b9010e0-786-1-c000.snappy.parquet',
 'dbfs:/dbfs/FileStore/model/ait613_team31.pkl/metadata/_SUCCESS',
 'dbfs:/dbfs/FileStore/model/a

In [0]:
from modelstore import ModelStore
import boto3
ACCESS_ID= "AKIAVISDIFI7V6NXSJOT"
ACCESS_KEY="TQU0GsRX++fi9+VR+LR4gauBbffp18AFz1nPmRoS"
s3 = boto3.resource('s3',
         aws_access_key_id=ACCESS_ID,
         aws_secret_access_key= ACCESS_KEY)

# Create a model store that points to your s3 bucket
bucket_name = "ait614bucket"
modelstore = ModelStore.from_aws_s3(bucket_name)

# Upload your model
model_domain = "your-model-domain"
modelstore.sklearn.upload(model_domain, model=model)