In [None]:
# Mount Google Drive at /content/drive (a later cell reads the corpus from
# /content/drive/MyDrive). Skip this cell if the data lives elsewhere.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Display Colab instance system information

from psutil import *

print("System information:\n")
print("Number of CPUs: " + str(cpu_count()))
!lscpu | grep 'Model name'

RAM = !free -h --si | awk '/Mem:/{print $2}'
print(f"\nRAM: {RAM[0]}")

disk_space = !df -h / | awk '{print $4}'

print(f"\nAvailable disk space: {disk_space[1]}")

GPU_type = !nvidia-smi -L
print(f"\nGPU type: {GPU_type}")


System information:

Number of CPUs: 4
Model name:          Intel(R) Xeon(R) CPU @ 2.20GHz

RAM: 26G

Available disk space: 185G

GPU type: ["NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.", '']


In [None]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2021-12-08 03:06:19--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://setup.johnsnowlabs.com/colab.sh [following]
--2021-12-08 03:06:19--  https://setup.johnsnowlabs.com/colab.sh
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-12-08 03:06:21--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:44

In [None]:
import json
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import sparknlp
from tqdm.notebook import tqdm

# change data source if not utilizing Google Colab
data_source = "/content/drive/MyDrive"

spark = sparknlp.start()

# Load the corpus and create a Spark DataFrame. Keying the dictionary on the
# text eliminates exact duplicates (the last code seen for a text wins).
#
# The file is streamed line by line inside a `with` block so that:
#   * the file handle is always closed,
#   * only the de-duplicated dictionary is held in memory, instead of the
#     full file contents, a list of its lines and a list of parsed records,
#   * records are split on real newlines only (str.splitlines() also breaks
#     on other separator characters that may appear inside a JSON string).

SIC_columns = ["text", "code"]
SIC_dict = {}

with open(f"{data_source}/SIC-annotated.jsonl", encoding="utf-8") as corpus_file:
    for raw_line in tqdm(corpus_file, desc="Reading corpus"):
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        if not raw_line.strip():
            continue
        record = json.loads(raw_line)
        SIC_dict[record["text"]] = record["code"]

# One (text, code) row per unique text.
SIC_df = spark.createDataFrame(list(SIC_dict.items()), schema=SIC_columns)

# The rows have been handed to Spark; drop the Python-side copy.
del SIC_dict

print(f"{SIC_df.count()} entries")

# `unique` (a list of Rows with a "code" field) is reused by the
# label-encoding cell further down.
unique = SIC_df.select("code").distinct().collect()
print(f"{len(unique)} unique SIC Codes.")

SIC_df.groupBy("code").count().show()

  0%|          | 0/9931865 [00:00<?, ?it/s]

  0%|          | 0/9931865 [00:00<?, ?it/s]

In [None]:
# encode labels


from sparknlp.annotator import *
from sparknlp.base import *
from pyspark.ml import Pipeline

# Distinct SIC codes, in the order they were collected into `unique`, and a
# code -> integer index lookup built from that order.
label_list = [row["code"] for row in unique]
label_dict = {label: index for index, label in enumerate(label_list)}

# Keep only the two columns the classifier needs. The previous version
# mapped every row through a Python RDD lambda to append the label index and
# then immediately dropped that unnamed third column ("_3"), so the result
# was the same two columns at the cost of sending all rows through Python.
df1 = SIC_df.select("text", "code")

# 80/20 train/test split; the fixed seed keeps the split repeatable.
# SIC_df and df1 are deliberately left defined (no `del`) so this cell can be
# re-run without first re-running the loading cell.
train, test = df1.randomSplit(weights=[0.8, 0.2], seed=42)

In [None]:
# ClassifierDL
#
# Pipeline stages, in order:
#   text -> document -> token -> normalized -> cleanTokens -> lemma
#        -> embeddings (word level) -> sentence_embeddings (averaged)
#        -> class (ClassifierDL trained against the "code" column)

document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner() \
    .setInputCols(["normalized"]) \
    .setOutputCol("cleanTokens") \
    .setCaseSensitive(False)

lemma = LemmatizerModel.pretrained("lemma_antbnc") \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("lemma")

# Call pretrained() on the class rather than on a throwaway instance, and
# name the model explicitly: "glove_100d" is what the implicit default
# resolved to in the recorded run of this notebook.
glove_embeddings = WordEmbeddingsModel.pretrained("glove_100d") \
    .setInputCols(["document", "lemma"]) \
    .setOutputCol("embeddings") \
    .setCaseSensitive(False)

# Collapse the word vectors of each document into one vector by averaging.
embeddingsSentence = SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

classsifierdl = ClassifierDLApproach() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("class") \
    .setLabelColumn("code") \
    .setMaxEpochs(10) \
    .setEnableOutputLogs(False) \
    .setVerbose(1)

clf_pipeline = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        lemma,
        glove_embeddings,
        embeddingsSentence,
        classsifierdl,
    ]
)

from sklearn.metrics import classification_report, accuracy_score

# Train every stage of the pipeline on the training split.
clf_pipelineModel = clf_pipeline.fit(train)

# Apply the fitted pipeline to the held-out split once and reuse `preds`
# below, instead of building a second, identical transformation.
preds = clf_pipelineModel.transform(test)
preds.select("text", "code", "class.result").show()

# Bring the predictions to the driver as a pandas DataFrame for scikit-learn.
df = preds.select("text", "code", "class.result").toPandas()

# Each `result` cell is a sequence; keep its first element as the predicted
# code so it can be compared with the true `code` column.
df["result"] = df["result"].apply(lambda x: x[0])

print(classification_report(df.code, df.result))
print(accuracy_score(df.code, df.result))

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


----------------------------------------
Exception happened during processing of request fromINFO:py4j.java_gateway:Error while receiving.
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/py4j/java_gateway.py", line 1207, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty
 ('127.0.0.1', 57520)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/py4j/java_gateway.py", line 1207, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/py4j/java_gateway.py", line 1033, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.7/

Py4JError: ignored