In [None]:
# Optional step: attach Google Drive to this Colab runtime.
# Skip this cell when the data does not live on Drive.

from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
#Display Colab instance system information

from psutil import *

print("System information:\n")
print("Number of CPUs: " + str(cpu_count()))
!lscpu | grep 'Model name'

RAM = !free -h --si | awk '/Mem:/{print $2}'
print(f"\nRAM: {RAM[0]}")

disk_space = !df -h / | awk '{print $4}'

print(f"\nAvailable disk space: {disk_space[1]}")

GPU_type = !nvidia-smi -L
print(f"\nGPU type: {GPU_type}")


System information:

Number of CPUs: 4
Model name:          Intel(R) Xeon(R) CPU @ 2.20GHz

RAM: 26G

Available disk space: 125G

GPU type: ['GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-1c2379c1-a6d5-ae68-9e69-5d9720a4cdbd)']


In [None]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2021-12-08 03:41:58--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://setup.johnsnowlabs.com/colab.sh [following]
--2021-12-08 03:41:59--  https://setup.johnsnowlabs.com/colab.sh
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-12-08 03:42:00--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:44

In [None]:
import json
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import sparknlp
from tqdm.notebook import tqdm

import logging
# Configure root logging at INFO level and grab a named logger for this notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('my_logger')


# Directory that holds the corpus file; change it when not running on Google Colab.
data_source = "/content/drive/MyDrive"

# Start the Spark NLP session, requesting the GPU build.
spark = sparknlp.start(gpu=True)

# Load the corpus and build a Spark DataFrame with one row per distinct text.

# Read inside a context manager so the file handle is always closed.
# The encoding is stated explicitly (JSON text is UTF-8) instead of relying on
# the platform default.
with open(f"{data_source}/SIC-aug-list.jsonl", encoding="utf-8") as corpus_file:
    # Keep every 10th line of the file.
    corpus_lines = corpus_file.read().splitlines()[::10]

SIC_columns = ["text", "code"]
SIC_dict = {}

# Parse and de-duplicate in a single pass.  Keying on the text eliminates
# exact duplicates; when a text occurs more than once the last code wins.
for raw_line in tqdm(corpus_lines):
    record = json.loads(raw_line)
    SIC_dict[record["text"]] = record["code"]

# Release the raw lines before the data is handed to Spark.
del corpus_lines

# Materialise the (text, code) pairs as a list of tuples for createDataFrame.
SIC_df = spark.createDataFrame(list(SIC_dict.items()), schema=SIC_columns)

del SIC_dict

print(f"{SIC_df.count()} entries")

# `unique` (a list of Rows holding the distinct codes) is reused by the
# label-encoding cell further down.
unique = SIC_df.select("code").distinct().collect()
print(f"{len(unique)} unique SIC Codes.")

SIC_df.groupBy("code").count().show()

  0%|          | 0/993187 [00:00<?, ?it/s]

  0%|          | 0/993187 [00:00<?, ?it/s]

992534 entries
71 unique SIC Codes.
+----+-----+
|code|count|
+----+-----+
|  07|13663|
|  51|13818|
|  54|13753|
|  15|13892|
|  29|14096|
|  42|13780|
|  73|10086|
|  87|13893|
|  64|13779|
|  30|13625|
|  34|14499|
|  59|13813|
|  01|13692|
|  28|13582|
|  22|13577|
|  35|13935|
|  52|13746|
|  16|13649|
|  47|13675|
|  99|13289|
+----+-----+
only showing top 20 rows



In [None]:
# encode labels


from sparknlp.annotator import *
from sparknlp.base import *
from pyspark.ml import Pipeline

# Build a label -> integer index lookup from the distinct SIC codes.
# The codes are sorted because distinct().collect() returns rows in no
# guaranteed order, which would make the indices differ from run to run.
label_list = sorted(item["code"] for item in unique)
label_dict = {label: index for index, label in enumerate(label_list)}

# Only the "text" and "code" columns go forward: the classifier below is
# configured with the string "code" column as its label, so no index column
# is attached to the DataFrame.  Selecting the columns directly avoids
# pushing every row through a Python-side RDD map.
df1 = SIC_df.select("text", "code")

# SIC_df and df1 are intentionally not deleted: removing the names would make
# this cell fail with a NameError when it is run a second time.

# Seeded 80/20 train/test split.
train, test = df1.randomSplit(weights=[0.8, 0.2], seed=42)

In [None]:
# ClassifierDL on averaged GloVe word embeddings.
# Stage order: document -> tokens -> normalized tokens -> stop words removed
# -> lemmas -> word embeddings -> averaged sentence embedding -> classifier.

document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner() \
    .setInputCols(["normalized"]) \
    .setOutputCol("cleanTokens") \
    .setCaseSensitive(False)

lemma = LemmatizerModel.pretrained('lemma_antbnc') \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("lemma")

# pretrained() is called on the class (no throwaway instance) and the model
# name is spelled out so the embeddings in use do not depend on the library's
# default choice.
glove_embeddings = WordEmbeddingsModel.pretrained("glove_100d") \
    .setInputCols(["document", "lemma"]) \
    .setOutputCol("embeddings") \
    .setCaseSensitive(False)

# Collapse the per-token vectors into one vector per document by averaging.
embeddingsSentence = SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

# The string "code" column is used directly as the training label.
classsifierdl = ClassifierDLApproach() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("class") \
    .setLabelColumn("code") \
    .setMaxEpochs(10) \
    .setEnableOutputLogs(False) \
    .setVerbose(1)

clf_pipeline = Pipeline(
    stages=[document_assembler,
            tokenizer,
            normalizer,
            stopwords_cleaner,
            lemma,
            glove_embeddings,
            embeddingsSentence,
            classsifierdl])

# Metrics helpers used at the end of this block.
from sklearn.metrics import classification_report, accuracy_score

# Fit every stage of the pipeline on the training split.
clf_pipelineModel = clf_pipeline.fit(train)

# Transform the held-out split once and reuse the same DataFrame for both the
# preview and the metrics instead of building the transform a second time.
preds = clf_pipelineModel.transform(test)
scored = preds.select("text", "code", "class.result")
scored.show()

df = scored.toPandas()

# Each "result" cell is a sequence of predicted labels; keep its first entry.
df["result"] = df["result"].apply(lambda x: x[0])

# zero_division=0 reports 0.0 for labels that are never predicted without
# raising an UndefinedMetricWarning for each affected metric.
print(classification_report(df.code, df.result, zero_division=0))
print(accuracy_score(df.code, df.result))

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]
+--------------------+----+------+
|                text|code|result|
+--------------------+----+------+
|( " General Couns...|  13|  [83]|
|( $ 500, 000) ass...|  30|  [44]|
|( 1) (ace) Effect...|  34|  [46]|
|( 1) At any time ...|  26|  [34]|
|( 1) During the E...|  40|  [40]|
|( 1) General. The...|  35|  [10]|
|( 1) Maintain and...|  40|  [13]|
|( 1) The Administ...|  70|  [26]|
|( 1) The Company ...|  16|  [60]|
|( 1) The Incremen...|  26|  [34]|
|( 1) The Loanword...|  57|  [44]|
|( 1) The Tranche ...|  79|  [34]|
|( 1) The subordin...|  52|  [13]|
|( 1) with respect...|  57|  [61]|
|( 3. 12 ). Nether...|  49|  [34]|
|( 8. 02 (Viostero...|  54|  [60]|
|( A) Any Soul sha...|  30|  [61]|
|( A) Quick upon t...|  14|  [12]|
|( A) The entry of...|  20|  [12]|
|( A) Upon the san...|  60|  [13]|

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          01       0.00      0.00      0.00      2697
          02       0.42      0.91      0.57      2689
          07       0.00      0.00      0.00      2709
          08       0.36      0.94      0.52      2762
          09       0.33      0.79      0.47      2648
          10       0.04      0.20      0.07      2748
          12       0.03      0.09      0.05      2710
          13       0.04      0.22      0.07      3878
          14       0.00      0.00      0.00      2706
          15       0.00      0.00      0.00      2729
          16       0.00      0.00      0.00      2654
          17       0.00      0.00      0.00      2742
          20       0.00      0.00      0.00      2926
          21       0.00      0.00      0.00      2778
          22       0.06      0.01      0.02      2808
          23       0.00      0.00      0.00      2727
          24       0.00      0.00      0.00      2671
          25       0.00    