In [None]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

# Start the Spark NLP session with the GPU flag switched on.
spark = sparknlp.start(gpu=True)

In [None]:
# Load the corpus as one row per line, stored in column 0 of `data`.
# The file is read line by line rather than through pd.read_csv so that no
# character sequence inside a sentence can be mistaken for a field separator
# (a multi-character `sep` is interpreted by pandas as a regular expression).
with open("data/wikiner_dataset", encoding="utf-8") as corpus_file:
    # Blank lines are dropped, as pd.read_csv does by default.
    corpus_lines = [line.rstrip("\n") for line in corpus_file if line.strip()]

data = pd.DataFrame({0: corpus_lines})
data.shape

In [None]:
# Turn every corpus line into (a) the plain sentence and (b) its list of tags.
# Each whitespace-separated token is expected to have three "|"-separated
# fields; the first is kept as the word and the third as the tag.
phrases, ners = [], []
malformed_tokens = 0
for line in data[0]:
    words, tags = [], []
    for token in line.split():
        # Split from the right so that a word which itself contains "|" stays
        # intact and the tag is always taken from the last field.
        fields = token.rsplit("|", 2)
        if len(fields) != 3:
            # Skip the token entirely so words and tags stay aligned.
            malformed_tokens += 1
            continue
        words.append(fields[0])
        tags.append(fields[2])
    phrases.append(' '.join(words))
    ners.append(tags)

if malformed_tokens:
    print(f"Skipped {malformed_tokens} token(s) that did not have three '|'-separated fields")

In [None]:
from pyspark.sql.types import StringType

# Build a Spark DataFrame with one string row per sentence in `phrases`,
# then print a preview of it.
df = spark.createDataFrame(phrases, schema=StringType())
df.show()

In [None]:
# Training pipeline for the contextual spell checker, following
# https://towardsdatascience.com/training-a-contextual-spell-checker-for-italian-language-66dda528e4bf

# Reads the "value" column of the input DataFrame and emits "document".
assembler = (
    DocumentAssembler()
    .setInputCol("value")
    .setOutputCol("document")
)

# Splits "document" into "token".
tokenizer = (
    RecursiveTokenizer()
    .setInputCols("document")
    .setOutputCol("token")
)

# setLanguageModelClasses: this value depends on the vocabulary size; the
# model uses it to control the factoring in the language model.
spellChecker = (
    ContextSpellCheckerApproach()
    .setInputCols("token")
    .setOutputCol("corrected")
    .setLanguageModelClasses(1650)
    .setWordMaxDistance(3)
    .setEpochs(10)
)

# Chain the three stages and fit them on the sentence DataFrame.
pipeline = Pipeline(stages=[assembler, tokenizer, spellChecker])
model = pipeline.fit(df)

In [None]:
# `model` is the fitted pipeline returned by `pipeline.fit`, which has no
# `annotate` method of its own. Wrapping it in a Spark NLP LightPipeline
# provides `annotate` for a plain Python string.
light_model = LightPipeline(model)
light_model.annotate("J'habitte à Allee Alfred Rocheray")