In [None]:
# Create Spark Context
from pyspark import SparkContext
sc = SparkContext()

# Set file
logFile = "/home/jovyan/work/data/airflow.cfg"

# Read file
logData = sc.textFile(logFile).cache()

# Get lines with A
numAs = logData.filter(lambda s: 'a' in s).count()

# Get lines with B 
numBs = logData.filter(lambda s: 'b' in s).count()

# Print result
print("Lines with a: {}, lines with b: {}".format(numAs, numBs))

# NLP Spark

In [None]:
import pandas as pd
import numpy as np
import json
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

In [None]:
spark = sparknlp.start()

In [None]:
MODEL_NAME='classifierdl_use_emotion'

In [None]:
text_list = [
    """600,000 flood victims have been shifted to camps: Murad""",
            """COAS hails US support for flood victims""",
            """My soul has just been pierced by the most evil look from @rickosborneorg. A mini panic attack &amp; chill in bones followed soon after.""",
            """Breach in Mirpur-Kotli highway on periphery of Mangla lake: officials""",
            """Two dead in SNGPL pipeline blast""",
            """Tire erupts at MQM’s former headquarters Nine Zero""",
            """A woman was gang raped by three persons including two policemen in Chuhng area""",
            """Man commits suicide after killing two women""",
            """Edhi Orange Line to be inaugurated today""",
            ]


In [None]:
documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")
    
use = UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")\
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")


sentimentdl = ClassifierDLModel.pretrained(name=MODEL_NAME)\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("sentiment")

nlpPipeline = Pipeline(
      stages = [
          documentAssembler,
          use,
          sentimentdl
      ])


In [None]:
empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)
df = spark.createDataFrame(pd.DataFrame({"text":text_list}))
result = pipelineModel.transform(df)

In [None]:
result

In [None]:
result.select(F.explode(F.arrays_zip('document.result', 'sentiment.result')).alias("cols")).show(truncate=False)

In [None]:
model_age_news = "bert_sequence_classifier_age_news"
model_hatexplain = "bert_sequence_classifier_hatexplain"
model_emotion = "bert_sequence_classifier_emotion"

In [None]:
text_age_news = ["""600,000 flood victims have been shifted to camps: Murad""",
            """COAS hails US support for flood victims""",
            """My soul has just been pierced by the most evil look from @rickosborneorg. A mini panic attack &amp; chill in bones followed soon after.""",
            """Breach in Mirpur-Kotli highway on periphery of Mangla lake: officials""",
            """Two dead in SNGPL pipeline blast""",
            """Tire erupts at MQM’s former headquarters Nine Zero""",
            """A woman was gang raped by three persons including two policemen in Chuhng area""",
            """Man commits suicide after killing two women""",
            """Edhi Orange Line to be inaugurated today""",]

In [None]:
text_hatexplain = ["""600,000 flood victims have been shifted to camps: Murad""",
            """COAS hails US support for flood victims""",
            """My soul has just been pierced by the most evil look from @rickosborneorg. A mini panic attack &amp; chill in bones followed soon after.""",
            """Breach in Mirpur-Kotli highway on periphery of Mangla lake: officials""",
            """Two dead in SNGPL pipeline blast""",
            """Tire erupts at MQM’s former headquarters Nine Zero""",
            """A woman was gang raped by three persons including two policemen in Chuhng area""",
            """Man commits suicide after killing two women""",
            """Edhi Orange Line to be inaugurated today""",]

In [None]:
text_emotion = ["""600,000 flood victims have been shifted to camps: Murad""",
            """COAS hails US support for flood victims""",
            """My soul has just been pierced by the most evil look from @rickosborneorg. A mini panic attack &amp; chill in bones followed soon after.""",
            """Breach in Mirpur-Kotli highway on periphery of Mangla lake: officials""",
            """Two dead in SNGPL pipeline blast""",
            """Tire erupts at MQM’s former headquarters Nine Zero""",
            """A woman was gang raped by three persons including two policemen in Chuhng area""",
            """Man commits suicide after killing two women""",
            """Edhi Orange Line to be inaugurated today""",]

In [None]:
model_dict = {
              # model_age_news :text_age_news,
              # model_hatexplain: text_hatexplain,
              model_emotion: text_emotion
}

In [None]:
from pyspark.sql.types import StringType, IntegerType

def run_pipeline(model, text, results):  
  document_assembler = DocumentAssembler() \
      .setInputCol('text') \
      .setOutputCol('document')

  tokenizer = Tokenizer() \
      .setInputCols(['document']) \
      .setOutputCol('token')

  sequenceClassifier = BertForSequenceClassification\
        .pretrained(model, 'en') \
        .setInputCols(['token', 'document']) \
        .setOutputCol('pred_class')

  pipeline = Pipeline(stages=[document_assembler, tokenizer, sequenceClassifier])

  df = spark.createDataFrame(text, StringType()).toDF("text")
  results[model]=(pipeline.fit(df).transform(df))

In [None]:
results = {}
for model, text in zip(model_dict.keys(),model_dict.values()):
  run_pipeline(model, text, results)

In [None]:
for model_name, result in zip(results.keys(),results.values()):  
  res = result.select(F.explode(F.arrays_zip(result.document.result, 
                                                  result.pred_class.result,
                                                  result.pred_class.metadata)).alias("col"))\
                  .select(F.expr("col['1']").alias("prediction"),
                          F.expr("col['2']").alias("confidence"),
                          F.expr("col['0']").alias("sentence"))
                  
  if res.count()>1:
    udf_func = F.udf(lambda x,y:  x["Some("+str(y)+")"])
    print("\n",model_name,"\n") 
    res.withColumn('confidence', udf_func(res.confidence, res.prediction)).show(truncate=False)
    print("\n**********************************\n") 

# Entities

In [None]:
import pandas as pd
import numpy as np
import json
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

In [None]:
text_list = test_sentences = ["""Death toll reaches 1,136 across country as flood threat lingers in KP""",\
    """Real faces' exposed: Miftah calls for Jhagra to resign, Tarin to quit politics after audio leaks""",\
        """Gill open to issuing apology over controversial remarks, counsel tells Islamabad court in sedition case"""]

In [None]:
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

tokenizer = Tokenizer()\
  .setInputCols("document")\
  .setOutputCol("token")  

tokenClassifier = BertForTokenClassification.pretrained("bert_token_classifier_ner_btc", "en")\
  .setInputCols("token", "document")\
  .setOutputCol("ner")\
  .setCaseSensitive(True)

ner_converter = NerConverter()\
        .setInputCols(["document","token","ner"])\
        .setOutputCol("ner_chunk")\
        

pipeline =  Pipeline(stages=[document, tokenizer, tokenClassifier, ner_converter])



In [None]:
model = pipeline.fit(spark.createDataFrame(pd.DataFrame({'text': ['']})))

result = model.transform(spark.createDataFrame(pd.DataFrame({'text': text_list})))


In [None]:

result.select(F.explode(F.arrays_zip('document.result', 'ner_chunk.result',"ner_chunk.metadata")).alias("cols")) \
.select(
        F.expr("cols['1']").alias("chunk"),
        F.expr("cols['2'].entity").alias('result')).show(truncate=False)

In [None]:
from sparknlp_display import NerVisualizer

for i in range(len(text_list)):
  NerVisualizer().display(
      result = result.collect()[i],
      label_col = 'ner_chunk',
      document_col = 'document'
  )



In [None]:
sc = spark.sparkContext

# using SQLContext to read parquet file
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

# to read parquet file
df = sqlContext.read.parquet('../parc/')

df.createOrReplaceTempView("ParquetTable")
df.printSchema()

In [None]:
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

tokenizer = Tokenizer()\
  .setInputCols("document")\
  .setOutputCol("token")  

tokenClassifier = BertForTokenClassification.pretrained("bert_token_classifier_ner_btc", "en")\
  .setInputCols("token", "document")\
  .setOutputCol("ner")\
  .setCaseSensitive(True)

ner_converter = NerConverter()\
        .setInputCols(["document","token","ner"])\
        .setOutputCol("ner_chunk")\
        

pipeline =  Pipeline(stages=[document, tokenizer, tokenClassifier, ner_converter])



In [None]:
model = pipeline.fit(df)

result = model.transform(df)

In [None]:
from sparknlp_display import NerVisualizer

NerVisualizer().display(
    result = result.collect()[0],
    label_col = 'ner_chunk',
    document_col = 'document'
)


In [None]:
result.printSchema()

In [None]:

result.select(F.explode(F.arrays_zip('document.result', 'ner_chunk.result',"ner_chunk")).alias("cols")).show(truncate=False)

In [None]:
# result.select("ner_chunk.result","ner_chunk.metadata").show(truncate=False)
# result.select('text', F.explode('ner_chunk.metadata').alias('clean_text')).show(truncate=False)
result.select('token.result','ner.result').show()

In [None]:
import json
import numpy as np

from sparknlp_display import NerVisualizer
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline



In [None]:
spark = sparknlp.start()

In [None]:
import pandas as pd

In [None]:
model_list = ["nerdl_fewnerd_100d","nerdl_fewnerd_subentity_100d"]

In [None]:
for MODEL_NAME in model_list:
  documentAssembler = DocumentAssembler()\
        .setInputCol("text")\
        .setOutputCol("document")

  sentenceDetector = SentenceDetector()\
        .setInputCols(["document"])\
        .setOutputCol("sentence")

  tokenizer = Tokenizer()\
        .setInputCols(["sentence"])\
        .setOutputCol("token")

  embeddings = WordEmbeddingsModel.pretrained("glove_100d", "en")\
            .setInputCols("sentence", "token") \
            .setOutputCol("embeddings")

  ner = NerDLModel.pretrained(MODEL_NAME)\
          .setInputCols(["sentence", "token", "embeddings"])\
          .setOutputCol("ner")

  ner_converter = NerConverter()\
      .setInputCols(['document', 'token', 'ner'])\
      .setOutputCol('ner_chunk')

  nlpPipeline = Pipeline(stages=[documentAssembler, sentenceDetector,
        tokenizer,
        embeddings,
        ner,
        ner_converter])




In [None]:
text_list = [
    """Seven injured in Quetta grenade attack""",
    """Disaster orthodoxy in Pakistan — floods aren't God's fury, we're the culprits and inaction is our sin"""
]
empty_data = spark.createDataFrame([[""]]).toDF("text")

ner_model = nlpPipeline.fit(empty_data)

df = spark.createDataFrame(pd.DataFrame({'text': text_list}))

result = ner_model.transform(df)
print("<----------------- MODEL NAME:","\033[1m" + MODEL_NAME + "\033[0m"," ----------------- >")
NerVisualizer().display(
      result = result.collect()[1],
      label_col = 'ner_chunk',
      document_col = 'document'
)

# GeoCoding

In [1]:
from geopy.geocoders import Nominatim

In [16]:
geolocator = Nominatim(user_agent="ehsan")
location = geolocator.geocode("Islamabad")
# print(location == None)
# print(location.address)

# print([location.latitude, location.longitude])

print(location.raw)

{'place_id': 1274860, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'node', 'osm_id': 316440978, 'boundingbox': ['33.5338118', '33.8538118', '72.9051511', '73.2251511'], 'lat': '33.6938118', 'lon': '73.0651511', 'display_name': 'اسلام آباد, وفاقی دارالحکومت اسلام آباد, 44000, پاکستان', 'class': 'place', 'type': 'city', 'importance': 0.5841564917637788, 'icon': 'https://nominatim.openstreetmap.org/ui/mapicons/poi_place_city.p.20.png'}


In [15]:
import folium


map1 = folium.Map(
    location=[30.8091281, 73.4493301],
    tiles='cartodbpositron',
    zoom_start=14,
)
# df.apply(lambda row:folium.CircleMarker(location=[row["latitude"], row["longitude"]]).add_to(map1), axis=1)
map1

In [45]:
import snscrape.modules.twitter as sntwitter
import csv
maxTweets = 100

keyword = 'MinusOneNaManzoor'
#place = '5e02a0f0d91c76d2' #This geo_place string corresponds to İstanbul, Turkey on twitter.

#keyword = 'covid'
#place = '01fbe706f872cb32' This geo_place string corresponds to Washington DC on twitter.

#Open/create a file to append data to
csvFile = open('place_result.csv', 'a', newline='', encoding='utf8')

#Use csv writer
csvWriter = csv.writer(csvFile)
csvWriter.writerow(['id','date','tweet',]) 

for i,tweet in enumerate(sntwitter.TwitterSearchScraper('MinusOneNaManzoor + since:2022-08-25 until:2022-09-10 -filter:links -filter:replies').get_items()):
        if i > maxTweets :
            break  
        csvWriter.writerow([tweet.id, tweet.date, tweet.content])
csvFile.close()

In [1]:
import json
import pandas as pd
import numpy as np

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

spark = sparknlp.start()

22/09/11 13:48:44 WARN Utils: Your hostname, zainab-ThinkPad-T560 resolves to a loopback address: 127.0.1.1; using 192.168.178.29 instead (on interface wlp4s0)
22/09/11 13:48:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/zainab/miniconda3/envs/sparknlp/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/zainab/.ivy2/cache
The jars for the packages stored in: /home/zainab/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d1fac87f-04e9-4030-bf0c-b95e7f2b1bdf;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;4.1.0 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in local-m2-cache
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlom

22/09/11 13:48:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/11 13:49:02 ERROR Inbox: Ignoring error
java.lang.NullPointerException
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$register(BlockManagerMasterEndpoint.scala:579)
	at org.apache.spark.storage.BlockManagerMasterEndpoint$$anonfun$receiveAndReply$1.applyOrElse(BlockManagerMasterEndpoint.scala:121)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:103)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.process(Inbox.scala:100)
	at org.apache.spark.rpc.netty.MessageLoop.org$apache$spark$rpc$netty$MessageLoop$$receiveLoop(MessageLoop.scala:75)
	at org.apache.spark.rpc.netty.MessageLoop$$anon$1.run(MessageLoop.scala:41)
	at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)
	at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.

In [3]:
text = """جیل جانے کیلئے تیار ہوں، بیگ تیار کر رکھا ہے لیکن میری گرفتاری کے بعد جو کچھ بھی ہوگا اُس کا ذمہ دار میں نہیں ہوں گا، عمران خان"""


In [4]:
documentAssembler = DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")

## More accurate Sentence Detection using Deep Learning
sentencerDL = SentenceDetectorDLModel()\
.pretrained("sentence_detector_dl", "xx")\
.setInputCols(["document"])\
.setOutputCol("sentences")

marian = MarianTransformer.pretrained("translate_ur_en", "xx")\
.setInputCols(["sentences"])\
.setOutputCol("translation")

nlp_pipeline = Pipeline(stages=[
    documentAssembler, 
    sentencerDL, marian
])

sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[ | ]sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
Download done! Loading the resource.
[ — ]

                                                                                

[ \ ]

2022-09-11 13:50:07.491093: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[OK!]
translate_ur_en download started this may take some time.
Approximate size to download 275.2 MB
[ | ]translate_ur_en download started this may take some time.
Approximate size to download 275.2 MB
[ / ]Download done! Loading the resource.

An error occurred while calling z:com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.downloadModel.
: java.lang.NoSuchMethodException: org.apache.spark.ml.PipelineModel.<init>(java.lang.String)
	at java.base/java.lang.Class.getConstructor0(Class.java:3349)
	at java.base/java.lang.Class.getConstructor(Class.java:2151)
	at org.apache.spark.ml.util.DefaultParamsReader.load(ReadWrite.scala:468)
	at com.johnsnowlabs.nlp.FeaturesReader.load(ParamsAndFeaturesReadable.scala:31)
	at com.johnsnowlabs.nlp.FeaturesReader.load(ParamsAndFeaturesReadable.scala:24)
	at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.downloadModel(ResourceDownloader.scala:500)
	at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.downloadModel(ResourceDownloader.sca

Py4JJavaError: An error occurred while calling z:com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.downloadModel.
: java.lang.NoSuchMethodException: org.apache.spark.ml.PipelineModel.<init>(java.lang.String)
	at java.base/java.lang.Class.getConstructor0(Class.java:3349)
	at java.base/java.lang.Class.getConstructor(Class.java:2151)
	at org.apache.spark.ml.util.DefaultParamsReader.load(ReadWrite.scala:468)
	at com.johnsnowlabs.nlp.FeaturesReader.load(ParamsAndFeaturesReadable.scala:31)
	at com.johnsnowlabs.nlp.FeaturesReader.load(ParamsAndFeaturesReadable.scala:24)
	at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.downloadModel(ResourceDownloader.scala:500)
	at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.downloadModel(ResourceDownloader.scala:492)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader$.downloadModel(ResourceDownloader.scala:666)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.downloadModel(ResourceDownloader.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [49]:
empty_df = spark.createDataFrame([['']]).toDF('text')
pipeline_model = nlp_pipeline.fit(empty_df)
lmodel = LightPipeline(pipeline_model)
res = lmodel.fullAnnotate(text)

NameError: name 'nlp_pipeline' is not defined

In [None]:
print ('Original:', text, '\n\n')

print ('Translated:\n')
for sentence in res[0]['translation']:
  print (sentence.result)

In [11]:
from sparknlp.pretrained import PretrainedPipeline 
pipeline = PretrainedPipeline("translate_ur_en", lang = "xx") 
pipeline.fullAnnotate("جیل جانے کیلئے تیار ہوں، بیگ تیار کر رکھا ہے لیکن میری گرفتاری کے بعد جو کچھ بھی ہوگا اُس کا ذمہ دار میں نہیں ہوں گا، عمران خان")


translate_ur_en download started this may take some time.
Approx size to download 275.2 MB
[OK!]


{'document': ['جیل جانے کیلئے تیار ہوں، بیگ تیار کر رکھا ہے لیکن میری گرفتاری کے بعد جو کچھ بھی ہوگا اُس کا ذمہ دار میں نہیں ہوں گا، عمران خان'],
 'sentence': ['جیل جانے کیلئے تیار ہوں، بیگ تیار کر رکھا ہے لیکن میری گرفتاری کے بعد جو کچھ بھی ہوگا اُس کا ذمہ دار میں نہیں ہوں گا، عمران خان'],
 'translation': ["I'm ready to go to prison, and I'm ready to keep the bag, but I'm not responsible for what I'm going to do after my arrest, despite my"]}

In [13]:
pipeline.annotate("inus 1 formula is strictly rejected by people of Pakistan")

{'document': ['inus 1 formula is strictly rejected by people of Pakistan'],
 'sentence': ['inus 1 formula is strictly rejected by people of Pakistan'],
 'translation': ['Flystine Laun Floudi Launun Floudi Lapri bute']}

22/09/19 19:26:08 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 704306213 ms exceeds timeout 120000 ms
22/09/19 19:26:08 WARN SparkContext: Killing executors is not supported by current scheduler.


In [None]:
text = """تم سے سنبھالا نہیں جائے گا یہ شوق گلے پڑ جائے گا"""

empty_df = spark.createDataFrame([['']]).toDF('text')
pipeline_model = pipeline.fit(empty_df)
lmodel = LightPipeline(pipeline_model)
res = lmodel.fullAnnotate(text)



