In [417]:
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.sql.types import *

from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp
from pyspark.sql import SparkSession


from sparknlp.annotator import *
from pyspark.ml import Pipeline
#sc = SparkContext()
import sparknlp
spark = sparknlp.start()


In [418]:
sqlContext = SQLContext(spark)



In [419]:
rdd_l = spark.sparkContext.wholeTextFiles("./*.html")


In [420]:
df = rdd_l.toDF(schema=["filename", "text"]).select("text")

In [421]:
df.show(10)

+--------------------+
|                text|
+--------------------+
|<!DOCTYPE html><h...|
|<!DOCTYPE html><h...|
|<!DOCTYPE html><h...|
|<!DOCTYPE html><h...|
|<!DOCTYPE html><h...|
|<!DOCTYPE html><h...|
|<!DOCTYPE html><h...|
|<!DOCTYPE html><h...|
|<!DOCTYPE html><h...|
|<!DOCTYPE html><h...|
+--------------------+



In [422]:
rules = '''

\d+&\w+;\d+&\w+;Kč* 

'''
#\S*\d+\S* Kč* 
with open('regex_rules.txt', 'w') as f:
    
    f.write(rules)

In [423]:
RegexMatcher().extractParamMap()

{Param(parent='RegexMatcher_cb41761ed270', name='lazyAnnotator', doc='Whether this AnnotatorModel acts as lazy in RecursivePipelines'): False,
 Param(parent='RegexMatcher_cb41761ed270', name='strategy', doc='MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE'): 'MATCH_ALL'}

In [424]:




documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("assembled")

regex_matcher = RegexMatcher()\
    .setInputCols('assembled')\
    .setStrategy("MATCH_ALL")\
    .setOutputCol("regex_matches")\
    .setExternalRules(path='./regex_rules.txt', delimiter=',')

nlpPipeline = Pipeline(stages=[
    documentAssembler, 
    regex_matcher
 ])

pipelineModel = nlpPipeline.fit(df)
#df = pipelineModel.transform(df)
#empty_df = spark.createDataFrame([['']]).toDF("text")

#pipelineModel = nlpPipeline.fit(empty_df)

match_df = pipelineModel.transform(df)

match_df.select('regex_matches.result').show(truncate=50)

+----------------------+
|                result|
+----------------------+
|[500&nbsp;000&nbsp;Kč]|
|[200&nbsp;000&nbsp;Kč]|
|[854&nbsp;900&nbsp;Kč]|
|[565&nbsp;000&nbsp;Kč]|
|[475&nbsp;000&nbsp;Kč]|
|[500&nbsp;000&nbsp;Kč]|
|[530&nbsp;000&nbsp;Kč]|
|[511&nbsp;000&nbsp;Kč]|
|[850&nbsp;000&nbsp;Kč]|
|[490&nbsp;000&nbsp;Kč]|
+----------------------+



In [462]:
rules2 = '''

\w*tion\-\w*ext\s+\w*ng\-\w*binding\"\>\w*............................


'''
#\w*tion\-\w*ext\s+\w*ng\-\w*binding\"\>\w*
#/<title>([^<]+)<\/title>/
#\w*tion\-\w*ext\s+\w*ng\-\w*binding\"\>\w+|\s.$\<\/
#\S*\d+\S* Kč*
#\d+&\w+;\d+&\w+;Kč* 
with open('regex_rules2.txt', 'w') as f:
    
    f.write(rules2)

In [463]:
RegexMatcher().extractParamMap()

documentAssembler2 = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("assembled")

regex_matcher2 = RegexMatcher()\
    .setInputCols('assembled')\
    .setStrategy("MATCH_ALL")\
    .setOutputCol("regex_address")\
    .setExternalRules(path='./regex_rules2.txt', delimiter=',')

nlpPipeline2 = Pipeline(stages=[
    documentAssembler2, 
    regex_matcher2
 ])

In [465]:
pipelineModel2 = nlpPipeline2.fit(df)

match_df2 = pipelineModel2.transform(df)

match_df2.select('regex_address.result').show(truncate=400)

+---------------------------------------------------------------+
|                                                         result|
+---------------------------------------------------------------+
|   [location-text ng-binding">Masná, Praha 1 - Staré Město</sp]|
|       [location-text ng-binding">Velvarská, Horoměřice</span>]|
|      [location-text ng-binding">Libušina, Karlovy Vary</span>]|
|    [location-text ng-binding">Pod Lipami, Praha 3 - Žižkov</s]|
|  [location-text ng-binding">Drnovská, Praha 6 - Ruzyně</span>]|
|      [location-text ng-binding">V Průhoně, Mukařov - Žernovka]|
|    [location-text ng-binding">Mladoboleslavská, Mělník</span>]|
| [location-text ng-binding">Grafická, Praha 5 - Smíchov</span>]|
|[location-text ng-binding">Sametová, Liberec - Liberec VI-Roch]|
|[location-text ng-binding">Jakubská, Praha 1 - Staré Město</sp]|
+---------------------------------------------------------------+



In [466]:
from pyspark.sql import functions as F

match_df2.select(F.explode('regex_address.result')).show(truncate=False)

+-------------------------------------------------------------+
|col                                                          |
+-------------------------------------------------------------+
|location-text ng-binding">Masná, Praha 1 - Staré Město</sp   |
|location-text ng-binding">Velvarská, Horoměřice</span>       |
|location-text ng-binding">Libušina, Karlovy Vary</span>      |
|location-text ng-binding">Pod Lipami, Praha 3 - Žižkov</s    |
|location-text ng-binding">Drnovská, Praha 6 - Ruzyně</span>  |
|location-text ng-binding">V Průhoně, Mukařov - Žernovka      |
|location-text ng-binding">Mladoboleslavská, Mělník</span>    |
|location-text ng-binding">Grafická, Praha 5 - Smíchov</span> |
|location-text ng-binding">Sametová, Liberec - Liberec VI-Roch|
|location-text ng-binding">Jakubská, Praha 1 - Staré Město</sp|
+-------------------------------------------------------------+



In [467]:
result_df2 = match_df2.select(F.explode('regex_address.result').alias('location-text ng-binding')).toPandas()
result_df2

Unnamed: 0,location-text ng-binding
0,"location-text ng-binding"">Masná, Praha 1 - Staré Město</sp"
1,"location-text ng-binding"">Velvarská, Horoměřice</span>"
2,"location-text ng-binding"">Libušina, Karlovy Vary</span>"
3,"location-text ng-binding"">Pod Lipami, Praha 3 - Žižkov</s"
4,"location-text ng-binding"">Drnovská, Praha 6 - Ruzyně</span>"
5,"location-text ng-binding"">V Průhoně, Mukařov - Žernovka"
6,"location-text ng-binding"">Mladoboleslavská, Mělník</span>"
7,"location-text ng-binding"">Grafická, Praha 5 - Smíchov</span>"
8,"location-text ng-binding"">Sametová, Liberec - Liberec VI-Roch"
9,"location-text ng-binding"">Jakubská, Praha 1 - Staré Město</sp"


In [468]:
rules3 = '''


<div\s\w*ss[^>]*(.+?)

'''
#<!--\s\w*eat.\s\w*[^</div](.+?)/\d
#(\<\w*div\s\w*lass\=\"\w*arams\s\w*lear\<)
#\<\w*div\s\w*lass\=\"\w*arams\s\w*lear..\n\<\!\-\-\s\w*eat\:
#\<\w*div\s\w*lass\=\"\w*arams\.*class="param-label ng-binding.*\n
#.*Eventname .*\n
#\<\w*div\s\w*lass\=\"\w*arams\s\w*lear..\n\<\!\-\-\s\w*eat
#\<\w*div\s\w*lass\=\"\w*arams\s\w*lear
#\w*tion\-\w*ext\s+\w*ng\-\w*binding\"\>\w*........................
#\w*tion\-\w*ext\s+\w*ng\-\w*binding\"\>\w*
#\w*tion\-\w*ext\s+\w*ng\-\w*binding\"\>\w*.........................
#/<title>([^<]+)<\/title>/
#\w*tion\-\w*ext\s+\w*ng\-\w*binding\"\>\w+|\s.$\<\/
#\S*\d+\S* Kč*
#\d+&\w+;\d+&\w+;Kč* 
with open('regex_rules3.txt', 'w') as f:
    
    f.write(rules3)

In [434]:
RegexMatcher().extractParamMap()

documentAssembler3 = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("assembled")

regex_matcher3 = RegexMatcher()\
    .setInputCols('assembled')\
    .setStrategy("MATCH_ALL")\
    .setOutputCol("regex_params")\
    .setExternalRules(path='./regex_rules3.txt', delimiter=',')

nlpPipeline3 = Pipeline(stages=[
    documentAssembler3, 
    regex_matcher3
 ])

In [435]:
#<div\s\w*[^>]*

In [436]:
pipelineModel3 = nlpPipeline3.fit(df)

match_df3 = pipelineModel3.transform(df)

match_df3.select('regex_params.result').show(truncate=80000)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

match_df.select('regex_matches.result')

In [437]:
match_df2.select('regex_address.result').show(truncate=False)

+------------------------------------------------------------+
|result                                                      |
+------------------------------------------------------------+
|[location-text ng-binding">Masná, Praha 1 - Staré Město<]   |
|[location-text ng-binding">Velvarská, Horoměřice</span>]    |
|[location-text ng-binding">Libušina, Karlovy Vary</span>]   |
|[location-text ng-binding">Pod Lipami, Praha 3 - Žižkov]    |
|[location-text ng-binding">Drnovská, Praha 6 - Ruzyně</span]|
|[location-text ng-binding">V Průhoně, Mukařov - Žerno]      |
|[location-text ng-binding">Mladoboleslavská, Mělník</span>] |
|[location-text ng-binding">Grafická, Praha 5 - Smíchov</spa]|
|[location-text ng-binding">Sametová, Liberec - Liberec VI-R]|
|[location-text ng-binding">Jakubská, Praha 1 - Staré Město<]|
+------------------------------------------------------------+



In [353]:
rdd_df_join = match_df.select('regex_matches.result').join(match_df2.select('regex_address.result'))

In [416]:
match_df_pd_cost = match_df.select('regex_matches.result').toPandas()
match_df_pd_address = match_df2.select('regex_address.result').toPandas()

AttributeError: 'PipelinedRDD' object has no attribute 'select'

In [369]:
match_df_pd

Unnamed: 0,result
0,[500&nbsp;000&nbsp;Kč]
1,[200&nbsp;000&nbsp;Kč]
2,[854&nbsp;900&nbsp;Kč]
3,[565&nbsp;000&nbsp;Kč]
4,[475&nbsp;000&nbsp;Kč]
5,[500&nbsp;000&nbsp;Kč]
6,[530&nbsp;000&nbsp;Kč]
7,[511&nbsp;000&nbsp;Kč]
8,[850&nbsp;000&nbsp;Kč]
9,[490&nbsp;000&nbsp;Kč]


In [382]:
pd.set_option('display.max_colwidth', None)

In [376]:
import pandas as pd

In [383]:
df_concat_cost_address = pd.concat([match_df_pd_cost,match_df_pd_address], axis=1)

In [384]:
df_concat_cost_address

Unnamed: 0,result,result.1
0,[500&nbsp;000&nbsp;Kč],"[location-text ng-binding"">Masná, Praha 1 - Staré Město<]"
1,[200&nbsp;000&nbsp;Kč],"[location-text ng-binding"">Velvarská, Horoměřice</span>]"
2,[854&nbsp;900&nbsp;Kč],"[location-text ng-binding"">Libušina, Karlovy Vary</span>]"
3,[565&nbsp;000&nbsp;Kč],"[location-text ng-binding"">Pod Lipami, Praha 3 - Žižkov]"
4,[475&nbsp;000&nbsp;Kč],"[location-text ng-binding"">Drnovská, Praha 6 - Ruzyně</span]"
5,[500&nbsp;000&nbsp;Kč],"[location-text ng-binding"">V Průhoně, Mukařov - Žerno]"
6,[530&nbsp;000&nbsp;Kč],"[location-text ng-binding"">Mladoboleslavská, Mělník</span>]"
7,[511&nbsp;000&nbsp;Kč],"[location-text ng-binding"">Grafická, Praha 5 - Smíchov</spa]"
8,[850&nbsp;000&nbsp;Kč],"[location-text ng-binding"">Sametová, Liberec - Liberec VI-R]"
9,[490&nbsp;000&nbsp;Kč],"[location-text ng-binding"">Jakubská, Praha 1 - Staré Město<]"


In [385]:
rdd_temp =match_df.zip(match_df2)

AttributeError: 'DataFrame' object has no attribute 'zip'