In [1]:
import json
import numpy as np
import collections
import copy
from os import listdir
from os.path import isfile, join

In [2]:
import findspark
findspark.init()
from pyspark import SparkContext
import pyspark
# Spark resource settings for this notebook, one (key, value) pair per line.
conf = pyspark.SparkConf().setAll([
    ('spark.executor.memory', '8g'),
    ('spark.executor.cores', '2'),
    ('spark.executor.instances', '7'),
    ('spark.driver.memory', '32g'),
    ('spark.driver.maxResultSize', '10g'),
])
sc = SparkContext(conf=conf)

In [3]:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, FloatType, StringType
from pyspark.sql.types import Row
from pyspark.sql import SparkSession
# Build a SparkSession on top of the SparkContext created above; it is used
# later for spark.createDataFrame.
spark = SparkSession(sc)

In [4]:
def convert_ndarray_back(x):
    """Replace the value stored under ``'entityCell'`` with a NumPy array.

    The record ``x`` is modified in place and also returned, so the function
    can be used directly inside an RDD ``map``.
    """
    cell_array = np.array(x['entityCell'])
    x['entityCell'] = cell_array
    return x
data_dir = "../../data/"
# Each line of the JSONL file is one table: strip it, parse the JSON, then
# restore the 'entityCell' field to a NumPy array.
train_tables = sc.textFile(data_dir + "train_tables.jsonl").map(
    lambda line: convert_ndarray_back(json.loads(line.strip()))
)

In [45]:
def get_core_entity_caption_label(x):
    """Extract the core entities, id, caption and first header of one table.

    A cell counts as a core-entity cell when it is nonzero in ``x['entityCell']``,
    sits in column 0, and column 0 is listed in ``x['entityColumn']``. For each
    such cell the target id of its first surface link is collected (duplicates
    are dropped).

    Returns:
        A 4-tuple ``(entity_ids, x['_id'], x['tableCaption'],
        x['processed_tableHeaders'][0])`` where ``entity_ids`` is a list.
    """
    linked_positions = zip(*x['entityCell'].nonzero())
    core_entities = {
        x['tableData'][row][col]['surfaceLinks'][0]['target']['id']
        for row, col in linked_positions
        if col == 0 and col in x['entityColumn']
    }
    return list(core_entities), x["_id"], x['tableCaption'], x["processed_tableHeaders"][0]

In [16]:
from operator import add

In [46]:
# One record per table: (core entity ids, table id, caption, first header).
table_rdd = train_tables.map(get_core_entity_caption_label)
# Fan out to one record per (entity, table): (entity id, table id, caption, first header).
entity_rdd = table_rdd.flatMap(
    lambda rec: [(entity, rec[1], rec[2], rec[3]) for entity in rec[0]]
)

In [43]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover

In [47]:
# Turn the per-table tuples into a DataFrame with named columns.
table_df = spark.createDataFrame(
    table_rdd,
    schema=["entities", "table_id", "caption", "header"],
)

In [54]:
# Caption pipeline stages: tokenize, then drop stop words.
caption_tokenizer = Tokenizer(inputCol="caption", outputCol="caption_term")
caption_remover = StopWordsRemover(inputCol="caption_term", outputCol="caption_term_cleaned")

# Header pipeline stages: same two steps on the header column.
header_tokenizer = Tokenizer(inputCol="header", outputCol="header_term")
header_remover = StopWordsRemover(inputCol="header_term", outputCol="header_term_cleaned")

# Loaded only so the list can be inspected; it is not passed to the removers above.
list_stopwords = StopWordsRemover.loadDefaultStopWords("english")

In [87]:
# Display the English stop-word list loaded by loadDefaultStopWords above.
list_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 '

In [77]:
# Run the four stages in order (caption tokenize -> caption stop words ->
# header tokenize -> header stop words), then keep only the columns used below.
table_df_tokenizered = table_df
for stage in (caption_tokenizer, caption_remover, header_tokenizer, header_remover):
    table_df_tokenizered = stage.transform(table_df_tokenizered)
table_df_tokenizered = table_df_tokenizered.select(
    "entities", "table_id", "caption_term_cleaned", "header_term_cleaned", "header"
)

In [78]:
# Preview the tokenized, stop-word-filtered frame.
table_df_tokenizered.show()

+--------------------+----------+--------------------+--------------------+--------------------+
|            entities|  table_id|caption_term_cleaned| header_term_cleaned|              header|
+--------------------+----------+--------------------+--------------------+--------------------+
|          [27282555]|27281853-1|        [references]| [military, offices]|    military offices|
|   [450099, 1702543]|   27282-1|[main, office, ho...|            [office]|              office|
|  [23867939, 429187]|27282227-1|   [primate, poland]|[catholic, church...|catholic church t...|
|          [27283377]|27282555-1|        [references]| [military, offices]|    military offices|
|          [22583176]|27282731-3|   [external, links]|      [achievements]|        achievements|
|[2086865, 2172188...|27283077-1|     [qualification]|           [country]|             country|
|[1019331, 4019429...|27283077-2|            [venues]|        [gothenburg]|          gothenburg|
|           [4061083]|27283077

In [79]:
# Corpus-wide counts, each collected to the driver as a list of (key, count) pairs.

# How often each cleaned caption term occurs over all tables.
caption_term_freq = (
    table_df_tokenizered.select("caption_term_cleaned").rdd
    .flatMap(lambda row: [(term, 1) for term in row["caption_term_cleaned"]])
    .reduceByKey(add)
    .collect()
)
# How often each cleaned header term occurs over all tables.
header_term_freq = (
    table_df_tokenizered.select("header_term_cleaned").rdd
    .flatMap(lambda row: [(term, 1) for term in row["header_term_cleaned"]])
    .reduceByKey(add)
    .collect()
)
# How often each full header string occurs over all tables.
header_freq = (
    table_df_tokenizered.select("header").rdd
    .map(lambda row: (row["header"], 1))
    .reduceByKey(add)
    .collect()
)

In [80]:
# Number of distinct header strings (header_freq holds one entry per header).
len(header_freq)

20415

In [81]:
# Explode the per-table entity list so every row describes one (entity, table) pair.
entity_df = table_df_tokenizered.select(
    F.explode("entities").alias("entity"),
    "table_id",
    "caption_term_cleaned",
    "header_term_cleaned",
    "header",
)

In [82]:
# Per-entity counts. Each pipeline first counts ((entity, key), 1) pairs, then
# regroups by entity by concatenating single-element [(key, count)] lists, and
# finally collects a list of (entity, [(key, count), ...]) pairs.

# Caption terms seen with each entity.
entity_caption_term_freq = (
    entity_df.select("entity", "caption_term_cleaned").rdd
    .flatMap(lambda row: [((row["entity"], term), 1) for term in row["caption_term_cleaned"]])
    .reduceByKey(add)
    .map(lambda kv: (kv[0][0], [(kv[0][1], kv[1])]))
    .reduceByKey(add)
    .collect()
)
# Header terms seen with each entity.
entity_header_term_freq = (
    entity_df.select("entity", "header_term_cleaned").rdd
    .flatMap(lambda row: [((row["entity"], term), 1) for term in row["header_term_cleaned"]])
    .reduceByKey(add)
    .map(lambda kv: (kv[0][0], [(kv[0][1], kv[1])]))
    .reduceByKey(add)
    .collect()
)
# Full header strings seen with each entity.
entity_header_freq = (
    entity_df.select("entity", "header").rdd
    .map(lambda row: ((row["entity"], row["header"]), 1))
    .reduceByKey(add)
    .map(lambda kv: (kv[0][0], [(kv[0][1], kv[1])]))
    .reduceByKey(add)
    .collect()
)

In [84]:
# Map every entity to the list of table ids it appears in.
# The result is collected as a dict (entity -> [table_id, ...]) rather than a
# list of pairs: the consistency-check cell further down iterates over
# entity_tables and indexes it by entity, which only works on a mapping, and
# the other entity_* structures in this notebook are dicts as well.
entity_tables = (
    entity_df.select("entity", "table_id")
    .groupBy("entity")
    .agg(F.collect_list("table_id").alias("tables"))
    .rdd.map(lambda row: (row['entity'], row['tables']))
    .collectAsMap()
)

In [100]:
import pickle

In [105]:
# Persist entity_tables (entity -> table ids, as collected above) to disk.
with open("../../data/entity_tables.pkl","wb") as f:
    pickle.dump(entity_tables, f)

In [103]:
# collect() produced a list of (entity, [(header, count), ...]) pairs. It has to
# be keyed by entity first, exactly like the two sibling entity_*_term_freq
# cells do; iterating the raw list would yield tuples and the item assignment
# below would raise TypeError.
# NOTE: this cell rewrites entity_header_freq in place, so it must run exactly
# once after the collect() cell.
entity_header_freq = dict(entity_header_freq)
for e in entity_header_freq:
    # Store [total count over all headers, {header: count}] per entity.
    entity_header_freq[e] = [sum([count for _,count in entity_header_freq[e]]),dict(entity_header_freq[e])]

with open("../../data/entity_header_freq.pkl","wb") as f:
    pickle.dump(entity_header_freq, f)

In [106]:
# Re-key the collected (entity, [(term, count), ...]) pairs by entity and store
# [total count, {term: count}] for each one, then pickle the mapping.
entity_header_term_freq = {
    entity: [sum(count for _, count in term_counts), dict(term_counts)]
    for entity, term_counts in dict(entity_header_term_freq).items()
}

with open("../../data/entity_header_term_freq.pkl","wb") as pkl_file:
    pickle.dump(entity_header_term_freq, pkl_file)

In [107]:
# Re-key the collected (entity, [(term, count), ...]) pairs by entity and store
# [total count, {term: count}] for each one, then pickle the mapping.
entity_caption_term_freq = {
    entity: [sum(count for _, count in term_counts), dict(term_counts)]
    for entity, term_counts in dict(entity_caption_term_freq).items()
}

with open("../../data/entity_caption_term_freq.pkl","wb") as pkl_file:
    pickle.dump(entity_caption_term_freq, pkl_file)

In [109]:
# Each corpus-level table is converted to a dict and pickled as
# [total count, {key: count}].
caption_term_freq = dict(caption_term_freq)
caption_term_total = sum(caption_term_freq.values())
with open("../../data/caption_term_freq.pkl","wb") as pkl_file:
    pickle.dump([caption_term_total, caption_term_freq], pkl_file)

header_term_freq = dict(header_term_freq)
header_term_total = sum(header_term_freq.values())
with open("../../data/header_term_freq.pkl","wb") as pkl_file:
    pickle.dump([header_term_total, header_term_freq], pkl_file)

header_freq = dict(header_freq)
header_total = sum(header_freq.values())
with open("../../data/header_freq.pkl","wb") as pkl_file:
    pickle.dump([header_total, header_freq], pkl_file)

In [99]:
# Sanity check: the number of tables recorded for an entity should equal the
# total header count stored for it. Prints the first mismatch, if any.
# By this point entity_header_freq[e] has the form [total, {header: count}]
# (built in the cell that pickles it), so the total is read from index 0
# instead of being re-summed from (header, count) pairs.
# dict(...) accepts entity_tables either as a mapping or as a list of
# (entity, tables) pairs.
for e, tables in dict(entity_tables).items():
    header_total = entity_header_freq[e][0]
    if len(tables) != header_total:
        print(e, len(tables), header_total)
        break

In [108]:
# caption_term_freq is a dict keyed by term at this point, so positional
# indexing with [0] would raise KeyError. dicts keep insertion order, so this
# still shows the first collected (term, count) pair.
next(iter(caption_term_freq.items()))

('references', 35319)

In [102]:
# Spot-check one entity: the entry is [total count, {header: count}].
entity_header_freq[1677]

[7,
 {'titles in pretence': 1,
  'descendent': 2,
  'image': 1,
  'political offices': 1,
  'name of descendant': 1,
  'name': 1}]

In [113]:
# Spot-check up to 10 (entity, table_id, caption, header) records for one entity id.
entity_rdd.filter(lambda x:x[0]==5839439).take(10)

[(5839439, '39405618-3', 'Other', 'record name')]

In [1]:
from tqdm import tqdm_notebook