-
Notifications
You must be signed in to change notification settings - Fork 1
/
tableqa_finance_unsupervised.py
139 lines (105 loc) · 5.55 KB
/
tableqa_finance_unsupervised.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/env python
# coding: utf-8
import inspect
import json
import os
import urllib.request
from functools import reduce
from glob import glob
from time import sleep
from urllib.parse import quote
import jieba
import numpy as np
import pandas as pd
import seaborn as sns
from icecream import ic
from snorkel.labeling import PandasLFApplier, labeling_function
from snorkel.labeling.model import LabelModel
pd.set_option("display.max_rows", 200)
def read_fkg(path):
req = [[]]
with open(path, "r") as f:
lines = f.readlines()
for l in lines:
if l.startswith("{"):
req.append([])
req[-1].append(l)
#return req
return list(map(lambda x: json.loads("".join(map(lambda y: y.replace("\n", "").strip(), x)).replace("\n", "").strip()), req[1:]))
def finance_tag_filter(tag):
if type(tag) != type(""):
return False
kws = ["股票"]
return True if sum(map(lambda kw: kw in tag, kws)) else False
def retrieve_finance_tag_df(json_files):
def produce_df(req):
return pd.DataFrame(list(map(lambda y: y["data"] ,filter(lambda x: x["data"] and type(x["data"]) == type({}), req))))[["entity", "tag"]].explode("tag")
word_tag_df = pd.concat(list(map(lambda p: produce_df(read_fkg(p)), json_files)), axis = 0).drop_duplicates()
tags = word_tag_df["tag"].drop_duplicates()
finance_tags = tags[tags.map(finance_tag_filter)].tolist()
return word_tag_df[word_tag_df["tag"].isin(finance_tags)]
def filter_high_evidence_func(df):
with_title_part = high_evidence_finance_df[~pd.isna(high_evidence_finance_df["title"])].copy()
without_title_part = high_evidence_finance_df[pd.isna(high_evidence_finance_df["title"])].copy()
need_words = ["股", "证券"]
without_title_part = without_title_part[without_title_part["header"].map(lambda h_list: sum(map(lambda h: sum(map(lambda w: w in h, need_words)), h_list))).astype(bool)]
return pd.concat([with_title_part, without_title_part], axis = 0)
total_tables = pd.read_csv("train_table_extract_with_topic.csv")
finance_word_tag_df = retrieve_finance_tag_df(glob("finance*.json"))
finance_word_tag_df["entity_clean"] = finance_word_tag_df["entity"].map(lambda x: x[:x.find("[")] if "[" in x else x)
all_finance_tags = finance_word_tag_df["tag"].unique().tolist()
finance_tag_entities_dict = dict(map(lambda t2: (t2[0], t2[1]["entity_clean"].tolist()) ,finance_word_tag_df.groupby("tag")))
total_tables["fkwds"] = total_tables["all_text_str_elements"].map(lambda x: sorted(set(filter(lambda y: y in finance_word_tag_df["entity_clean"].tolist(), x.split(" ")))))
# Define the label mappings for convenience
#### ABSTAIN can not guess if finance
#ABSTAIN = -1
NOT_FINANCE = 0
FINANCE = 1
#### labeling function all take series as input
@labeling_function()
def if_top_topic_id(x):
return FINANCE if x.loc["top_topic"] in [16.0, 11.0] else NOT_FINANCE
@labeling_function()
def if_max_topic_id(x):
return FINANCE if x.loc["max_topic"] in [16.0, 11.0] else NOT_FINANCE
@labeling_function()
def if_other_topic(x):
return NOT_FINANCE if "TOP" in (x.loc["title"] if type(x.loc["title"]) == type("") else "") else FINANCE
#### generate labeling func
for i, t2 in enumerate(sorted(finance_tag_entities_dict.items())):
func_index = i
tag, entities_list = t2
func_def_format = '''@labeling_function()\ndef if_tag_in_entities_{}(x):\n\treturn FINANCE if set(x.loc["all_text_str_elements"].split(" ")).intersection({}) else NOT_FINANCE '''
func_def = func_def_format.format(func_index, set(entities_list))
ic(func_def)
exec(func_def.strip())
if_tag_in_entities_funcs = list(filter(lambda x: callable(eval(x)) and x.startswith("if_tag_in_entities_") ,dir()))
#### if funcs are all labeling function needed.
if_funcs = list(filter(lambda x: callable(eval(x)) and x.startswith("if_") ,dir()))
#### try run them on first record
dict(map(lambda func_name: (func_name, eval(func_name)(total_tables.iloc[0])), if_funcs))
# Define the set of labeling functions (LFs)
lfs = list(map(eval, if_funcs))
# Apply the LFs to the unlabeled training data
applier = PandasLFApplier(lfs)
L_train = applier.apply(total_tables)
total_tables_cp = total_tables.copy()
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
total_tables_cp["label"] = label_model.predict(L=L_train)
total_tables_cp["label"].value_counts()
label_fkwds_df = pd.DataFrame(total_tables_cp[["label", "fkwds"]].apply(lambda s: (s["label"], len(s["fkwds"])), axis = 1).values.tolist())
label_fkwds_df.columns = ["label", "fkwds_len"]
total_tables_cp["other_t"] = total_tables.apply(if_other_topic, axis = 1)
low_evidence_finance_df = total_tables_cp[(total_tables_cp["label"] == 1) & (total_tables_cp["other_t"] == 1)].copy()
#### low_part 房地产 和 其它部分("股" as finance)
#### high_part 金融 和 其它部分("票房" as non-finance)
low_part = low_evidence_finance_df[low_evidence_finance_df["fkwds"].map(len) == 1].copy()
high_part = low_evidence_finance_df[low_evidence_finance_df["fkwds"].map(len) > 1].copy()
high_evidence_finance_df = pd.concat([low_part[low_part["title"].map(lambda x: "股" in (x if type(x) == type("") else ""))],
high_part[high_part["title"].map(lambda x: "票房" not in (x if type(x) == type("") else ""))]
], axis = 0)
##### with some tiny noise not upto 10 as 房地产 建筑
##### use this as final require as finance
high_evidence_finance_df_filtered = filter_high_evidence_func(high_evidence_finance_df)
high_evidence_finance_df_filtered.to_csv("finance_table_desc.csv", index = False)