refactor structure of io package

usc-isi-i2 · Mar 21, 2018 · da2991f · da2991f
1 parent 02fd47d
commit da2991f
Show file tree

Hide file tree

Showing 49 changed files with 82 additions and 201,612 deletions.
diff --git a/examples/basic/basic.py b/examples/basic/basic.py
@@ -0,0 +1,65 @@
+import os
+import rltk
+
+
+class Record1(rltk.Record):
+    """Record view over one entry of the first dataset (read from ``ds1.csv``).
+
+    Both properties index ``raw_object`` directly by key; presumably it is a
+    dict-like row, in which case a missing column raises ``KeyError``.
+    """
+
+    @property
+    def id(self):
+        """Identifier of the record, taken from the ``doc_id`` field."""
+        entry = self.raw_object
+        return entry['doc_id']
+
+    @property
+    def value(self):
+        """Payload of the record, taken from the ``doc_value`` field."""
+        entry = self.raw_object
+        return entry['doc_value']
+
+
+class Record2(rltk.Record):
+    """Record view over one entry of the second dataset (read from ``ds2.jl``)."""
+
+    @property
+    def id(self):
+        """Return the identifier stored under the ``ident`` key."""
+        return self.raw_object['ident']
+
+    @property
+    def value(self):
+        """Return the first entry of ``values``, or ``''`` when there is none.
+
+        A missing ``values`` key, an explicit null and an empty list are all
+        treated as "no value". The sample data contains an entry without
+        ``values``.
+        """
+        # Truthiness instead of len(): a null ``values`` no longer raises TypeError.
+        v = self.raw_object.get('values')
+        return v[0] if v else ''
+
+
+# Resolve the sample data files relative to this script so the example also
+# runs when launched from a different working directory.
+data_dir = os.path.dirname(os.path.abspath(__file__))
+
+# First dataset: CSV rows wrapped as Record1, kept in memory.
+ds1 = rltk.Dataset(reader=rltk.CSVReader(filename=os.path.join(data_dir, 'ds1.csv')),
+                   record_class=Record1, adapter=rltk.MemoryAdapter())
+ds1.build_index()
+
+# Second dataset: JSON-lines entries wrapped as Record2, kept in memory.
+ds2 = rltk.Dataset(reader=rltk.JsonLinesReader(filename=os.path.join(data_dir, 'ds2.jl')),
+                   record_class=Record2, adapter=rltk.MemoryAdapter())
+ds2.build_index()
+
+# for r in ds1:
+#     print(r.id)
+# for r in ds2:
+#     print(r.id)
+# print(ds1.get_record('1').id)
+
+
+# blocking_file = '/path/to/blocks'
+# if not os.path.exists(blocking_file):
+#     # rltk.n_gram_blocking(
+#     #     iterator1=venture_it,
+#     #     tokens1=VentureRecord.assignee_token,
+#     #
+#     #     iterator2=patent_it,
+#     #     field2=PatentRecord.patent_name,
+#     #     n_size2=3,
+#     #
+#     #     output_filename='/path/to/blocks')
+#     rltk.inverted_index_blocking(
+#         iterator1=venture_it,
+#         # tokens1=VentureRecord.assignee_token,
+#         attribute1 = VentureRecord.assignee,
+#         iterator2=patent_it,
+#         tokens2=PatentRecord.patent_token,
+#
+#         output_filename='/path/to/blocks')
+#
+# NOTE(review): feature_vector is never read or appended to in this script;
+# presumably a placeholder for the similarity features sketched below.
+feature_vector = []
+# Candidate pairs across both datasets; no blocking file is applied here.
+pairs = rltk.get_record_pairs(ds1, ds2)  # same as iterating without blocks
+# pairs = rltk.iterate_on_datasets(ds1, ds2, '/path/to/blocks', batch_size=1000000)
+# Each item unpacks into one record from ds1 and one from ds2.
+for r1, r2 in pairs:
+    print(r1.id, r2.id)
+    # v1 = rltk.levenshtein_similarity(r1.value.lower(), r2.value.lower())
+    # print(v1)
+
diff --git a/examples/ex4/ex_data1.csv → examples/basic/ds1.csv b/examples/ex4/ex_data1.csv → examples/basic/ds1.csv
@@ -1,4 +1,4 @@
-id,value
+doc_id,doc_value
 1,hello
 2,world
 3,foo

diff --git a/examples/basic/ds2.jl b/examples/basic/ds2.jl
@@ -0,0 +1,5 @@
+{"ident": "a", "values":["a1"]}
+{"ident": "b", "values":["b1", "b2"]}
+{"ident": "c", "values":["c1"]}
+
+{"ident": "d"}
diff --git a/examples/ex1/df_corpus_1.txt b/examples/ex1/df_corpus_1.txt
diff --git a/examples/ex1/ex1.py b/examples/ex1/ex1.py
diff --git a/examples/ex1/jl_file_1.jsonl b/examples/ex1/jl_file_1.jsonl
diff --git a/examples/ex4/blocking.jsonl b/examples/ex4/blocking.jsonl
diff --git a/examples/ex4/ex4.py b/examples/ex4/ex4.py
diff --git a/examples/ex4/ex_data2.csv b/examples/ex4/ex_data2.csv
diff --git a/examples/ex4/feature.jsonl b/examples/ex4/feature.jsonl
diff --git a/examples/ex4/feature_config.json b/examples/ex4/feature_config.json
diff --git a/examples/ex4/label.jsonl b/examples/ex4/label.jsonl
diff --git a/examples/ex4/labeled_feature.jsonl b/examples/ex4/labeled_feature.jsonl
diff --git a/examples/ex4/predicted.jsonl b/examples/ex4/predicted.jsonl
diff --git a/examples/file_iterator/file_iter.py b/examples/file_iterator/file_iter.py
diff --git a/examples/file_iterator/file_iter_test.csv b/examples/file_iterator/file_iter_test.csv
diff --git a/examples/file_iterator/file_iter_test.jsonl b/examples/file_iterator/file_iter_test.jsonl
diff --git a/examples/file_iterator/file_iter_test.txt b/examples/file_iterator/file_iter_test.txt
diff --git a/examples/indexer/minhash_lsh_indexing.py b/examples/indexer/minhash_lsh_indexing.py
diff --git a/examples/indexer/qgram_indexing.py b/examples/indexer/qgram_indexing.py
diff --git a/examples/museum_gm/feature_config.json b/examples/museum_gm/feature_config.json
diff --git a/examples/museum_gm/feature_vector.jsonl b/examples/museum_gm/feature_vector.jsonl
diff --git a/examples/museum_gm/featurized.jsonl b/examples/museum_gm/featurized.jsonl
diff --git a/examples/museum_gm/gm.py b/examples/museum_gm/gm.py
diff --git a/examples/museum_gm/ground_truth.jsonl b/examples/museum_gm/ground_truth.jsonl