Permalink
Browse files

wip

* Cleaning exact names before matching
* Passing the results of group-by-feature to disjoint-sets to prevent
  some race conditions.
  • Loading branch information...
1 parent 49d774a commit bd94b3ad29103a7a1badece891dc9e4110acf5db Zack Maril committed Jul 17, 2014
Showing with 25 additions and 5 deletions.
  1. +2 −1 project.clj
  2. +6 −3 src/echelon/core.clj
  3. +17 −1 src/echelon/util.clj
View
@@ -7,5 +7,6 @@
[com.datomic/datomic-free "0.9.4815.12"]
[org.clojure/data.json "0.2.5"]
[me.raynes/fs "1.4.4"]
- [instaparse "1.3.2"]]
+ [instaparse "1.3.2"]
+ [org.jordanlewis/data.union-find "0.1.0"]]
:main echelon.core)
View
@@ -2,8 +2,8 @@
(:require [datomic.api :as d :refer [db q]]
[clojure.pprint :refer [pprint]]
[echelon.load :refer [load-database!]]
- [echelon.text :refer [extract-names]]
- [echelon.util :refer [group-by-features]]))
+ [echelon.text :refer [extract-names clean]]
+ [echelon.util :refer [group-by-features disjoint-lists]]))
(def uri "datomic:free://localhost:4334/echelon")
@@ -55,7 +55,7 @@
(println "Merging based on exact names")
(let [dbc
(->> (beings-and-names dbc)
- (group-by second)
+ (group-by (comp clean second))
seq
(map second)
(filter #(< 1 (count %)))
@@ -75,6 +75,7 @@
(map second)
(filter #(< 1 (count %)))
(map (partial map first))
+ disjoint-lists
(mapcat (partial merges-for-beings dbc))
(d/with dbc)
:db-after)]
@@ -95,6 +96,8 @@
(println (how-many? (db c)))))
(defn match-data []
+ (println "Starting merge process")
+ (println (how-many? (db (d/connect uri))))
(as-> (db (d/connect uri)) hypothetical
(merges-based-on-exact-name hypothetical)
(merges-based-on-extracted-name hypothetical)
View
@@ -1,4 +1,6 @@
-(ns echelon.util)
+(ns echelon.util
+ (:require [jordanlewis.data.union-find :refer
+ [union-find union get-canonical]]))
(defn group-by-features
"Like group-by, but inserts values multiple times based on f
@@ -14,3 +16,17 @@
(f x)))
(transient {})
coll)))
+
+;(disjoint-sets [[1 2 3] [4 5 6] [10 12 13] [1 4 2] [2 3 4 5 6])
+(defn disjoint-lists [lsts]
+ "Given a list of lists, returns the disjoint sets formed by the
+ equivalence classes described by the arguments. "
+ (let [els (distinct (apply concat lsts))
+ uf (apply union-find els)
+ uf (reduce
+ (fn [uf [fst & rst]] (reduce #(union %1 fst %2) uf rst))
+ uf lsts)]
+ (->> (.elt-map uf)
+ keys
+ (group-by uf)
+ vals)))

0 comments on commit bd94b3a

Please sign in to comment.