Permalink
Browse files

Moving files over naively from local project. Previous commits didn't…

… have useful messages.
  • Loading branch information...
Zack Maril
Zack Maril committed Jul 11, 2014
1 parent 17582bf commit 2005c1264d2aa3f61383a0faa056e25817cddc96
Showing with 409 additions and 3 deletions.
  1. +10 −0 .gitignore
  2. +13 −3 README.md
  3. +11 −0 project.clj
  4. +115 −0 src/echelon/core.clj
  5. +70 −0 src/echelon/load.clj
  6. +15 −0 src/echelon/parser.bnf
  7. +65 −0 src/echelon/schema.clj
  8. +54 −0 src/echelon/schema.edn
  9. +33 −0 src/echelon/text.clj
  10. +16 −0 src/echelon/util.clj
  11. +7 −0 test/echelon/core_test.clj
View
@@ -0,0 +1,10 @@
+/target
+/classes
+/checkouts
+pom.xml
+pom.xml.asc
+*.jar
+*.class
+/.lein-*
+/.nrepl-port
+/output
View
@@ -1,4 +1,14 @@
-echelon
-=======
+# echelon
-library for ingesting and storing itemized data. also reconciles entities and assigns unique IDs.
+A Clojure library for ingesting and storing itemized data. It also reconciles entities and assigns unique IDs.
+
+## Usage
+
+FIXME
+
+## License
+
+Copyright © 2014 FIXME
+
+Distributed under the Eclipse Public License either version 1.0 or (at
+your option) any later version.
View
@@ -0,0 +1,11 @@
+(defproject echelon "0.1.0-SNAPSHOT"
+ :description "FIXME: write description"
+ :url "http://example.com/FIXME"
+ :license {:name "Eclipse Public License"
+ :url "http://www.eclipse.org/legal/epl-v10.html"}
+ :dependencies [[org.clojure/clojure "1.6.0"]
+ [com.datomic/datomic-free "0.9.4815.12"]
+ [org.clojure/data.json "0.2.5"]
+ [me.raynes/fs "1.4.4"]
+ [instaparse "1.3.2"]]
+ :main echelon.core)
View
@@ -0,0 +1,115 @@
+(ns echelon.core
+ (:require [datomic.api :as d :refer [db q]]
+ [clojure.pprint :refer [pprint]]
+ [echelon.load :refer [load-database!]]
+ [echelon.text :refer [extract-names]]
+ [echelon.util :refer [group-by-features]]))
+
+;; Datomic connection URI used by -main for delete-database, create-database and connect.
+(def uri "datomic:free://localhost:4334/echelon")
+
+(defn how-many?
+  "Counts the beings represented by client records and by firm records.
+  Returns a map of the form {:clients n, :firms m}."
+  [dbc]
+  (letfn [(count-beings [being-type]
+            ;; The query yields a single [count] tuple; ffirst unwraps it.
+            (ffirst
+             (d/q '[:find (count ?being)
+                    :in $ ?type
+                    :where
+                    [?r :being/type ?type]
+                    [?r :being/represents ?being]]
+                  dbc
+                  being-type)))]
+    {:clients (count-beings :being.type/:client)
+     :firms   (count-beings :being.type/:firm)}))
+
+
+(defn merges-for-beings
+  "Builds transaction data that folds every being after the first into the
+  first one.
+
+  dbc          - database value to query for the records of each being.
+  [b1 & b2s]   - being entity ids; b1 survives, the rest are merged away.
+
+  For each being in b2s, every record whose :being/represents points at it
+  is re-pointed at b1, and the being itself is retracted. Returns a vector
+  of [:db/add ...] and [:db.fn/retractEntity ...] operations (empty when
+  there is nothing to merge)."
+  [dbc [b1 & b2s]]
+  (let [;; Merging a being into itself must be a no-op, and a being listed
+        ;; twice must only be retracted once.
+        others   (distinct (remove #(= b1 %) b2s))
+        ;; One [?record] tuple for every record of every being merged away.
+        records  (mapcat #(d/q '[:find ?record
+                                 :in $ ?being
+                                 :where [?record :being/represents ?being]]
+                               dbc
+                               %)
+                         others)
+        ;; Re-point all of those records. Taking only the first record per
+        ;; being would strand the others once the being is retracted, and a
+        ;; being without records would produce a [:db/add nil ...] operation.
+        adds     (mapv #(vector :db/add (first %) :being/represents b1)
+                       records)
+        retracts (map #(vector :db.fn/retractEntity %) others)]
+    (vec (concat adds retracts))))
+
+;; Datalog rules passed as the % input of the name queries in this namespace:
+;; (name-of ?record ?name) matches a record's :client/name or its :firm/name.
+(def rules '[[(name-of ?record ?name) [?record :client/name ?name]]
+ [(name-of ?record ?name) [?record :firm/name ?name]]])
+
+;; Queries dbc for [being name] pairs: every entity typed :being.type/:being,
+;; paired with the client or firm name (via the name-of rule) of each record
+;; that represents it.
+(defn beings-and-names [dbc]
+ (d/q '[:find ?being ?name :in $ %
+ :where
+ [?being :being/type :being.type/:being]
+ [?record :being/represents ?being]
+ (name-of ?record ?name)]
+ dbc
+ rules))
+
+(defn merges-based-on-exact-name
+  "Merges beings whose records carry exactly the same name.
+  Applies the merges speculatively with d/with, prints the resulting counts
+  and returns the resulting database value."
+  [dbc]
+  (println "Merging based on exact names")
+  (let [pairs     (beings-and-names dbc)
+        ;; Beings that share a name; names owned by one being are skipped.
+        clusters  (for [[_ rows] (group-by second pairs)
+                        :when (< 1 (count rows))]
+                    (map first rows))
+        tx-data   (mapcat #(merges-for-beings dbc %) clusters)
+        merged-db (:db-after (d/with dbc tx-data))]
+    (println (how-many? merged-db))
+    merged-db))
+
+(defn merges-based-on-extracted-name
+  "Merges beings that group-by-features puts together, using the names
+  extracted from each being's name as the features.
+  Applies the merges speculatively with d/with, prints the resulting counts
+  and returns the resulting database value."
+  [dbc]
+  (println "Merging based on extracted names")
+  (let [pairs     (beings-and-names dbc)
+        feature   (fn [pair] (extract-names (second pair)))
+        grouped   (group-by-features feature pairs)
+        ;; Keep only the groups that hold more than one [being name] pair.
+        crowded   (filter #(< 1 (count %))
+                          (map second (seq grouped)))
+        clusters  (map #(map first %) crowded)
+        tx-data   (mapcat #(merges-for-beings dbc %) clusters)
+        merged-db (:db-after (d/with dbc tx-data))]
+    (println (how-many? merged-db))
+    merged-db))
+
+(defn -main
+  "Command-line entry point.
+
+  arg - \"load\":  deletes and recreates the database at `uri`, loads schema
+                 and data into it, and prints the being counts.
+        \"match\": runs the exact-name and extracted-name merges against a
+                 speculative database value and writes the surviving beings
+                 with their names to names-output.clj.
+
+  Exits the JVM with status 0 on success. Any other argument prints a usage
+  message to stderr and exits with status 1."
+  [arg]
+  (condp = arg
+    "load"
+    (do
+      (d/delete-database uri)
+      (d/create-database uri)
+      ;; A local binding instead of `def`: defining a var from inside a
+      ;; function body creates global state as a side effect of running it.
+      (let [conn (d/connect uri)]
+        (println "Loading Database...")
+        (load-database! conn)
+        (println "Loaded!")
+        (println (how-many? (db conn))))
+      (java.lang.System/exit 0))
+    "match"
+    (do
+      (as-> (db (d/connect uri)) hypothetical
+        (merges-based-on-exact-name hypothetical)
+        (merges-based-on-extracted-name hypothetical)
+        (->> (d/q '[:find ?being ?name
+                    :in $ %
+                    :with ?record
+                    :where
+                    [?being :being/type :being.type/:being]
+                    [?record :being/represents ?being]
+                    (name-of ?record ?name)]
+                  hypothetical
+                  rules)
+             distinct
+             (group-by first)
+             seq
+             ;; [being [[being name] ...]] -> [being [name ...]]
+             (map (fn [[b bs]] [b (map second bs)]))
+             ;; Order the output by each being's first name.
+             (sort-by (comp first second))
+             pprint
+             with-out-str
+             (spit "names-output.clj")))
+      (java.lang.System/exit 0))
+    ;; Default clause: without it condp throws an opaque
+    ;; "No matching clause" IllegalArgumentException.
+    (do
+      (binding [*out* *err*]
+        (println "Unknown command:" (pr-str arg)
+                 "- expected \"load\" or \"match\""))
+      (java.lang.System/exit 1))))
View
@@ -0,0 +1,70 @@
+(ns echelon.load
+ (:require [datomic.api :as d :refer [db q]]
+ [clojure.data.json :as json]
+ [echelon.text :refer [clean]]
+ [me.raynes.fs :as fs]
+ [clojure.pprint :refer [pprint]]))
+
+;; Root directory that list-ld1-forms and list-ld2-forms glob under.
+;; NOTE(review): absolute path into one user's home directory - confirm before running elsewhere.
+(def datadir "/home/zmaril/data/original/sopr_html/")
+
+(defn list-ld1-forms
+  "Returns one flat sequence of everything matching
+  <datadir>/<year>/REG/* for the years 2008 through 2014."
+  []
+  (for [year (range 2008 2015)
+        form (fs/glob (str datadir "/" year "/REG/*"))]
+    form))
+
+(defn list-ld2-forms
+  "Returns one glob result per (year, quarter) pair, for the years 2004
+  through 2014 and quarters 1 through 4, matching
+  <datadir>LD2/<year>/Q<quarter>/*. The results are not flattened."
+  []
+  (mapcat (fn [year]
+            (map (fn [quarter]
+                   (fs/glob (str datadir "LD2/" year "/Q" quarter "/*")))
+                 (range 1 5)))
+          (range 2004 2015)))
+
+
+;; Running count of forms that ld1-datoms rejected as blank; reported by load-database!.
+(def counter (atom 0))
+
+;; Builds the transaction data for one parsed LD1 form `m` (a map with keyword keys).
+;; Returns [] and increments `counter` when the client name, the registrant (firm)
+;; name or the document id is missing; otherwise returns five entity maps.
+(defn ld1-datoms [m]
+ (let [client-name (some-> m :client :client_name clean)
+ firm-name (some-> m :registrant :registrant_name clean)
+ document-id (some-> m :document_id)
+ ;; truthy when any of the three values above is nil
+ blank? (some nil? [client-name firm-name document-id])]
+ (when blank?
+ (swap! counter inc))
+ (if blank?
+ []
+ ;; tempid -1: a being, represented by the client record (tempid -2)
+ [{:db/id #db/id[:db.part/user -1]
+ :being/type :being.type/:being}
+ {:db/id #db/id[:db.part/user -2]
+ :being/type :being.type/:client
+ :client/name client-name
+ :being/represents #db/id[:db.part/user -1]}
+ ;; tempid -3: a second being, represented by the firm record (tempid -4)
+ {:db/id #db/id[:db.part/user -3]
+ :being/type :being.type/:being}
+ {:db/id #db/id[:db.part/user -4]
+ :being/type :being.type/:firm
+ :firm/name firm-name
+ :being/represents #db/id[:db.part/user -3]}
+ ;; entity in the :db.part/tx partition carrying the document id and source
+ ;; (presumably annotates the transaction itself - confirm against the Datomic docs)
+ {:db/id #db/id[:db.part/tx -1]
+ :data/document-id document-id
+ :data/source :data.source/sopr-html}])))
+
+
+(defn load-data!
+  "Reads every LD1 form file, parses it as JSON with keyword keys, converts
+  it to transaction data with ld1-datoms and transacts it over `conn`.
+
+  Each transaction is awaited before the next file is read, so a failed
+  transaction throws here and all data is in place when this returns.
+  Returns nil."
+  [conn]
+  (doseq [form (list-ld1-forms)]
+    (let [tx-data (-> form
+                      slurp
+                      (json/read-str :key-fn keyword)
+                      ld1-datoms)]
+      ;; d/transact returns a future. Without the deref, errors are dropped
+      ;; silently and callers may read the db before the write has landed.
+      @(d/transact conn tx-data)))
+  (comment
+    (->> (list-ld2-forms)
+         (filter (complement nil?))
+         (apply concat)
+         (pmap (comp (partial add-ld2-form! conn) json/read-str slurp))
+         doall)))
+
+(defn load-schema!
+  "Reads the schema from src/echelon/schema.edn (relative to the working
+  directory) and transacts it over `conn`.
+
+  Waits for the transaction to finish, so a rejected schema throws here.
+  Returns the (already completed) future from d/transact."
+  [conn]
+  (let [schema-tx (read-string (slurp "src/echelon/schema.edn"))]
+    ;; doto derefs the future and still returns the future itself, so the
+    ;; return type is the same as before.
+    (doto (d/transact conn schema-tx) deref)))
+
+(defn load-database!
+  "Loads the schema and then the data over `conn`, printing progress along
+  the way and finishing with the number of blank forms counted in `counter`."
+  [conn]
+  (println "Schema loading...")
+  (load-schema! conn)
+  (println "Data loading...")
+  (load-data! conn)
+  (let [blank-count @counter]
+    (println (str "There are " blank-count " blank forms"))))
View
@@ -0,0 +1,15 @@
+(* Grammar for splitting a cleaned (lower-cased, dots removed) organization
+   name into one or more names separated by a "formerly known as" marker. *)
+names = name (<whitespace>+ <splitters> <whitespace>+ name)*
+name = token (<whitespace>* | <whitespace>+ <corporates> | <whitespace>+ token)*
+<token> = !(corporates | splitters) #'[a-z0-9]+'
+whitespace = ' ' | ',' | '-' | '(' | ')'
+corporates = "llc" | "llp" | "inc" | "lc" | "pllc" | "ltd" | "lp" | "pllp" | "company" | "corps" | "corporations" | "corporation" | "corp" | "companies" | "incorporated"
+splitters = fka | full
+(* NOTE(review): the slashes below are doubled; input such as "f/k/a" has single slashes - confirm. *)
+fka = "fka" | !"//" "f//k//a" | "f//k//a//"
+(* NOTE(review): the lookahead sits before "formerly", so it never rejects anything - confirm intent. *)
+full = "formerly known as" | !" known" "formerly"
+
+(*splitters = "formerly known as" | !" known" "formerly"*)
+ (*"formerly know as" | "frmly filed as" | "frmly registered
+ as" | "frmly" | "frly" | "frmly field" | "formerly filed as" |
+ "formerly reported as" | "formerly" | "formally known as" |
+ "also known as" | "formally" | "former" | "d|b|a" | "dba" | *)
+
View
@@ -0,0 +1,65 @@
+;; NOTE(review): this file lives at src/echelon/schema.clj, so the namespace
+;; should probably be echelon.schema - confirm before renaming.
+(ns schema.clj
+  (:require [datomic.api :as d]))
+
+(defn enum
+  "Returns the entity map for an enum value: a new entity in :db.part/user
+  whose :db/ident is `key`."
+  [key]
+  ;; d/tempid makes a fresh tempid per call. A #db/id[...] literal in a
+  ;; function body is built once at read time, so every enum would share
+  ;; one tempid.
+  {:db/id (d/tempid :db.part/user) :db/ident key})
+
+(defn string-prop
+  "Returns the schema entity map that installs `prop` as a cardinality-one
+  string attribute documented by `doc`."
+  [prop doc]
+  ;; d/tempid makes a fresh tempid per call. A #db/id[...] literal in a
+  ;; function body is built once at read time, so every attribute would
+  ;; share one tempid.
+  {:db/id (d/tempid :db.part/db)
+   :db/ident prop
+   :db/valueType :db.type/string
+   :db/cardinality :db.cardinality/one
+   :db/doc doc
+   :db.install/_attribute :db.part/db})
+
+(defn ref-prop
+  "Returns the schema entity map that installs `prop` as a cardinality-one
+  reference attribute documented by `doc`."
+  [prop doc]
+  ;; d/tempid makes a fresh tempid per call. A #db/id[...] literal in a
+  ;; function body is built once at read time, so every attribute would
+  ;; share one tempid.
+  {:db/id (d/tempid :db.part/db)
+   :db/ident prop
+   :db/valueType :db.type/ref
+   :db/cardinality :db.cardinality/one
+   :db/doc doc
+   :db.install/_attribute :db.part/db})
+
+(defn component-prop
+  "Returns the schema entity map that installs `prop` as a cardinality-one
+  component reference attribute documented by `doc`."
+  [prop doc]
+  ;; d/tempid makes a fresh tempid per call. A #db/id[...] literal in a
+  ;; function body is built once at read time, so every attribute would
+  ;; share one tempid.
+  {:db/id (d/tempid :db.part/db)
+   :db/ident prop
+   :db/valueType :db.type/ref
+   :db/cardinality :db.cardinality/one
+   ;; This flag is what makes the attribute a component; without it the
+   ;; map is identical to the one ref-prop builds.
+   :db/isComponent true
+   :db/doc doc
+   :db.install/_attribute :db.part/db})
+
+;; Transaction data describing the attributes and enum values of the
+;; lobbying data model, built with the helper functions in this file.
+(def schema
+  [(ref-prop :data/source "Where the data came from.")
+   (enum :data.source/sopr-html)
+   (string-prop :data/document-id "Id of a document.")
+
+   (ref-prop :record/type "A record's type.")
+   (enum :record.type/:being)
+   (enum :lobbying.type/:activity)
+   (enum :lobbying.type/:affiliated-organization)
+   (enum :lobbying.type/:client)
+   (enum :lobbying.type/:contact)
+   (enum :lobbying.type/:foreign-entity)
+   (enum :lobbying.type/:individual)
+   (enum :lobbying.type/:lobbyist)
+   (enum :lobbying.type/:registrant)
+
+   (ref-prop :record/represents
+             "Indicates that the record entity with this attribute
+ represents a being. This should be the only property that
+ will ever be overwritten.")
+
+   (string-prop :lobbying.organization/name "An organization client's name.")
+
+   (component-prop :lobbying/main-address "Main address for reaching")
+   ;; The keyword used to be :/principal-place-of-business, which the reader
+   ;; rejects; it now uses the namespace of the sibling attribute above.
+   ;; NOTE(review): confirm :lobbying is the intended namespace.
+   (component-prop
+    :lobbying/principal-place-of-business
+    "Legal term for the primary location where a taxpayer's business
+ is performed (wording taken from bit.ly/1s3ZbG7)")
+   (string-prop :address/first-line "First line of an address")
+   (string-prop :address/second-line "Second line for an address")
+   (string-prop :address/zipcode "Zipcode for an address") ;;TODO: ref prop
+   (string-prop :address/state "State for an address") ;;TODO: ref prop
+   (string-prop :address/country "Country for an address") ;;TODO: ref prop
+   ])
View
@@ -0,0 +1,54 @@
+;; Schema transacted by load-schema!: name attributes for firms and clients,
+;; document provenance, and the being/record model with its type enums.
+[{:db/id #db/id[:db.part/db]
+  :db/ident :firm/name
+  :db/valueType :db.type/string
+  :db/cardinality :db.cardinality/one
+  :db/doc "A firm's name."
+  :db.install/_attribute :db.part/db
+  :db/index true}
+
+ {:db/id #db/id[:db.part/db]
+  :db/ident :client/name
+  :db/valueType :db.type/string
+  :db/cardinality :db.cardinality/one
+  :db/doc "A client's name."
+  :db.install/_attribute :db.part/db
+  :db/index true}
+
+ {:db/id #db/id[:db.part/db]
+  :db/ident :data/document-id
+  :db/valueType :db.type/string
+  :db/cardinality :db.cardinality/one
+  :db/doc "Id of a document."
+  :db.install/_attribute :db.part/db
+  :db/index true}
+
+ {:db/id #db/id[:db.part/db]
+  :db/ident :data/source
+  :db/valueType :db.type/ref
+  :db/cardinality :db.cardinality/one
+  :db/doc "Where the data came from."
+  :db.install/_attribute :db.part/db}
+
+ {:db/id #db/id[:db.part/user]
+  :db/ident :data.source/sopr-html}
+
+ {:db/id #db/id[:db.part/db]
+  :db/ident :being/type
+  :db/valueType :db.type/ref
+  :db/cardinality :db.cardinality/one
+  :db/doc "A being's type."
+  :db.install/_attribute :db.part/db}
+
+ {:db/id #db/id[:db.part/db]
+  :db/ident :being/represents
+  :db/valueType :db.type/ref
+  :db/cardinality :db.cardinality/one
+  :db/doc "A being is represented by a record."
+  :db.install/_attribute :db.part/db}
+
+ {:db/id #db/id[:db.part/user]
+  :db/ident :being.type/:firm}
+ {:db/id #db/id[:db.part/user]
+  :db/ident :being.type/:client}
+ {:db/id #db/id[:db.part/user]
+  :db/ident :being.type/:being}]
View
@@ -0,0 +1,33 @@
+(ns echelon.text
+ (:require [instaparse.core :as insta]
+ [clojure.string :as s]))
+
+;; Instaparse parser built from the grammar file when this namespace loads.
+;; The path is relative, so it is resolved against the working directory.
+(def single-parse
+ (insta/parser (slurp "src/echelon/parser.bnf")))
+
+;; insta/parses with single-parse already supplied: call it with the text to parse.
+(def all-parses
+ (partial insta/parses single-parse))
+
+(defn transform
+  "Rewrites parse output `t` with insta/transform: the children of each
+  :name node are joined with single spaces into one string, and the
+  children of each :names node are collected into a vector."
+  [t]
+  (let [join-tokens (fn [& tokens] (s/join " " tokens))]
+    (insta/transform {:names vector
+                      :name  join-tokens}
+                     t)))
+
+(defn clean
+  "Normalizes a name: lower-cases `x`, removes every \".\" and trims
+  surrounding whitespace."
+  [x]
+  (let [lowered (s/lower-case x)
+        dotless (s/replace lowered "." "")]
+    (s/trim dotless)))
+
+(defn extract-names
+  "Cleans `x`, parses it with all-parses, applies transform and returns the
+  result as a vector."
+  [x]
+  (let [cleaned (clean x)
+        parses  (all-parses cleaned)]
+    (vec (transform parses))))
+
+(comment
+ (insta/parses
+ single-parse
+ (clean "AK Steel Holding Corporation (Formerly Known As AK Steel)")
+ :partial true
+ :unhide :all)
+ (insta/parse
+ single-parse
+ (clean "AK Steel Holding Corporation (Formerly Known As AK Steel)"))
+ (insta/parses
+ single-parse
+ (clean "Bolton-St. Johns, LLC (f/k/a Bolton-St. Johns, Inc.)")
+ :partial true :unhide :all))
Oops, something went wrong.

0 comments on commit 2005c12

Please sign in to comment.