Skip to content

Commit

Permalink
Added compressed wordvectors
Browse files Browse the repository at this point in the history
  • Loading branch information
zgornel committed Jul 2, 2019
2 parents 039b48a + c1e1631 commit 6d9aeda
Show file tree
Hide file tree
Showing 14 changed files with 619 additions and 98 deletions.
42 changes: 37 additions & 5 deletions Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,23 @@ uuid = "e1450e63-4bb3-523b-b2a4-4ffa8c0fd77d"
version = "1.0.0"

[[CMake]]
deps = ["BinDeps", "Libdl", "Test"]
git-tree-sha1 = "6e39bef3cbb8321e8a464b18a5c20d7cef813938"
deps = ["BinDeps"]
git-tree-sha1 = "c67a8689dc5444adc5eb2be7d837100340ecba11"
uuid = "631607c0-34d2-5d66-819e-eb0f9aa2061a"
version = "1.1.1"
version = "1.1.2"

[[CMakeWrapper]]
deps = ["BinDeps", "CMake", "Libdl", "Parameters", "Test"]
git-tree-sha1 = "16d4acb3d37dc05b714977ffefa8890843dc8985"
uuid = "d5fb7624-851a-54ee-a528-d3f3bac0b4a0"
version = "0.2.3"

[[Clustering]]
deps = ["Dates", "Distances", "LinearAlgebra", "NearestNeighbors", "Printf", "Random", "SparseArrays", "Statistics", "StatsBase", "Test"]
git-tree-sha1 = "5c4c26ffc9075f5a425efdc6e40c6f8f11fa52ec"
uuid = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
version = "0.13.1"

[[CodecZlib]]
deps = ["BinaryProvider", "Libdl", "Test", "TranscodingStreams"]
git-tree-sha1 = "36bbf5374c661054d41410dc53ff752972583b9b"
Expand Down Expand Up @@ -77,6 +83,12 @@ uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"

[[Distances]]
deps = ["LinearAlgebra", "Printf", "Random", "Statistics", "Test"]
git-tree-sha1 = "a135c7c062023051953141da8437ed74f89d767a"
uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
version = "0.8.0"

[[Distributed]]
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
Expand Down Expand Up @@ -117,9 +129,9 @@ version = "0.4.2"

[[LibCURL]]
deps = ["BinaryProvider", "Libdl"]
git-tree-sha1 = "5ee138c679fa202ebe211b2683d1eee2a87b3dbe"
git-tree-sha1 = "fd5fc15f2a04608fe1435a769dbbfc7959ff1daa"
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
version = "0.5.1"
version = "0.5.2"

[[LibExpat]]
deps = ["Compat"]
Expand Down Expand Up @@ -165,6 +177,12 @@ git-tree-sha1 = "cf1c990020bc4a52ff34ba2ee058b7cb677141f2"
uuid = "6f286f6a-111f-5878-ab1e-185364afe411"
version = "0.6.0"

[[NearestNeighbors]]
deps = ["Distances", "LinearAlgebra", "Mmap", "StaticArrays", "Test"]
git-tree-sha1 = "f47c5d97cf9a8caefa47e9fa9d99d8fda1a65154"
uuid = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
version = "0.4.3"

[[OrderedCollections]]
deps = ["Random", "Serialization", "Test"]
git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1"
Expand All @@ -185,6 +203,14 @@ uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"

[[QuantizedArrays]]
deps = ["Clustering", "Distances", "InteractiveUtils", "LinearAlgebra", "StatsBase"]
git-tree-sha1 = "e835d80f52771e35bfd81c1c6f189c755f031518"
repo-rev = "master"
repo-url = "https://github.com/zgornel/QuantizedArrays.jl.git"
uuid = "a7db621c-8ce0-11e9-16a1-0f86dc86bd10"
version = "0.1.1"

[[REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
Expand Down Expand Up @@ -216,6 +242,12 @@ version = "0.3.1"
deps = ["LinearAlgebra", "Random"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"

[[StaticArrays]]
deps = ["LinearAlgebra", "Random", "Statistics"]
git-tree-sha1 = "db23bbf50064c582b6f2b9b043c8e7e98ea8c0c6"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "0.11.0"

[[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Expand Down
14 changes: 13 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,21 @@ version = "0.1.0"

[deps]
ConceptnetNumberbatch = "6bdbf80b-0969-53f9-8443-f41591bd656e"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411"
QuantizedArrays = "a7db621c-8ce0-11e9-16a1-0f86dc86bd10"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Word2Vec = "c64b6f0f-98cd-51d1-af78-58ae84944834"

[compat]
julia = "1"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test"]
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
![Alt text](https://github.com/zgornel/EmbeddingsAnalysis.jl/blob/master/docs/src/assets/logo.png)

A package for processing embeddings. At this point, only word embeddings are _de facto_ supported; however, other types (e.g. graph embeddings) could be used as well.
A package for processing embeddings. At this point, only word embeddings are _de facto_ supported.

[![License](http://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](LICENSE.md)
[![Build Status](https://travis-ci.org/zgornel/EmbeddingsAnalysis.jl.svg?branch=master)](https://travis-ci.org/zgornel/EmbeddingsAnalysis.jl)
Expand Down
5 changes: 0 additions & 5 deletions REQUIRE

This file was deleted.

3 changes: 2 additions & 1 deletion docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ The package implements the following embeddings processing algorithms:
- [Artetxe et al. "Uncovering divergent linguistic information in word embeddings with lessons for intrinsic and extrinsic evaluation", 2018](https://arxiv.org/pdf/1809.02094.pdf)
- [Vikas Raunak "Simple and effective dimensionality reduction for word embeddings", NIPS 2017 Workshop](https://arxiv.org/abs/1708.03629)
and utilities:
- saving `WordVectors` objects to disk in either binary or text format
- word vector compression through `CompressedWordVectors` (uses [QuantizedArrays.jl](https://github.com/zgornel/QuantizedArrays.jl))
- saving `WordVectors` and `CompressedWordVectors` objects to disk in either binary or text format
- converting `ConceptNet` objects to `WordVectors` objects

## Installation
Expand Down
21 changes: 16 additions & 5 deletions src/EmbeddingsAnalysis.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,28 @@ module EmbeddingsAnalysis
using Languages
using Word2Vec
using ConceptnetNumberbatch
using StatsBase
using MultivariateStats
using Distances
using QuantizedArrays

import Base: dump
import Base: size
import Word2Vec: analogy_words

export conceptnet2wv,
CompressedWordVectors,
compressedwordvectors,
compress,
analogy_words,
write2disk,
similarity_order,
pca_reduction

include("dump.jl")
include("conceptnet2wv.jl")
include("similarity_order.jl")
include("pca_reduction.jl")
include("defaults.jl") # defaults
include("conceptnet2wv.jl") # convert ConceptNet to WordVectors
include("cwv.jl") # CompressedWordVectors
include("write2disk.jl") # save WordVectors to disk
include("similarity_order.jl") # preprocess WordVectors
include("pca_reduction.jl") # preprocess/reduce dimensionality of WordVectors

end # module
Loading

0 comments on commit 6d9aeda

Please sign in to comment.