In [1]:
using Distances, Statistics
using MultivariateStats
using PyPlot
using WordTokenizers
using TextAnalysis
using DelimitedFiles

In [17]:
function load_embeddings(embedding_file)
    local LL,indexed_words, index
    indexed_words = Vector{String}()
    LL = Vector{Vector{Float64}}()
    open(embedding_file) do f
        index = 1
        for line in eachline(f)
            xs=split(line)
            word = xs[1]
            push!(indexed_words, word)
            push!(LL, parse.(Float64,xs[2:end]))
            index += 1
        end
    end
    return reduce(hcat,LL), indexed_words
end

load_embeddings (generic function with 1 method)

In [18]:
embeddings,vocab = load_embeddings("C:/Users/vivek/Downloads/glove.6B/glove.6B.100d.txt")
vec_size, vocab_size = size(embeddings)
println("Loaded embeddings, each word is represented by a vector with $vec_size features. The vocab size is $vocab_size")


Loaded embeddings, each word is represented by a vector with 50 features. The vocab size is 400000


In [13]:
vec_idx(s)=findfirst(x -> x==s, vocab)
vec_idx("cheese")

5796

In [14]:
embeddings,vocab = load_embeddings("C:/Users/vivek/Downloads/glove.6B/glove.6B.100d.txt")
vec_size, vocab_size = size(embeddings)
println("Loaded embeddings, each word is represented by a vector with $vec_size features. The vocab size is $vocab_size")


Loaded embeddings, each word is represented by a vector with 100 features. The vocab size is 400000


In [15]:
vec_idx(s)=findfirst(x -> x==s, vocab)
vec_idx("altcar")

295248

In [21]:
function vec(s)
    if vec_idx(s)!=nothing
        embeddings[:,vec_idx(s)]
    end
end
vec("nongame")

50-element Array{Float64,1}:
  0.35286
 -0.23699
 -0.77357
  0.74114
 -0.012065
 -0.36192
 -0.39925
 -0.27894
  1.7881
 -0.26777
 -0.36849
  0.11557
  1.3973
  ⋮
 -0.86715
  0.10879
 -0.58138
 -0.58457
  0.60059
  0.67574
  0.40177
 -0.19689
  0.2758
  0.22438
 -0.027037
  0.52948

In [22]:
cosine(x,y)=1+cosine_dist(x,y)

cosine (generic function with 1 method)

In [24]:
cosine(vec("xykon"), vec("nagor")) < cosine(vec("reller"), vec("isdr"))

true

In [32]:
function closest(v, n=20)
    list=[(x,cosine(embeddings'[x,:],v)) for x in 1:size(embeddings)[2]]
    topn_idx=sort(list, by=x -> x[2], rev=true)[1:n]
    return [vocab[a] for(a,_) in topn_idx]
end

closest (generic function with 2 methods)

In [33]:
closest(vec("wine")) 

20-element Array{String,1}:
 "petrovs"
 "blatnik"
 "muruli"
 "nobuyasu"
 "anielewicz"
 "nguon"
 "gcsb"
 "ōhashi"
 "aiz"
 "chans"
 "i-695"
 "polevoy"
 "skeer"
 "pennybacker"
 "alparslan"
 "takahiro"
 "knab"
 "maheswaran"
 "beetham"
 "woodall"

In [112]:
closest(vec("man")-vec("woman")+vec("queen"))

20-element Array{String,1}:
 "relatedly"
 "sagiv"
 "meawhile"
 "metabolomics"
 "ilpo"
 "renos"
 "jirapan"
 "linowes"
 "4,835"
 "miccio"
 "fleek"
 "mullainathan"
 "saxenian"
 "nedeljkovic"
 "nannetti"
 "3,134"
 "1,854"
 "3,068"
 "teleworking"
 "korhonen"

LoadError: UndefVarError: embeddings_files not defined

In [85]:
txt = open("C:/Users/vivek/Downloads/glove.6B/stormoflondon.txt") do file 
    read(file, String)
end
println("Loaded Storm Of London, length=$(length(txt)) characters")

Loaded Storm Of London, length=432205 characters


In [90]:
using WordTokenizers, TextAnalysis

function getsentences(txt)
    txt= replace(txt, r"\n|\r|_|,"=>"")
    txt= replace(txt, r"[\"*();!]"=>"")
    sd=StringDocument(txt)
    prepare!(sd, strip_whitespace)
    sentences = WordTokenizers.split_sentences(sd.text)
    i=1
    for s in 1:length(sentences)
        if lenght(split(sentences[s]))>3
            sentences[i]=lowercase(replace(sentences[s], "."=>""))
            i+1
        end
    end
sentences[1000:1010]
end


getsentences (generic function with 1 method)

LoadError: MethodError: no method matching similar(::Int64, ::Type{Any})
Closest candidates are:
  similar(!Matched::ZMQ.Message, ::Type{T}, !Matched::Tuple{Vararg{Int64,N}} where N) where T at C:\Users\vivek\.julia\packages\ZMQ\R3wSD\src\message.jl:93
  similar(!Matched::Array{T,1}, ::Type) where T at array.jl:377
  similar(!Matched::Array{T,2}, ::Type) where T at array.jl:378
  ...

In [107]:
function sentvec(s)
    local arr=[64]
    for w in split(sentences[s])
        if vec(w)!=nothing
            push!(arr, vec(w))
        end
    end
    if lenght(arr)==0
        ones(Float64, (50,1))*900
    else
        mean(arr)
    end
end

sentvec (generic function with 1 method)

In [108]:
sentences[20]

"“A daring theme treated with admirable discretion."

In [109]:
sentvec(1000)

LoadError: MethodError: Cannot `convert` an object of type Array{Float64,1} to an object of type Int64
Closest candidates are:
  convert(::Type{T}, !Matched::T) where T<:Number at number.jl:6
  convert(::Type{T}, !Matched::Number) where T<:Number at number.jl:7
  convert(::Type{T}, !Matched::Ptr) where T<:Integer at pointer.jl:23
  ...

In [110]:
function closest_sent(input_str, n=200)
    mean_vec_input=mean([vec(w) for w in split(input_str)])
    list=[(x,cosine(mean_vec_input, sentvec(x))) for x in 1:lenght(sentence)]
    topn_idx=sort(list, by = x -> x[2], rev=true)[1:n]
    return [sentences[a] for (a,_) in topn_idx]
end


closest_sent (generic function with 2 methods)

In [111]:
closest_sent("he very soon realised that the")

LoadError: UndefVarError: lenght not defined