New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

loading pretrained word embeddings #102

Closed
attardi opened this Issue May 16, 2016 · 5 comments

Comments

Projects
None yet
6 participants
@attardi

attardi commented May 16, 2016

parser_trainer has an option to load pretrained word embeddings.
What is the format of the file to load?
Is there a converter from word2vec format to recordio?

@calberti

This comment has been minimized.

Show comment
Hide comment
@calberti

calberti May 16, 2016

Contributor

The format is recordio of syntaxnet.TokenEmbedding defined in syntaxnet/dictionary.proto.
We don't include a converter, but you could use something similar to the code below.

// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Adapted from
//   https://groups.google.com/forum/#!topic/word2vec-toolkit/5Qh-x2O1lV4

#include <malloc.h>
#include <math.h>
#include <stdio.h>
#include <string.h>

#include "syntaxnet/dictionary.pb.h"
#include "tensorflow/core/lib/io/record_writer.h"
#include "tensorflow/core/platform/env.h"

const long long max_size = 2000;  // max length of strings
const long long max_w = 50;       // max length of vocabulary entries

int main(int argc, char **argv) {
  // Following is from word2vec/distance.c.
  FILE *f;
  char file_name[max_size];
  float len;
  long long words, size, a, b;
  char ch;
  float *M;
  char *vocab;
  if (argc < 3) {
    printf(
        "Usage: ./w2v2recordio <INPUT_FILE> <OUTPUT_FILE:>"
        "\nwhere INPUT_FILE contains word projections in BINARY FORMAT\n");
    return 0;
  }
  strcpy(file_name, argv[1]);
  f = fopen(file_name, "rb");
  if (f == NULL) {
    printf("Input file not found\n");
    return -1;
  }
  fscanf(f, "%lld", &words);
  fscanf(f, "%lld", &size);
  vocab = (char *)malloc((long long)words * max_w * sizeof(char));
  M = (float *)malloc((long long)words * (long long)size * sizeof(float));
  if (M == NULL) {
    printf("Cannot allocate memory: %lld MB    %lld  %lld\n",
           (long long)words * size * sizeof(float) / 1048576, words, size);
    return -1;
  }
  for (b = 0; b < words; b++) {
    fscanf(f, "%s%c", &vocab[b * max_w], &ch);
    for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
    len = 0;
    for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
    len = sqrt(len);
    for (a = 0; a < size; a++) M[a + b * size] /= len;
  }
  fclose(f);

  // This write to recordio.
  char out_file_name[max_size];
  strcpy(out_file_name, argv[2]);
  tensorflow::WritableFile *out_file;
  TF_CHECK_OK(
      tensorflow::Env::Default()->NewWritableFile(out_file_name, &out_file));
  std::unique_ptr<tensorflow::io::RecordWriter> writer(
      new tensorflow::io::RecordWriter(out_file));
  for (a = 0; a < words; a++) {
    syntaxnet::TokenEmbedding embedding;
    embedding.set_token(&vocab[a * max_w]);
    for (b = 0; b < size; b++) {
      embedding.mutable_vector()->add_values(M[a * size + b]);
    }
    TF_CHECK_OK(writer->WriteRecord(embedding.SerializeAsString()));
  }
  delete out_file;
  return 0;
}
Contributor

calberti commented May 16, 2016

The format is recordio of syntaxnet.TokenEmbedding defined in syntaxnet/dictionary.proto.
We don't include a converter, but you could use something similar to the code below.

// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Adapted from
//   https://groups.google.com/forum/#!topic/word2vec-toolkit/5Qh-x2O1lV4

#include <malloc.h>
#include <math.h>
#include <stdio.h>
#include <string.h>

#include "syntaxnet/dictionary.pb.h"
#include "tensorflow/core/lib/io/record_writer.h"
#include "tensorflow/core/platform/env.h"

const long long max_size = 2000;  // max length of strings
const long long max_w = 50;       // max length of vocabulary entries

int main(int argc, char **argv) {
  // Following is from word2vec/distance.c.
  FILE *f;
  char file_name[max_size];
  float len;
  long long words, size, a, b;
  char ch;
  float *M;
  char *vocab;
  if (argc < 3) {
    printf(
        "Usage: ./w2v2recordio <INPUT_FILE> <OUTPUT_FILE:>"
        "\nwhere INPUT_FILE contains word projections in BINARY FORMAT\n");
    return 0;
  }
  strcpy(file_name, argv[1]);
  f = fopen(file_name, "rb");
  if (f == NULL) {
    printf("Input file not found\n");
    return -1;
  }
  fscanf(f, "%lld", &words);
  fscanf(f, "%lld", &size);
  vocab = (char *)malloc((long long)words * max_w * sizeof(char));
  M = (float *)malloc((long long)words * (long long)size * sizeof(float));
  if (M == NULL) {
    printf("Cannot allocate memory: %lld MB    %lld  %lld\n",
           (long long)words * size * sizeof(float) / 1048576, words, size);
    return -1;
  }
  for (b = 0; b < words; b++) {
    fscanf(f, "%s%c", &vocab[b * max_w], &ch);
    for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
    len = 0;
    for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
    len = sqrt(len);
    for (a = 0; a < size; a++) M[a + b * size] /= len;
  }
  fclose(f);

  // This write to recordio.
  char out_file_name[max_size];
  strcpy(out_file_name, argv[2]);
  tensorflow::WritableFile *out_file;
  TF_CHECK_OK(
      tensorflow::Env::Default()->NewWritableFile(out_file_name, &out_file));
  std::unique_ptr<tensorflow::io::RecordWriter> writer(
      new tensorflow::io::RecordWriter(out_file));
  for (a = 0; a < words; a++) {
    syntaxnet::TokenEmbedding embedding;
    embedding.set_token(&vocab[a * max_w]);
    for (b = 0; b < size; b++) {
      embedding.mutable_vector()->add_values(M[a * size + b]);
    }
    TF_CHECK_OK(writer->WriteRecord(embedding.SerializeAsString()));
  }
  delete out_file;
  return 0;
}
@brendano

This comment has been minimized.

Show comment
Hide comment
@brendano

brendano Mar 22, 2017

@calberti can you say a little about how to compile this program in the context of the bazel build system that's used for syntaxnet? For example, our system version of protobuf appears to not be compatible with this program; bazel downloaded and built a different version that tensorflow and/or syntaxnet seem happier to use, and presumably this program as well. I tried to find the versions of protobuf and tensorflow that bazel built, and use those for header and library paths (and copy and paste tons of gcc flags from a verbose-enabled bazel build log file), but it's a bit of reverse engineering.

If the answer is "the easiest thing to do is learn how to use bazel", that's fine (just hard, but fair enough...)

I realize this is the same question asked on #770 to which @aselle replied asking to go to stackoverflow. I didn't find any discussion there on how to do it.

brendano commented Mar 22, 2017

@calberti can you say a little about how to compile this program in the context of the bazel build system that's used for syntaxnet? For example, our system version of protobuf appears to not be compatible with this program; bazel downloaded and built a different version that tensorflow and/or syntaxnet seem happier to use, and presumably this program as well. I tried to find the versions of protobuf and tensorflow that bazel built, and use those for header and library paths (and copy and paste tons of gcc flags from a verbose-enabled bazel build log file), but it's a bit of reverse engineering.

If the answer is "the easiest thing to do is learn how to use bazel", that's fine (just hard, but fair enough...)

I realize this is the same question asked on #770 to which @aselle replied asking to go to stackoverflow. I didn't find any discussion there on how to do it.

@brendano

This comment has been minimized.

Show comment
Hide comment
@brendano

brendano Mar 22, 2017

Hm, adding this to syntaxnet/BUILD seems to allow bazel build converter to work

cc_binary(
name="converter",
srcs=["converter.cc"],
    deps = [
        ":dictionary_proto",
        "@org_tensorflow//tensorflow/core:framework",
        "@org_tensorflow//tensorflow/core:tensorflow",
        "@org_tensorflow//tensorflow/core/kernels:reader_ops",
    ],
)

brendano commented Mar 22, 2017

Hm, adding this to syntaxnet/BUILD seems to allow bazel build converter to work

cc_binary(
name="converter",
srcs=["converter.cc"],
    deps = [
        ":dictionary_proto",
        "@org_tensorflow//tensorflow/core:framework",
        "@org_tensorflow//tensorflow/core:tensorflow",
        "@org_tensorflow//tensorflow/core/kernels:reader_ops",
    ],
)
@aselle

This comment has been minimized.

Show comment
Hide comment
@aselle

aselle Mar 23, 2017

Member

@calberti, should we include this code into the codebase now that it's written and we have a build rule?

Member

aselle commented Mar 23, 2017

@calberti, should we include this code into the codebase now that it's written and we have a build rule?

@ducdauge

This comment has been minimized.

Show comment
Hide comment
@ducdauge

ducdauge Jul 23, 2017

The file fails to build with the following error message:

ERROR: /models/syntaxnet/syntaxnet/BUILD:507:1: C++ compilation of rule '//syntaxnet:converter' failed: gcc failed: error executing command /usr/bin/gcc -U_FORTIFY_SOURCE -fstack-protector -Wall -B/usr/bin -B/usr/bin -Wunused-but-set-parameter -Wno-free-nonheap-object -fno-omit-frame-pointer -g0 -O2 '-D_FORTIFY_SOURCE=1' -DNDEBUG ... (remaining 121 argument(s) skipped): com.google.devtools.build.lib.shell.BadExitStatusException: Process exited with status 1. In file included from external/org_tensorflow/tensorflow/core/lib/io/record_writer.h:19:0, from syntaxnet/converter.cc:24: syntaxnet/converter.cc: In function 'int main(int, char**)': syntaxnet/converter.cc:75:74: error: no matching function for call to 'tensorflow::Env::NewWritableFile(char [2000], tensorflow::WritableFile**)' tensorflow::Env::Default()->NewWritableFile(out_file_name, &out_file)); ^ external/org_tensorflow/tensorflow/core/lib/core/status.h:129:58: note: in definition of macro 'TF_CHECK_OK' while (::tensorflow::string* _result = TfCheckOpHelper(val, #val)) \ ^ In file included from external/org_tensorflow/tensorflow/core/lib/io/zlib_outputbuffer.h:25:0, from external/org_tensorflow/tensorflow/core/lib/io/record_writer.h:23, from syntaxnet/converter.cc:24: external/org_tensorflow/tensorflow/core/platform/env.h:104:10: note: candidate: tensorflow::Status tensorflow::Env::NewWritableFile(const string&, std::unique_ptr<tensorflow::WritableFile>*) Status NewWritableFile(const string& fname, ^ external/org_tensorflow/tensorflow/core/platform/env.h:104:10: note: no known conversion for argument 2 from 'tensorflow::WritableFile**' to 'std::unique_ptr<tensorflow::WritableFile>*' In file included from external/org_tensorflow/tensorflow/core/lib/io/record_writer.h:19:0, from syntaxnet/converter.cc:24: external/org_tensorflow/tensorflow/core/lib/core/status.h:129:67: error: 'TfCheckOpHelper' was not declared in this scope while (::tensorflow::string* _result = TfCheckOpHelper(val, #val)) \ ^ 
syntaxnet/converter.cc:74:3: note: in expansion of macro 'TF_CHECK_OK' TF_CHECK_OK( ^ external/org_tensorflow/tensorflow/core/lib/core/status.h:129:67: note: suggested alternative: while (::tensorflow::string* _result = TfCheckOpHelper(val, #val)) \ ^ syntaxnet/converter.cc:74:3: note: in expansion of macro 'TF_CHECK_OK' TF_CHECK_OK( ^ external/org_tensorflow/tensorflow/core/lib/core/status.h:123:28: note: 'tensorflow::TfCheckOpHelper' inline tensorflow::string* TfCheckOpHelper(::tensorflow::Status v, ^ syntaxnet/converter.cc:51:28: warning: ignoring return value of 'int fscanf(FILE*, const char*, ...)', declared with attribute warn_unused_result [-Wunused-result] fscanf(f, "%lld", &words); ^ syntaxnet/converter.cc:52:27: warning: ignoring return value of 'int fscanf(FILE*, const char*, ...)', declared with attribute warn_unused_result [-Wunused-result] fscanf(f, "%lld", &size); ^ syntaxnet/converter.cc:61:46: warning: ignoring return value of 'int fscanf(FILE*, const char*, ...)', declared with attribute warn_unused_result [-Wunused-result] fscanf(f, "%s%c", &vocab[b * max_w], &ch); ^ syntaxnet/converter.cc:62:76: warning: ignoring return value of 'size_t fread(void*, size_t, size_t, FILE*)', declared with attribute warn_unused_result [-Wunused-result] for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); ^ Target //syntaxnet:converter failed to build Use --verbose_failures to see the command lines of failed build steps.

ducdauge commented Jul 23, 2017

The file fails to build with the following error message:

ERROR: /models/syntaxnet/syntaxnet/BUILD:507:1: C++ compilation of rule '//syntaxnet:converter' failed: gcc failed: error executing command /usr/bin/gcc -U_FORTIFY_SOURCE -fstack-protector -Wall -B/usr/bin -B/usr/bin -Wunused-but-set-parameter -Wno-free-nonheap-object -fno-omit-frame-pointer -g0 -O2 '-D_FORTIFY_SOURCE=1' -DNDEBUG ... (remaining 121 argument(s) skipped): com.google.devtools.build.lib.shell.BadExitStatusException: Process exited with status 1. In file included from external/org_tensorflow/tensorflow/core/lib/io/record_writer.h:19:0, from syntaxnet/converter.cc:24: syntaxnet/converter.cc: In function 'int main(int, char**)': syntaxnet/converter.cc:75:74: error: no matching function for call to 'tensorflow::Env::NewWritableFile(char [2000], tensorflow::WritableFile**)' tensorflow::Env::Default()->NewWritableFile(out_file_name, &out_file)); ^ external/org_tensorflow/tensorflow/core/lib/core/status.h:129:58: note: in definition of macro 'TF_CHECK_OK' while (::tensorflow::string* _result = TfCheckOpHelper(val, #val)) \ ^ In file included from external/org_tensorflow/tensorflow/core/lib/io/zlib_outputbuffer.h:25:0, from external/org_tensorflow/tensorflow/core/lib/io/record_writer.h:23, from syntaxnet/converter.cc:24: external/org_tensorflow/tensorflow/core/platform/env.h:104:10: note: candidate: tensorflow::Status tensorflow::Env::NewWritableFile(const string&, std::unique_ptr<tensorflow::WritableFile>*) Status NewWritableFile(const string& fname, ^ external/org_tensorflow/tensorflow/core/platform/env.h:104:10: note: no known conversion for argument 2 from 'tensorflow::WritableFile**' to 'std::unique_ptr<tensorflow::WritableFile>*' In file included from external/org_tensorflow/tensorflow/core/lib/io/record_writer.h:19:0, from syntaxnet/converter.cc:24: external/org_tensorflow/tensorflow/core/lib/core/status.h:129:67: error: 'TfCheckOpHelper' was not declared in this scope while (::tensorflow::string* _result = TfCheckOpHelper(val, #val)) \ ^ 
syntaxnet/converter.cc:74:3: note: in expansion of macro 'TF_CHECK_OK' TF_CHECK_OK( ^ external/org_tensorflow/tensorflow/core/lib/core/status.h:129:67: note: suggested alternative: while (::tensorflow::string* _result = TfCheckOpHelper(val, #val)) \ ^ syntaxnet/converter.cc:74:3: note: in expansion of macro 'TF_CHECK_OK' TF_CHECK_OK( ^ external/org_tensorflow/tensorflow/core/lib/core/status.h:123:28: note: 'tensorflow::TfCheckOpHelper' inline tensorflow::string* TfCheckOpHelper(::tensorflow::Status v, ^ syntaxnet/converter.cc:51:28: warning: ignoring return value of 'int fscanf(FILE*, const char*, ...)', declared with attribute warn_unused_result [-Wunused-result] fscanf(f, "%lld", &words); ^ syntaxnet/converter.cc:52:27: warning: ignoring return value of 'int fscanf(FILE*, const char*, ...)', declared with attribute warn_unused_result [-Wunused-result] fscanf(f, "%lld", &size); ^ syntaxnet/converter.cc:61:46: warning: ignoring return value of 'int fscanf(FILE*, const char*, ...)', declared with attribute warn_unused_result [-Wunused-result] fscanf(f, "%s%c", &vocab[b * max_w], &ch); ^ syntaxnet/converter.cc:62:76: warning: ignoring return value of 'size_t fread(void*, size_t, size_t, FILE*)', declared with attribute warn_unused_result [-Wunused-result] for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); ^ Target //syntaxnet:converter failed to build Use --verbose_failures to see the command lines of failed build steps.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment