Skip to content

Commit

Permalink
Merge pull request #108 from wush978/dev/95
Browse files Browse the repository at this point in the history
resolve #95
  • Loading branch information
wush978 committed Sep 25, 2015
2 parents afa0da3 + ae532cb commit 8476ce4
Show file tree
Hide file tree
Showing 17 changed files with 340 additions and 46 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Expand Up @@ -15,6 +15,7 @@ before_install:
- curl -OL http://raw.github.com/craigcitro/r-travis/master/scripts/travis-tool.sh
- chmod 755 ./travis-tool.sh
- ./travis-tool.sh bootstrap
- ./travis-tool.sh install_github qinwf/jiebaR
- ./travis-tool.sh install_github jimhester/robustr
- ./travis-tool.sh install_github jimhester/covr

Expand Down
2 changes: 1 addition & 1 deletion DESCRIPTION
Expand Up @@ -24,7 +24,7 @@ Imports:
digest(>= 0.6.8),
magrittr (>= 1.5)
LinkingTo: Rcpp, digest(>= 0.6.8), BH
Suggests: RUnit, glmnet, knitr, xgboost, rmarkdown
Suggests: RUnit, glmnet, knitr, xgboost, rmarkdown, jiebaR(>= 0.5.1)
RcppModules: callback, split_callback
SystemRequirements: C++11
BugReports: https://github.com/wush978/FeatureHashing/issues
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Expand Up @@ -7,7 +7,9 @@ export(hash.size)
export(hashed.interaction.value)
export(hashed.model.matrix)
export(hashed.value)
export(init_jiebaR_callback)
export(intToRaw)
export(ls_special)
export(register_callback)
export(test_callback)
import(digest)
Expand All @@ -16,6 +18,7 @@ importFrom(Matrix,Diagonal)
importFrom(Matrix,colSums)
importFrom(Rcpp,cpp_object_initializer)
importFrom(Rcpp,loadModule)
importFrom(Rcpp,sourceCpp)
importFrom(magrittr,"%<>%")
importFrom(magrittr,"%>%")
importFrom(methods,as)
Expand Down
44 changes: 42 additions & 2 deletions R/callback.R
Expand Up @@ -3,13 +3,24 @@
#'@title Register Special Function for Formula Interface
#'@param special string. The name which will be used in formula interface.
#'@param callback_generator function which will create a callback. Please see the details.
#'@details The callback_generator is a function whose first argument is the
#'input data and the other arguments could be used to initialize the callback
#'function properly. The result should be a Rcpp module which derives the
#'`CallbackFunctor` class. Please see the vignette for details.
#'@return \code{NULL}, invisibly. The function is called for its side effect
#'of storing \code{callback_generator} under the name \code{special}.
#'@examples
#'register_callback("split", generate_split_callback)
register_callback <- function(special, callback_generator) {
  # `.callback` is an environment, so this assignment updates the shared
  # registry by reference. The previous extra assignment from the undefined
  # name `callback` made every call fail and has been removed.
  .callback[[special]] <- callback_generator
  invisible(NULL)
}

#'@title List the Registered Specials
#'@return character vector. The names of the specials which are currently
#'registered and could therefore be used in the formula interface.
#'@export
ls_special <- function() {
  # The registry is an environment; its object names are the specials.
  registered <- ls(envir = .callback)
  registered
}

#'@title Generate callback of split
#'@param input character vector. The input of split
#'@param delim string. \code{delim} will be used as delimiter for splitting
Expand All @@ -23,3 +34,32 @@ generate_split_callback <- function(input, delim = ",", type = c("existence", "c

.callback <- new.env()
.callback[["split"]] <- generate_split_callback

#'@title Initialize and register jiebaR to the formula interface
#'@details This function will register the callback of word segmentation
#'function provided by jiebaR to the formula interface.
#'For example, `~ jiebaR(...)` will use the feature of word segmentation
#'provided by jiebaR to segment a given column of the data.
#'The first argument of the jiebaR is a character which will be segmented.
#'The remaining arguments are the same as \code{\link[jiebaR]{worker}}. These
#'arguments will be used to initialize a jiebaR worker which will segment
#'the input data.
#'
#'The function stops with an error if the package jiebaR cannot be loaded or
#'if the bundled file \code{callback/jiebaR_callback.cpp} cannot be located.
#'
#'@return The value of \code{\link[Rcpp]{sourceCpp}} for the bundled source
#'file. The function is called for its side effects.
#'@examples
#'\dontrun{
#'library(FeatureHashing)
#'init_jiebaR_callback()
#'m <- hashed.model.matrix(~ jiebaR(title, type = "mix"), df)
#'# the column `df$title` will be fed into `worker <- worker(type = "mix")`
#'# the result of `worker <= df$title` will be hashed into the sparse matrix
#'# the result is `m`
#'}
#'@export
#'@importFrom Rcpp sourceCpp
init_jiebaR_callback <- function() {
  # `character.only` belongs to library()/require(); it is not an argument of
  # requireNamespace(), so it must not be passed here.
  if (!requireNamespace("jiebaR", quietly = TRUE)) {
    stop("Please install the package jiebaR first")
  }
  # system.file() returns "" when the file is missing; report that explicitly
  # instead of handing an empty path to sourceCpp().
  src <- system.file("callback/jiebaR_callback.cpp", package = "FeatureHashing")
  if (!nzchar(src)) {
    stop("Cannot find callback/jiebaR_callback.cpp in the installed package FeatureHashing")
  }
  sourceCpp(src)
}
1 change: 1 addition & 0 deletions appveyor.yml
Expand Up @@ -13,6 +13,7 @@ install:

build_script:
- travis-tool.sh install_deps
- travis-tool.sh install_github qinwf/jiebaR

test_script:
- travis-tool.sh run_tests
Expand Down
112 changes: 112 additions & 0 deletions inst/callback/jiebaR_callback.cpp
@@ -0,0 +1,112 @@
// [[Rcpp::depends(jiebaR)]]
// [[Rcpp::depends(FeatureHashing)]]

#include "jiebaRAPI.h"
#include <callback.h>
#include <Rcpp.h>

using namespace Rcpp;

struct jiebaRCallbackFunctor : public CallbackFunctor {

enum Type {
MIX,
MP,
HMM,
QUERY,
KEY
};

Type type;
Environment cutter;
SEXP cutter_pointer;

typedef SEXP (*Cut)(SEXP, SEXP);

Cut cut;

void set_type(std::string _type) {
if (_type.compare("mix") == 0) {
type = MIX;
} else if (_type.compare("mp") == 0) {
type = MP;
} else if (_type.compare("hmm") == 0) {
type = HMM;
} else if (_type.compare("query") == 0) {
type = QUERY;
} else if (_type.compare("key") == 0) {
type = KEY;
} else {
throw std::invalid_argument("Unknown type");
}
}

std::string get_type() {
switch (type) {
case MIX:
return "mix";
case MP:
return "mp";
case HMM:
return "hmm";
case QUERY:
return "query";
case KEY:
return "key";
}
}

void set_cut() {
std::string fname("jiebaR_");
fname.append(get_type());
fname.append("_cut");
cut = reinterpret_cast<Cut>(::R_GetCCallable("jiebaR", fname.c_str()));
}

explicit jiebaRCallbackFunctor(
SEXP _src,
std::string _type,
SEXP _cutter
)
: type(MIX),
cutter(_cutter),
cutter_pointer(NULL),
cut(NULL),
CallbackFunctor(_src)
{
set_type(_type);
set_cut();
cutter_pointer = wrap(cutter["worker"]);
}

virtual ~jiebaRCallbackFunctor() { }

virtual const std::vector<std::string> operator()(const char* input) const {
return as<std::vector<std::string> >((*cut)(wrap(input), cutter_pointer));
}

};

// Rcpp module named "jiebaR_callback" which exposes jiebaRCallbackFunctor to R.
RCPP_MODULE(jiebaR_callback) {

// Expose the base class under the name "callback" so that the class below can
// refer to it in derives<>().
class_<CallbackFunctor>("callback")
;

// Expose the functor as "jiebaR_callback":
//  - the constructor takes (SEXP, std::string, SEXP), matching the C++
//    constructor of jiebaRCallbackFunctor;
//  - "type" is a property backed by get_type() / set_type();
//  - "cutter" gives direct access to the `cutter` member.
class_<jiebaRCallbackFunctor>("jiebaR_callback")
.derives<CallbackFunctor>("callback")
.constructor<SEXP, std::string, SEXP>()
.property("type", &jiebaRCallbackFunctor::get_type, &jiebaRCallbackFunctor::set_type)
.field("cutter", &jiebaRCallbackFunctor::cutter)
;

}

/***R
# Callback generator for the special `jiebaR`: creates a jiebaR worker from
# `type` and the extra arguments, then wraps it, together with `input`, in a
# new `jiebaR_callback` object.
generate_jiebaR_callback <- function(input, type = "mix", ...) {
  segmenter <- jiebaR::worker(type = type, ...)
  new(jiebaR_callback, input, type, segmenter)
}
# Make the generator available to the formula interface as `jiebaR(...)`.
FeatureHashing::register_callback(
  special = "jiebaR",
  callback_generator = generate_jiebaR_callback
)
*/
1 change: 0 additions & 1 deletion inst/include/callback.h
Expand Up @@ -21,7 +21,6 @@

#include <vector>
#include <string>
#include "vector_converter.h"
#include <Rcpp.h>

class CallbackFunctor {
Expand Down
37 changes: 0 additions & 37 deletions inst/include/hash_function.h
Expand Up @@ -22,7 +22,6 @@
#include <cstdint>
#include <map>
#include <string>
#include <Rcpp.h>

class HashFunction {

Expand All @@ -32,40 +31,4 @@ class HashFunction {

};

// HashFunction subclass with no state of its own. Only the call operator is
// declared here; its definition is not part of this header.
// NOTE(review): the name suggests that no real hashing is performed - confirm
// against the implementation.
class NullHashFunction : public HashFunction {

public:

// buf / size: the bytes to hash and their count; is_interaction defaults to
// false.
virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false);

};

// HashFunction subclass which stores a 32-bit seed given at construction.
// Only the call operator is declared here; its definition is not part of this
// header.
// NOTE(review): presumably a seeded MurmurHash3 - confirm against the
// implementation.
class MurmurHash3HashFunction : public HashFunction {

// Seed stored by the constructor.
uint32_t seed;

public :

MurmurHash3HashFunction(uint32_t _seed) : seed(_seed) { }

// buf / size: the bytes to hash and their count; is_interaction defaults to
// false.
virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false);

};

// HashFunction subclass which stores a 32-bit seed and an R environment given
// at construction, plus a map from uint32_t to std::string.
// Only the call operator is declared here; its definition is not part of this
// header.
// NOTE(review): presumably records which string produced each hash value in
// `inverse_mapping` and/or `e` - confirm against the implementation.
class MurmurHash3LogHashFunction : public HashFunction {

// Seed stored by the constructor.
uint32_t seed;
// R environment supplied by the caller as `_e`.
Rcpp::Environment e;
// Starts empty; it is not filled anywhere in this header.
std::map<uint32_t, std::string> inverse_mapping;

public:

MurmurHash3LogHashFunction(SEXP _e, uint32_t _seed)
: HashFunction(), seed(_seed), e(_e)
{ }

// buf / size: the bytes to hash and their count; is_interaction defaults to
// false.
virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false);

};

#endif
32 changes: 32 additions & 0 deletions man/init_jiebaR_callback.Rd
@@ -0,0 +1,32 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/callback.R
\name{init_jiebaR_callback}
\alias{init_jiebaR_callback}
\title{Initialize and register jiebaR to the formula interface}
\usage{
init_jiebaR_callback()
}
\description{
Initialize and register jiebaR to the formula interface
}
\details{
This function will register the callback of word segmentation
function provided by jiebaR to the formula interface.
For example, `~ jiebaR(...)` will use the feature of word segmentation
provided by jiebaR to segment a given column of the data.
The first argument of the jiebaR is a character which will be segmented.
The remaining arguments are the same as \code{\link[jiebaR]{worker}}. These
arguments will be used to initialize a jiebaR worker which will segment
the input data.
}
\examples{
\dontrun{
library(FeatureHashing)
init_jiebaR_callback()
m <- hashed.model.matrix(~ jiebaR(title, type = "mix"), df)
# the column `df$title` will be fed into `worker <- worker(type = "mix")`
# the result of `worker <= df$title` will be hashed into the sparse matrix
# the result is `m`
}
}

16 changes: 16 additions & 0 deletions man/ls_special.Rd
@@ -0,0 +1,16 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/callback.R
\name{ls_special}
\alias{ls_special}
\title{List the Registered Specials}
\usage{
ls_special()
}
\value{
character vector. The specials which could be used in the
formula interface.
}
\description{
List the Registered Specials
}

6 changes: 5 additions & 1 deletion man/register_callback.Rd
Expand Up @@ -14,7 +14,11 @@ register_callback(special, callback_generator)
\description{
Register Special Function for Formula Interface
}
\details{
The callback_generator is a function whose first argument is the
input data and the other arguments could be used to initialize the callback
function properly. The result should be a Rcpp module which derives the
`CallbackFunctor` class. Please see the vignette for details.
}
\examples{
register_callback("split", generate_split_callback)
}

43 changes: 43 additions & 0 deletions src/hash_function_implementation.h
@@ -0,0 +1,43 @@
#ifndef __HASH_FUNCTION_IMPLEMENTATION_HPP__
#define __HASH_FUNCTION_IMPLEMENTATION_HPP__

#include <hash_function.h>
#include <Rcpp.h>

// HashFunction subclass with no state of its own. Only the call operator is
// declared here; its definition is not part of this header.
// NOTE(review): the name suggests that no real hashing is performed - confirm
// against the implementation.
class NullHashFunction : public HashFunction {

public:

// buf / size: the bytes to hash and their count; is_interaction defaults to
// false.
virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false);

};

// HashFunction subclass which stores a 32-bit seed given at construction.
// Only the call operator is declared here; its definition is not part of this
// header.
// NOTE(review): presumably a seeded MurmurHash3 - confirm against the
// implementation.
class MurmurHash3HashFunction : public HashFunction {

// Seed stored by the constructor.
uint32_t seed;

public :

MurmurHash3HashFunction(uint32_t _seed) : seed(_seed) { }

// buf / size: the bytes to hash and their count; is_interaction defaults to
// false.
virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false);

};

// HashFunction subclass which stores a 32-bit seed and an R environment given
// at construction, plus a map from uint32_t to std::string.
// Only the call operator is declared here; its definition is not part of this
// header.
// NOTE(review): presumably records which string produced each hash value in
// `inverse_mapping` and/or `e` - confirm against the implementation.
class MurmurHash3LogHashFunction : public HashFunction {

// Seed stored by the constructor.
uint32_t seed;
// R environment supplied by the caller as `_e`.
Rcpp::Environment e;
// Starts empty; it is not filled anywhere in this header.
std::map<uint32_t, std::string> inverse_mapping;

public:

MurmurHash3LogHashFunction(SEXP _e, uint32_t _seed)
: HashFunction(), seed(_seed), e(_e)
{ }

// buf / size: the bytes to hash and their count; is_interaction defaults to
// false.
virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false);

};

# endif // __HASH_FUNCTION_IMPLEMENTATION_HPP__
2 changes: 1 addition & 1 deletion src/hashed_model_matrix.h
Expand Up @@ -25,7 +25,7 @@
#include <boost/progress.hpp>
#include <Rcpp.h>
#include "callback.h"
#include "hash_function.h"
#include "hash_function_implementation.h"
#include "vector_converter.h"
#include "converters.h"

Expand Down

0 comments on commit 8476ce4

Please sign in to comment.