Skip to content

Commit

Permalink
Update readstat
Browse files: browse the repository at this point in the history
To fix incorrect NA tagging of negative labelled values.

Fixes #367
  • Loading branch information
hadley committed Jun 20, 2018
1 parent 600a685 commit edec5a0
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 29 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Expand Up @@ -22,6 +22,7 @@

* Fixes out of memory error (#342)
* Now supports reading and writing stata 15 files (#339)
* Negative integer labelled values are no longer incorrectly tagged as missing (#367)

# haven 1.1.1

Expand Down
30 changes: 8 additions & 22 deletions src/readstat/stata/readstat_dta_read.c
Expand Up @@ -470,9 +470,10 @@ static readstat_error_t dta_read_strls(dta_ctx_t *ctx) {
return retval;
}

static readstat_value_t dta_interpret_int8_bytes(dta_ctx_t *ctx, const unsigned char *buf) {
static readstat_value_t dta_interpret_int8_bytes(dta_ctx_t *ctx, const void *buf) {
readstat_value_t value = { .type = READSTAT_TYPE_INT8 };
int8_t byte = (int8_t)buf[0];
int8_t byte = 0;
memcpy(&byte, buf, sizeof(int8_t));
if (ctx->machine_is_twos_complement) {
byte = ones_to_twos_complement1(byte);
}
Expand All @@ -489,7 +490,7 @@ static readstat_value_t dta_interpret_int8_bytes(dta_ctx_t *ctx, const unsigned
return value;
}

static readstat_value_t dta_interpret_int16_bytes(dta_ctx_t *ctx, const unsigned char *buf) {
static readstat_value_t dta_interpret_int16_bytes(dta_ctx_t *ctx, const void *buf) {
readstat_value_t value = { .type = READSTAT_TYPE_INT16 };
int16_t num = 0;
memcpy(&num, buf, sizeof(int16_t));
Expand All @@ -512,7 +513,7 @@ static readstat_value_t dta_interpret_int16_bytes(dta_ctx_t *ctx, const unsigned
return value;
}

static readstat_value_t dta_interpret_int32_bytes(dta_ctx_t *ctx, const unsigned char *buf) {
static readstat_value_t dta_interpret_int32_bytes(dta_ctx_t *ctx, const void *buf) {
readstat_value_t value = { .type = READSTAT_TYPE_INT32 };
int32_t num = 0;
memcpy(&num, buf, sizeof(int32_t));
Expand All @@ -535,7 +536,7 @@ static readstat_value_t dta_interpret_int32_bytes(dta_ctx_t *ctx, const unsigned
return value;
}

static readstat_value_t dta_interpret_float_bytes(dta_ctx_t *ctx, const unsigned char *buf) {
static readstat_value_t dta_interpret_float_bytes(dta_ctx_t *ctx, const void *buf) {
readstat_value_t value = { .type = READSTAT_TYPE_FLOAT };
float f_num = NAN;
int32_t num = 0;
Expand All @@ -558,7 +559,7 @@ static readstat_value_t dta_interpret_float_bytes(dta_ctx_t *ctx, const unsigned
return value;
}

static readstat_value_t dta_interpret_double_bytes(dta_ctx_t *ctx, const unsigned char *buf) {
static readstat_value_t dta_interpret_double_bytes(dta_ctx_t *ctx, const void *buf) {
readstat_value_t value = { .type = READSTAT_TYPE_DOUBLE };
double d_num = NAN;
int64_t num = 0;
Expand Down Expand Up @@ -1080,12 +1081,6 @@ static readstat_error_t dta_handle_value_labels(dta_ctx_t *ctx) {
if (ctx->bswap) {
for (i=0; i<n; i++) {
off[i] = byteswap4(off[i]);
val[i] = byteswap4(val[i]);
}
}
if (ctx->machine_is_twos_complement) {
for (i=0; i<n; i++) {
val[i] = ones_to_twos_complement4(val[i]);
}
}

Expand All @@ -1095,20 +1090,11 @@ static readstat_error_t dta_handle_value_labels(dta_ctx_t *ctx) {
goto cleanup;
}

readstat_value_t value = { .v = { .i32_value = val[i] }, .type = READSTAT_TYPE_INT32 };
readstat_value_t value = dta_interpret_int32_bytes(ctx, &val[i]);
size_t max_label_len = txtlen - off[i];
if (max_label_len > MAX_VALUE_LABEL_LEN)
max_label_len = MAX_VALUE_LABEL_LEN;

if (val[i] > ctx->max_int32) {
if (ctx->supports_tagged_missing && val[i] > DTA_113_MISSING_INT32) {
value.tag = 'a' + (val[i] - DTA_113_MISSING_INT32_A);
value.is_tagged_missing = 1;
} else{
value.is_system_missing = 1;
}
}

retval = readstat_convert(utf8_buffer, utf8_buffer_len, &txt[off[i]], max_label_len, ctx->converter);
if (retval != READSTAT_OK)
goto cleanup;
Expand Down
28 changes: 21 additions & 7 deletions src/readstat/stata/readstat_dta_write.c
Expand Up @@ -264,22 +264,27 @@ static readstat_error_t dta_emit_typlist(readstat_writer_t *writer, dta_ctx_t *c
return error;
}

static readstat_error_t dta_validate_name(const char *name, size_t max_len) {
static readstat_error_t dta_validate_name_chars(const char *name, int unicode) {
/* TODO check Unicode class */
int j;
for (j=0; name[j]; j++) {
if (name[j] != '_' &&
if ((name[j] > 0 || !unicode) && name[j] != '_' &&
!(name[j] >= 'a' && name[j] <= 'z') &&
!(name[j] >= 'A' && name[j] <= 'Z') &&
!(name[j] >= '0' && name[j] <= '9')) {
return READSTAT_ERROR_NAME_CONTAINS_ILLEGAL_CHARACTER;
}
}
char first_char = name[0];
if (first_char != '_' &&
if ((first_char > 0 || !unicode) && first_char != '_' &&
!(first_char >= 'a' && first_char <= 'z') &&
!(first_char >= 'A' && first_char <= 'Z')) {
return READSTAT_ERROR_NAME_BEGINS_WITH_ILLEGAL_CHARACTER;
}
return READSTAT_OK;
}

static readstat_error_t dta_validate_name_unreserved(const char *name) {
if (strcmp(name, "_all") == 0 || strcmp(name, "_b") == 0 ||
strcmp(name, "byte") == 0 || strcmp(name, "_coef") == 0 ||
strcmp(name, "_cons") == 0 || strcmp(name, "double") == 0 ||
Expand All @@ -296,22 +301,31 @@ static readstat_error_t dta_validate_name(const char *name, size_t max_len) {
if (sscanf(name, "str%d", &len) == 1)
return READSTAT_ERROR_NAME_IS_RESERVED_WORD;

return READSTAT_OK;
}

static readstat_error_t dta_validate_name(const char *name, int unicode, size_t max_len) {
readstat_error_t error = READSTAT_OK;

if ((error = dta_validate_name_chars(name, unicode)) != READSTAT_OK)
return error;

if (strlen(name) > max_len)
return READSTAT_ERROR_NAME_IS_TOO_LONG;

return READSTAT_OK;
return dta_validate_name_unreserved(name);
}

static readstat_error_t dta_old_variable_ok(readstat_variable_t *variable) {
return dta_validate_name(readstat_variable_get_name(variable), DTA_OLD_MAX_NAME_LEN);
return dta_validate_name(readstat_variable_get_name(variable), 0, DTA_OLD_MAX_NAME_LEN);
}

static readstat_error_t dta_110_variable_ok(readstat_variable_t *variable) {
return dta_validate_name(readstat_variable_get_name(variable), DTA_110_MAX_NAME_LEN);
return dta_validate_name(readstat_variable_get_name(variable), 0, DTA_110_MAX_NAME_LEN);
}

static readstat_error_t dta_118_variable_ok(readstat_variable_t *variable) {
return dta_validate_name(readstat_variable_get_name(variable), DTA_118_MAX_NAME_LEN);
return dta_validate_name(readstat_variable_get_name(variable), 1, DTA_118_MAX_NAME_LEN);
}

static readstat_error_t dta_emit_varlist(readstat_writer_t *writer, dta_ctx_t *ctx) {
Expand Down

0 comments on commit edec5a0

Please sign in to comment.