Skip to content

Commit

Permalink
ENH: I18N - Add new Semantic Type - IDENTITY.NPI_US (National Provide…
Browse files Browse the repository at this point in the history
…r Identifier (US)) + DateTimeParser: Add support for date detection of the form yyyyMM and yyyyMMddHH (#39)
  • Loading branch information
tsegall committed May 7, 2023
1 parent 9cc39ea commit 279bced
Show file tree
Hide file tree
Showing 14 changed files with 222 additions and 34 deletions.
5 changes: 5 additions & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@

## Changes ##

### 13.7.0
- ENH: I18N - Add new Semantic Type - IDENTITY.NPI_US (National Provider Identifier (US))
- ENH: DateTimeParser: Add support for date detection of the form yyyyMM and yyyyMMddHH (#39)
- ENH: Improve detection of timestamps (minor)

### 13.6.2
- ENH: Add support for period detection of the form yyyyMM (#38)
- ENH: I18N - yyyyMMdd detection (now look for localized "date" header)
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,7 @@ IDENTITY.EIN_US|Employer Identification Number|en-US
IDENTITY.INDIVIDUAL_NUMBER_JA|Individual Number / My Number (Japan)|ja
IDENTITY.NHS_UK|NHS Number|en-UK
IDENTITY.NI_UK|National Insurance Number (UK)|en-UK
IDENTITY.NPI_US|National Provider Identifier (US)|en-US
IDENTITY.PERSONNUMMER_SE|Personal identity number (Sweden)|sv-SE
IDENTITY.SSN_FR|Social Security Number (France)|fr-FR
IDENTITY.SSN_CH|AVH Number / SSN (Switzerland)|de-CH, fr-CH, it-CH
Expand Down Expand Up @@ -700,7 +701,7 @@ Repository](https://mvnrepository.com/artifact/com.cobber.fta/fta) or [Maven.org

## Javadoc ##

Javadoc is automatically updated to reflect the latest release at http://javadoc.io/doc/com.cobber.fta/fta/.
Javadoc is automatically updated to reflect the latest release at http://javadoc.io/doc/com.cobber.fta/fta/ and http://javadoc.io/doc/com.cobber.fta/fta-core/ .

## Speed ##

Expand Down
40 changes: 39 additions & 1 deletion core/src/main/java/com/cobber/fta/dates/DateTimeParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -760,15 +760,32 @@ private char jaCnDateTimeMapper(final char ch) {
}
}

/**
 * Check whether the input begins with a digit that can start an acceptable four-digit year.
 *
 * @param trimmed The input to inspect; only its first character is read, so it must not be empty.
 * @return true if the first character is '1' or '2', false otherwise.
 */
private boolean validCentury(final String trimmed) {
    switch (trimmed.charAt(0)) {
    case '1':
    case '2':
        return true;
    default:
        return false;
    }
}

private String passNumeric(final String trimmed, final SimpleDateMatcher matcher, final DateResolutionMode resolutionMode) {
final String compressed = matcher.getCompressed();

if ("d{4}".equals(compressed)) {
// To assume it is a year it needs to be in the range [EARLY_LONG_YYYY,LATE_LONG_YYYY]
if (!validCentury(trimmed))
return null;
final int year = Utils.getValue(trimmed, 0, 4, 4);
return year >= EARLY_LONG_YYYY && year <= LATE_LONG_YYYY ? "yyyy" : null;
}

// 6 digits so should be looking at yyyyMM (it could of course be yyyyqq :-( )
if ("d{6}".equals(compressed)) {
if (!validCentury(trimmed))
return null;
final int year = Utils.getValue(trimmed, 0, 4, 4);
if (year < RECENT_EARLY_LONG_YYYY || year > LATE_LONG_YYYY)
return null;
final int MM = Utils.getValue(trimmed, 4, 2, 2);
return MM != 0 && MM <= 12 ? "yyyyMM" : null;
}

// 8 digits so should be looking at yyyyMMdd, MMddyyyy, or ddMMyyyy
if ("d{8}".equals(compressed)) {
// Split input digits (AABBCCDD) into four ints AA, BB, CC, DD
Expand Down Expand Up @@ -803,6 +820,27 @@ private String passNumeric(final String trimmed, final SimpleDateMatcher matcher
return null;
}

// 10 digits so could be looking at yyyyMMddHH
if ("d{10}".equals(compressed)) {
if (!validCentury(trimmed))
return null;
final int yyyy = Utils.getValue(trimmed, 0, 4, 4);
if (yyyy < RECENT_EARLY_LONG_YYYY || yyyy > LATE_LONG_YYYY)
return null;
final int MM = Utils.getValue(trimmed, 4, 2, 2);
if (MM > 12)
return null;
final int dd = Utils.getValue(trimmed, 6, 2, 2);
if (dd > 31)
return null;
final int HH = Utils.getValue(trimmed, 8, 2, 2);
if (HH >= 24)
return null;
if (!isValidDate(yyyy, MM, dd))
return null;
return "yyyyMMddHH";
}

// 12 or 14 digits so we are looking at yyyyMMddHHmm (12) or yyyyMMddHHmmss (14)
if ("d{12}".equals(compressed) || "d{14}".equals(compressed)) {
final int yyyy = Utils.getValue(trimmed, 0, 4, 4);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,10 @@ public class DateTimeParserConfig {
public boolean strictMode;
/** If Numeric mode is set, any numeric-only input to train() will be tested to see if it appears to be a date (Default: true). */
public boolean numericMode = true;
/** If noAbbreviationPunctuation is set we should use Month Abbreviations without periods, for example for the
* Canadian locale, Java returns 'AUG.', and similarly for the AM/PM string which are defined in as A.M and P.M. */
/** If noAbbreviationPunctuation is set we should use Month Abbreviations without periods, for example in the
* Canadian locale, Java returns 'AUG.', and similarly for the AM/PM strings, which are defined as A.M. and P.M. (Default: true). */
public boolean noAbbreviationPunctuation = true;
/** lenient allows dates of the form '00/00/00' etc to be viewed as valid for the purpose of Format detection. */
/** lenient allows dates of the form '00/00/00' etc to be viewed as valid for the purpose of Format detection (Default: true). */
public boolean lenient = true;

public DateTimeParserConfig(final Locale... locales) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,19 @@ public void digits4() {
assertNull(dtp.determineFormatString("1499"));
}

/**
 * Six-digit numeric input: "yyyyMM" is reported for a plausible year and month, while an
 * out-of-range month, or a parser with numeric mode switched off, yields no format.
 */
@Test(groups = { TestGroups.ALL, TestGroups.DATETIME })
public void digits6() {
    final DateTimeParser dayFirstParser = new DateTimeParser().withDateResolutionMode(DateResolutionMode.DayFirst);
    final DateTimeParser numericOffParser = new DateTimeParser().withNumericMode(false);

    // Year 2014, month 07
    assertEquals(dayFirstParser.determineFormatString("201407"), "yyyyMM");

    // Month 13 does not exist, so no format should be reported
    assertNull(dayFirstParser.determineFormatString("201213"));

    // With numeric mode disabled, all-digit input is not treated as a date
    assertNull(numericOffParser.determineFormatString("201407"));
}

@Test(groups = { TestGroups.ALL, TestGroups.DATETIME })
public void digits8() {
final DateTimeParser dtpNumericDayFirst = new DateTimeParser().withDateResolutionMode(DateResolutionMode.DayFirst);
Expand Down Expand Up @@ -320,6 +333,19 @@ public void digits8() {
assertNull(dtpNonNumeric.determineFormatString("20140722105203"), "yyyyMMddHHmmss");
}

/**
 * Ten-digit numeric input: "yyyyMMddHH" is reported for a plausible date and hour, while
 * out-of-range components, or a parser with numeric mode switched off, yield no format.
 */
@Test(groups = { TestGroups.ALL, TestGroups.DATETIME })
public void digits10() {
    final DateTimeParser dayFirstParser = new DateTimeParser().withDateResolutionMode(DateResolutionMode.DayFirst);
    final DateTimeParser numericOffParser = new DateTimeParser().withNumericMode(false);

    // 2014-07-22, hour 10
    assertEquals(dayFirstParser.determineFormatString("2014072210"), "yyyyMMddHH");

    // Month 13 (and hour 32) are out of range, so no format should be reported
    assertNull(dayFirstParser.determineFormatString("2012131232"));

    // With numeric mode disabled, all-digit input is not treated as a date
    assertNull(numericOffParser.determineFormatString("2014072210"));
}

@Test(groups = { TestGroups.ALL, TestGroups.DATETIME })
public void digits8Harder() {
final DateTimeParser dtpNumericDayFirst = new DateTimeParser().withDateResolutionMode(DateResolutionMode.DayFirst);
Expand Down
8 changes: 4 additions & 4 deletions examples/datetraining/build.gradle
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
plugins {
id 'application'
id 'application'
}

wrapper {
gradleVersion = '8.1.1'
}

repositories {
mavenCentral()
mavenCentral()
}

dependencies {
implementation group: 'com.cobber.fta', name: 'fta-core', version: "13.+"
implementation 'com.cobber.fta:fta:13.+'
}

application {
mainClass = 'datetraining.DateTraining'
mainClass = 'datetraining.DateTraining'
}
10 changes: 5 additions & 5 deletions examples/minicli/build.gradle
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
plugins {
id 'application'
id 'application'
}

wrapper {
gradleVersion = '8.1.1'
}

repositories {
mavenCentral()
mavenCentral()
}

dependencies {
implementation group: 'com.cobber.fta', name: 'fta', version: "13.+"
implementation 'com.univocity:univocity-parsers:2.9.1'
implementation 'com.cobber.fta:fta:13.+'
implementation 'com.univocity:univocity-parsers:2.9.1'
}

application {
mainClass = 'cli.Cli'
mainClass = 'cli.Cli'
}
8 changes: 4 additions & 4 deletions examples/modebulk/build.gradle
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
plugins {
id 'application'
id 'application'
}

wrapper {
gradleVersion = '8.1.1'
}

repositories {
mavenCentral()
mavenCentral()
}

dependencies {
implementation group: 'com.cobber.fta', name: 'fta', version: "13.+"
implementation 'com.cobber.fta:fta:13.+'
}

application {
mainClass = 'modebulk.ModeBulk'
mainClass = 'modebulk.ModeBulk'
}
8 changes: 4 additions & 4 deletions examples/moderecord/build.gradle
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
plugins {
id 'application'
id 'application'
}

wrapper {
gradleVersion = '8.1.1'
}

repositories {
mavenCentral()
mavenCentral()
}

dependencies {
implementation group: 'com.cobber.fta', name: 'fta', version: "13.+"
implementation 'com.cobber.fta:fta:13.+'
}

application {
mainClass = 'moderecord.ModeRecord'
mainClass = 'moderecord.ModeRecord'
}
8 changes: 4 additions & 4 deletions examples/modestreaming/build.gradle
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
plugins {
id 'application'
id 'application'
}

wrapper {
gradleVersion = '8.1.1'
}

repositories {
mavenCentral()
mavenCentral()
}

dependencies {
implementation group: 'com.cobber.fta', name: 'fta', version: "13.+"
implementation 'com.cobber.fta:fta:13.+'
}

application {
mainClass = 'modestreaming.ModeStreaming'
mainClass = 'modestreaming.ModeStreaming'
}
2 changes: 1 addition & 1 deletion examples/web/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ dependencies {
implementation 'org.springframework.boot:spring-boot-starter-thymeleaf'
implementation 'org.springframework.boot:spring-boot-starter-web'
implementation 'com.univocity:univocity-parsers:2.9.1'
implementation fileTree(dir: '/Users/tsegall/src/fta/cli/build/install/fta/lib', include: '*.jar')
implementation 'com.cobber.fta:fta:13.+'
providedRuntime('org.springframework.boot:spring-boot-starter-tomcat')
testImplementation 'org.springframework.boot:spring-boot-starter-test'
}
Expand Down
2 changes: 1 addition & 1 deletion settings.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ includeBuild 'examples/modestreaming'
dependencyResolutionManagement {
versionCatalogs {
libs {
version('fta', '13.6.2')
version('fta', '13.7.0')
version('jacoco', '0.8.8')

// https://mvnrepository.com/artifact/com.univocity/univocity-parsers
Expand Down
103 changes: 103 additions & 0 deletions types/src/main/java/com/cobber/fta/plugins/identity/NPI_US.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/*
* Copyright 2017-2023 Tim Segall
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cobber.fta.plugins.identity;

import org.apache.commons.validator.routines.checkdigit.CheckDigit;
import org.apache.commons.validator.routines.checkdigit.CheckDigitException;
import org.apache.commons.validator.routines.checkdigit.LuhnCheckDigit;

import com.cobber.fta.AnalysisConfig;
import com.cobber.fta.AnalyzerContext;
import com.cobber.fta.Facts;
import com.cobber.fta.FiniteMap;
import com.cobber.fta.LogicalTypeInfinite;
import com.cobber.fta.PluginAnalysis;
import com.cobber.fta.PluginDefinition;
import com.cobber.fta.core.FTAPluginException;
import com.cobber.fta.core.Utils;
import com.cobber.fta.token.TokenStreams;

/**
 * Plugin to detect a US National Provider Identifier (NPI).
 *
 * <p>An input is accepted when it consists of exactly {@value #NPI_LENGTH} numeric characters and its
 * final character equals the Luhn check digit calculated over {@value #NPI_PREFIX} followed by the
 * first nine characters of the input.</p>
 */
public class NPI_US extends LogicalTypeInfinite {
    /** Constant prefix prepended to the nine-digit base before the Luhn check digit is calculated. */
    private static final String NPI_PREFIX = "80840";

    /** Total length of an NPI, including the trailing check digit. */
    private static final int NPI_LENGTH = 10;

    /**
     * Luhn check digit routine. Created eagerly (rather than in {@link #initialize(AnalysisConfig)})
     * so that validation and random generation never dereference a null validator.
     */
    private final CheckDigit validator = new LuhnCheckDigit();

    /**
     * Construct a plugin to detect a US NPI based on the Plugin Definition.
     * @param plugin The definition of this plugin.
     */
    public NPI_US(final PluginDefinition plugin) {
        super(plugin);
    }

    /**
     * Decide whether the (already trimmed) input could be an NPI.
     * Only {@code trimmed} is consulted; the remaining arguments are not used by this plugin.
     */
    @Override
    public boolean isCandidate(final String trimmed, final StringBuilder compressed, final int[] charCounts, final int[] lastIndex) {
        return validate(trimmed);
    }

    /**
     * Core NPI check shared by {@link #isCandidate} and {@link #isValid}.
     *
     * @param trimmed The input to test (may be null).
     * @return true if the input is 10 numeric characters whose last character matches the Luhn check digit.
     */
    private boolean validate(final String trimmed) {
        if (trimmed == null || trimmed.length() != NPI_LENGTH || !Utils.isNumeric(trimmed))
            return false;

        final int checkDigitOffset = NPI_LENGTH - 1;
        try {
            // The check digit is calculated over the prefix plus the first nine characters of the input
            final String expected = validator.calculate(NPI_PREFIX + trimmed.substring(0, checkDigitOffset));
            return expected.charAt(0) == trimmed.charAt(checkDigitOffset);
        } catch (CheckDigitException e) {
            // The check digit could not be calculated - treat the input as not an NPI
            return false;
        }
    }

    /**
     * Initialize the plugin by delegating to the superclass.
     *
     * @param analysisConfig The analysis configuration passed through to the superclass.
     * @return Always true.
     * @throws FTAPluginException If the superclass initialization fails.
     */
    @Override
    public boolean initialize(final AnalysisConfig analysisConfig) throws FTAPluginException {
        super.initialize(analysisConfig);

        return true;
    }

    /**
     * Generate a random value that passes this plugin's validation: nine random digits followed by
     * the matching check digit.
     *
     * @return A 10-character NPI, or null if the check digit could not be calculated.
     */
    @Override
    public String nextRandom() {
        try {
            final String base = Utils.getRandomDigits(random, NPI_LENGTH - 1);
            return base + validator.calculate(NPI_PREFIX + base);
        } catch (CheckDigitException e) {
            return null;
        }
    }

    /**
     * @return The regular expression describing the shape of an NPI (ten digits).
     */
    @Override
    public String getRegExp() {
        return "\\d{10}";
    }

    /**
     * Test whether the input (after trimming) is a valid NPI.
     * A null input is reported as invalid rather than raising an exception.
     */
    @Override
    public boolean isValid(final String input, final boolean detectMode, final long count) {
        return input != null && validate(input.trim());
    }

    /**
     * Accept the set when the confidence derived from the match count and sample count reaches
     * this plugin's threshold (expressed as a percentage); otherwise report a simple failure.
     */
    @Override
    public PluginAnalysis analyzeSet(final AnalyzerContext context, final long matchCount, final long realSamples, final String currentRegExp,
            final Facts facts, final FiniteMap cardinality, final FiniteMap outliers, final TokenStreams tokenStreams,
            final AnalysisConfig analysisConfig) {
        final double requiredConfidence = getThreshold() / 100.0;
        return getConfidence(matchCount, realSamples, context) >= requiredConfidence ? PluginAnalysis.OK : PluginAnalysis.SIMPLE_NOT_OK;
    }
}
Loading

0 comments on commit 279bced

Please sign in to comment.