# NPI-8: NPI is Luhn number

Description: check if all NPIs are valid luhn numbers. Test will pass if it is true.

Starting Author: Amy Jin (amy@careset.com)

Date: May 3rd, 2018

https://docs.google.com/spreadsheets/d/1IYg01IpssJaWHo6KxO4_dSDgXtYNFy41S5cIHFLvlGQ/edit#gid=604789549

## Connection to Parenthood Server

In [1]:
# Packages import
import os
import sys
import numpy as np
import pandas as pd
from collections import Counter
import operator
import mysql.connector
import sshtunnel
import pureyaml

# Handle path
project_dir = !pwd  # dir of current script/notebook file
config_file = open(project_dir[0] + "/db.yaml");
config = pureyaml.load(config_file.read());

# Argument dictionary for sshtunnel
ssh_config = {
    'ssh_address_or_host': ('parenthood.set.care', 22),
    'ssh_username':        config['ssh_username'],
    'ssh_password':        config['ssh_password'],
    'remote_bind_address': ('127.0.0.1', 3306),
    'local_bind_address':  ('0.0.0.0', 3333),
}

# Argument dictionary for mysql.connector
mysql_config = {
    'user':     config['mysql_user'],
    'password': config['mysql_passwd'],
    'host':     config['mysql_host'],
    'database': 'patch',
    'port':     3333,
}

# Connect to Parenthood server
with sshtunnel.SSHTunnelForwarder(**ssh_config) as tunnel:
    print('SSH tunneling successful on port: {}'.format(tunnel.local_bind_port))
    connection = mysql.connector.connect(**mysql_config)
    cur = connection.cursor()
    print('MySQL server connected successfully!')

SSH tunneling successful on port: 3333
MySQL server connected successfully!


## Test Function

In [2]:
# --------------------------------------- Inputs: ---------------------------------------
# 1) db_name:                database name in server
# 2）table_name:             table name
# 3) col_name:               column to test
# --------------------------------------- Outputs: --------------------------------------
# 1) Test result:            PASS/FAIL
# 2) If FAIL, test will print out all distinct failed NPIs.


def luhn_checksum(card_number):
    # Reference: https://stackoverflow.com/a/21079551/144364
    def digits_of(n):
        return [int(d) for d in str(n)]
    digits = digits_of(card_number)
    odd_digits = digits[-1::-2]
    even_digits = digits[-2::-2]
    checksum = 0
    checksum += sum(odd_digits)
    for d in even_digits:
        checksum += sum(digits_of(d*2))
    return checksum % 10

def is_luhn_valid(card_number):
    return luhn_checksum(card_number) == 0

def is_luhn_npi(npi):
    # **VERY IMPORTANT** Add '80840' in the beginning of the NPI, per the definition by CMS to generate a valid Luhn number 
    # (https://www.cms.gov/Regulations-and-Guidance/Administrative-Simplification/NationalProvIdentStand/Downloads/NPIcheckdigit.pdf)
    return luhn_checksum('80840' + str(npi)) == 0
    

def test_npi_luhn(npi):
    print('Total {} NPIs.'.format(len(npi)))
    n = 0
    for i in npi:
        if not is_luhn_valid('80840'+str(i)):
            n += 1
            print('\tInvalid NPI: {}'.format(i))
    print('Total {} invalid NPIs.'.format(n))
    
def npi_8(db_name, table_name, col_name):

    with sshtunnel.SSHTunnelForwarder(**ssh_config) as tunnel:
        connection = mysql.connector.connect(**mysql_config)
        cur = connection.cursor()
        
        print ("The following rows fail the test:" + '\n')
            
        # MySQL query to get distinct NPI
        query = ('''
                SELECT DISTINCT {col1}
                FROM {db}.{t1}
                WHERE {col1} <> 'GLOBAL';
            ''').format(db = db_name, t1 = table_name, col1 = col_name)

        df = pd.read_sql_query(query, con=connection)
        print(df[df[col_name].apply(lambda x: not is_luhn_npi(x))].to_string(index=False))
            
        cur.close()
        connection.close()

## Test Example

In [3]:
npi_8('client_celgene',' HCP_Geographic_distribution_of_MM_Patients', '`Physician NPI Number`')

The following rows fail the test:



KeyError: '`Physician NPI Number`'

In [7]:
npi_8('_amy', 'test_data_good', 'npi_dest')

The following rows fail the test:

Empty DataFrame
Columns: [npi_dest]
Index: []


In [5]:
npi_8('client_ge_target', 'ge_summary_icd_pair_4q15_2q17_4Q15_2Q17', 'npi')

The following rows fail the test:

npi
9999999992
