In [None]:
import os

# Root directory of the project data. The default is the original author's
# local path; set the MILESTONE1_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
# os.path.join(..., '') guarantees a trailing separator, because later cells
# build paths by plain string concatenation (BASE_DIR + 'data_processing/...').
BASE_DIR = os.path.join(
    os.environ.get('MILESTONE1_DATA_DIR') or '/home/thanuja/Dropbox/coursera/Milestone1/data/',
    '')

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import *
from itertools import chain
from pyspark.sql import types as t
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pyspark.sql.types import StructType,StructField, StringType,IntegerType
import pandas as pd



In [None]:
# PySpark initialization: build a new session, or reuse one that already
# exists, running on the local master with 8g of driver memory.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "8g")
    .appName('cms_physicians_analysis')
    .getOrCreate()
)
# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [None]:
import math

# Reads the input files, computes match scores, and writes the matched and unmatched rows to output files.
class Row:
    """One comma-separated record read from an input file.

    Attributes:
        eof: always False on a successfully constructed row.
        id: first field, kept exactly as read.
        name: tuple of the second and third fields, kept exactly as read.
        address, state_city, taxonomies, middle_name: fields four to seven,
            with surrounding whitespace removed.
        data: list of the four stripped fields above, in that order.

    NOTE: the line is split on every comma, so quoted fields that contain a
    comma are not supported.
    """

    # A line needs at least this many comma-separated fields to be parsed.
    _MIN_FIELDS = 7

    def __init__(self, file):
        """Read and parse the next non-blank line of ``file``.

        Args:
            file: an open text file (anything with ``readline()``).

        Raises:
            EOFError: if there is no further non-blank line to read.
            IndexError: if the line has fewer than seven fields.
        """
        line = file.readline()
        # Skip blank lines (for example a trailing newline at the end of the
        # file) instead of failing on them with an opaque IndexError.
        while line and not line.strip():
            line = file.readline()
        self.eof = False
        if not line:
            raise EOFError('End of input')
        data = line.split(',')
        if len(data) < self._MIN_FIELDS:
            # Same exception type the bare indexing used to raise, but with
            # enough context to locate the offending line.
            raise IndexError(
                f'Expected at least {self._MIN_FIELDS} comma-separated fields '
                f'but found {len(data)}: {line!r}')
        self.id = data[0]
        self.name = (data[1], data[2])
        self.address = data[3].strip()
        self.state_city = data[4].strip()
        self.taxonomies = data[5].strip()
        self.middle_name = data[6].strip()
        self.data = [self.address, self.state_city, self.taxonomies, self.middle_name]

    def write(self, file):
        """Write this row to ``file`` as one comma-separated line."""
        file.write(f'{self.id},{self.name[0]},{self.name[1]},{self.address},{self.state_city},{self.taxonomies},{self.middle_name}\n')

    def __str__(self):
        """Return ``'<id>|<name[0]> <name[1]>'``."""
        return self.id + '|' + self.name[0] + ' ' + self.name[1]

    def __repr__(self):
        """Use the same text as ``__str__`` so lists of rows print compactly."""
        return self.__str__()

# Assumes row1.name == row2.name
def write_match(file, match):
    """Write one match as a single comma-separated line to ``file``.

    ``match`` is a 4-tuple ``(row1, row2, score, score_breakdown)``. The line
    starts with row1's two name parts, both ids and the overall score. It is
    followed, for address, state_city, taxonomies and middle_name in turn, by
    row1's value, the matching entry of ``score_breakdown`` and row2's value.
    Only row1's name is written, so the two rows are expected to share it.
    """
    first, second, total, per_field = match
    # Identity columns and the overall score come first.
    cells = [f'{first.name[0]}', f'{first.name[1]}', f'{first.id}', f'{second.id}', f'{total}']
    compared = (
        (first.address, second.address),
        (first.state_city, second.state_city),
        (first.taxonomies, second.taxonomies),
        (first.middle_name, second.middle_name),
    )
    # One "value, score, value" triple per compared field.
    for position, (first_value, second_value) in enumerate(compared):
        cells.append(f'{first_value},{per_field[position]},{second_value}')
    file.write(','.join(cells) + '\n')

# Number of entries in each row's ``data`` list that are compared.
NUM_FIELDS = 4


# Result layout: [[addresses], [states and cities], [taxonomies], [middle names]]
def get_array(rows):
    """Transpose ``rows`` into one list of cleaned strings per field.

    For each of the first NUM_FIELDS entries of ``row.data`` this returns a
    list holding that entry for every row, in row order, with leading and
    trailing spaces, double quotes and single quotes removed.
    """
    return [
        [row.data[field].strip(' "\'') for row in rows]
        for field in range(NUM_FIELDS)
    ]
# Uses TF-IDF to turn each text field into vectors, scores every pair of rows
# with cosine similarity, and keeps the best-scoring NPI row per supplemental row.
def match_rows(npi_rows, suppl_rows):
    """Pair every supplemental row with its most similar NPI row.

    Each of the NUM_FIELDS text fields is vectorized separately with TF-IDF
    (vocabulary and IDF weights are fitted on the NPI rows). The score of a
    pair is the mean of its per-field cosine similarities.

    Args:
        npi_rows: candidate rows; must be non-empty when ``suppl_rows`` is.
        suppl_rows: rows to find a partner for.

    Returns:
        One tuple ``(npi_row, suppl_row, score, score_breakdown)`` per
        supplemental row, in input order. ``score_breakdown`` is a NumPy array
        with one similarity per field. Ties go to the earliest NPI row.

    Raises:
        IndexError: if there are supplemental rows but no NPI rows.
    """
    if not suppl_rows:
        return []
    if not npi_rows:
        raise IndexError('match_rows needs at least one NPI row to match against')

    npi_array = get_array(npi_rows)
    suppl_array = get_array(suppl_rows)

    # field_scores[i, j, f]: similarity of supplemental row i and NPI row j on field f.
    field_scores = np.zeros((len(suppl_rows), len(npi_rows), NUM_FIELDS))
    for field in range(NUM_FIELDS):
        # The token pattern is widened to allow single-letter tokens, for
        # middle initials and street directions.
        # Stop words must be lower-case: the vectorizer lower-cases every
        # token before filtering, so upper-case entries would never match.
        vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b",
                                     stop_words=['medicine', 'suite', 'ste'])
        # The extra 'STOP' document keeps the vocabulary from being empty,
        # which would raise an error; its row is sliced off again below.
        npi_field = vectorizer.fit_transform(npi_array[field] + ['STOP'])
        suppl_field = vectorizer.transform(suppl_array[field])
        # One pairwise call per field instead of one call per pair of rows.
        field_scores[:, :, field] = cosine_similarity(suppl_field,
                                                      npi_field[:len(npi_rows)])

    mean_scores = field_scores.mean(axis=2)

    result = []
    for i, suppl_row in enumerate(suppl_rows):
        # argmax returns the first maximum, so ties go to the earliest NPI row.
        best_j = int(np.argmax(mean_scores[i]))
        result.append((npi_rows[best_j],
                       suppl_row,
                       mean_scores[i, best_j],
                       field_scores[i, best_j].copy()))
    return result

In [None]:
# Traverses hcp_suppl.csv and hcp_npi.csv, finds the matches and writes them to
# hcp_matches.csv. Unmatched rows are written to hcp_npi_unmatched.csv and
# hcp_suppl_unmatched.csv. Ideally, bad skips should not occur if the input
# files are sorted.
import os

OUT_DIR = BASE_DIR + 'data_processing/matched_out/'
IN_DIR = BASE_DIR + 'data_processing/combined_out/'
npi_sorted_file = open(IN_DIR + 'hcp_npi.csv', 'rt')
suppl_sorted_file = open(IN_DIR + 'hcp_suppl.csv', 'rt')
# Create the output directory first so that opening the files for writing
# does not fail when the directory does not exist yet.
os.makedirs(OUT_DIR, exist_ok=True)
matches_file = open(OUT_DIR + 'hcp_matches.csv', 'wt')
npi_unmatched_file = open(OUT_DIR + 'hcp_npi_unmatched.csv', 'wt')
suppl_unmatched_file = open(OUT_DIR + 'hcp_suppl_unmatched.csv', 'wt')

# Current row of each input; the first row read from each file is its header.
npi_row = Row(npi_sorted_file)
suppl_row = Row(suppl_sorted_file)
# Progress counters: rows consumed per input, rows written to each unmatched
# file, and matches written.
npi_line = 0
suppl_line = 0
npi_unmatched = 0
suppl_unmatched = 0
matched = 0

def inc_line(name, line_no):
    """Return ``line_no + 1``, printing a progress line every 10000 rows.

    ``name`` and ``line_no`` are printed whenever ``line_no`` is a multiple
    of 10000 (including 0).
    """
    at_checkpoint = (line_no % 10000 == 0)
    if at_checkpoint:
        print(name, line_no)
    line_no += 1
    return line_no

def inc_npi():
    """Advance the global ``npi_line`` counter by one via ``inc_line``.

    The progress label shows the current NPI row's name together with the
    running ``matched`` and ``npi_unmatched`` totals.
    """
    global npi_line
    label = 'npi %s matched=%d unmatched=%d' % (npi_row.name, matched, npi_unmatched)
    npi_line = inc_line(label, npi_line)

def inc_suppl():
    """Advance the global ``suppl_line`` counter by one via ``inc_line``.

    The progress label shows the current supplemental row's name together
    with the running ``matched`` and ``suppl_unmatched`` totals.
    """
    global suppl_line
    label = 'suppl %s matched=%d unmatched=%d' % (suppl_row.name, matched, suppl_unmatched)
    suppl_line = inc_line(label, suppl_line)

# Set to True to trace every skipped row and every matched group.
debug = False


def _read_row(file):
    """Return the next Row parsed from ``file``, or None at end of input."""
    try:
        return Row(file)
    except EOFError:
        return None


def _next_npi():
    """Advance the global ``npi_row``; it becomes None once the file is exhausted."""
    global npi_row
    npi_row = _read_row(npi_sorted_file)
    if npi_row is not None:
        # The progress counter only advances on a successful read.
        inc_npi()


def _next_suppl():
    """Advance the global ``suppl_row``; it becomes None once the file is exhausted."""
    global suppl_row
    suppl_row = _read_row(suppl_sorted_file)
    if suppl_row is not None:
        inc_suppl()


# Merge both name-sorted inputs. End of input is signalled by a row of None
# rather than by an exception, so the group being collected when a file ends
# is still matched, and rows left over in the other file are still reported
# as unmatched.
aborted = False
try:
    if npi_line == 0:
        # skip header
        print('skipping npi', npi_row.data)
        print('skipping suppl', suppl_row.data)
        _next_npi()
        _next_suppl()

    while npi_row is not None and suppl_row is not None:
        npi_skipped = []
        suppl_skipped = []
        # Advance whichever side has the smaller name until both names agree;
        # every row passed over has no partner and goes to the unmatched file.
        while (npi_row is not None and suppl_row is not None
               and npi_row.name != suppl_row.name):
            if npi_row.name < suppl_row.name:
                if debug:
                    print('npi', npi_row.name, '<', suppl_row.name)
                npi_skipped.append(npi_row.name)
                npi_row.write(npi_unmatched_file)
                npi_unmatched += 1
                _next_npi()
            else:
                if debug:
                    print('suppl', suppl_row.name, '<', npi_row.name)
                suppl_skipped.append(suppl_row.name)
                suppl_row.write(suppl_unmatched_file)
                suppl_unmatched += 1
                _next_suppl()
        if npi_row is None or suppl_row is None:
            # One input ran out while skipping: nothing left to pair up.
            break

        matching_name = npi_row.name  # == suppl_row.name
        # Collect the full run of rows sharing this name from each input.
        npi_rows = []
        while npi_row is not None and npi_row.name == matching_name:
            npi_rows.append(npi_row)
            _next_npi()
        suppl_rows = []
        while suppl_row is not None and suppl_row.name == matching_name:
            suppl_rows.append(suppl_row)
            _next_suppl()

        matches = match_rows(npi_rows, suppl_rows)
        # A name skipped on both sides means the rows could have been paired
        # but were passed over, i.e. the inputs are not sorted consistently.
        bad_skips = set(npi_skipped).intersection(suppl_skipped)
        if len(bad_skips) > 0 or debug:
            if len(npi_skipped) > 0:
                print('npi skipped:', npi_skipped)
            if len(suppl_skipped) > 0:
                print('suppl skipped:', suppl_skipped)

            print('matches %-30s: %d matched, npi: %2d/%2d, suppl: %2d/%2d' % (matching_name,
                                                                           len(matches),
                                                                           len(npi_rows),
                                                                           len(npi_rows) + len(npi_skipped),
                                                                           len(suppl_rows),
                                                                           len(suppl_rows) + len(suppl_skipped)))
        if len(bad_skips) > 0:
            print('BAD SKIPS', len(bad_skips), bad_skips)
            print('npi', npi_skipped)
            print('suppl', suppl_skipped)
            aborted = True
            break

        for match in matches:
            matched += 1
            write_match(matches_file, match)

    if not aborted:
        # Whatever remains in either input has no partner in the other one.
        while npi_row is not None:
            npi_row.write(npi_unmatched_file)
            npi_unmatched += 1
            _next_npi()
        while suppl_row is not None:
            suppl_row.write(suppl_unmatched_file)
            suppl_unmatched += 1
            _next_suppl()
        print('End of input')
finally:
    # Close every handle even when matching fails part-way through, so the
    # output written so far is flushed to disk.
    for handle in (npi_unmatched_file, suppl_unmatched_file, matches_file,
                   npi_sorted_file, suppl_sorted_file):
        handle.close()