In [1]:
# Root directory of the project data. Override it without editing the notebook
# by setting the CMS_DATA_DIR environment variable; otherwise the original
# author's local path is used.
# os.path.join(..., '') guarantees a trailing separator, because later cells
# build paths with plain string concatenation (BASE_DIR + "data_processing/...").
import os
BASE_DIR = os.path.join(
    os.environ.get('CMS_DATA_DIR', '/home/thanuja/Dropbox/coursera/Milestone1/data/'),
    '',
)

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import *
from itertools import chain
from pyspark.sql import types as t
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
import pandas as pd

In [3]:
# PySpark initialization: get (or create) the session used by the rest of the
# notebook, configured with a "local[*]" master and an 8g driver-memory setting.
spark = (
    SparkSession.builder
    .appName('cms_physicians_analysis')
    .master("local[*]")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)
# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [4]:
# Explicit schema for the header-less hcp_matches.csv file, listed in column
# order. All columns are nullable; the *Score columns are floats and every
# other column (including the NPI / PPI identifiers) is read as a string.
schema = (
    StructType()
    .add("FirstName", StringType(), True)
    .add("LastName", StringType(), True)
    .add("NPI", StringType(), True)
    .add("PPI", StringType(), True)
    .add("Score", FloatType(), True)
    # address comparison
    .add("NatAddr", StringType(), True)
    .add("AddrScore", FloatType(), True)
    .add("SupplAddr", StringType(), True)
    # state / city comparison
    .add("NatStateCity", StringType(), True)
    .add("StateCityScore", FloatType(), True)
    .add("SupplStateCity", StringType(), True)
    # specialty comparison
    .add("NatSpecialty", StringType(), True)
    .add("SpecialtyScore", FloatType(), True)
    .add("SupplSpecialty", StringType(), True)
    # middle-name comparison
    .add("NatMiddleName", StringType(), True)
    .add("MiddleNameScore", FloatType(), True)
    .add("SupplMiddleName", StringType(), True)
)

# Load the header-less match file using the explicit `schema`.
hcp_matches = spark.read.csv(
    BASE_DIR + "data_processing/matched_out/hcp_matches.csv",
    header=False,
    schema=schema,
)
# Keep only rows whose overall Score is above 0.3 to eliminate possible
# mismatches in the mappings.
hcp_matches = hcp_matches.filter('Score > 0.3')

# NPIs that occur in more than one of the remaining match rows.
duplicates = hcp_matches.groupBy('NPI').count().filter('count > 1')

# DataFrame.show() prints the table itself and returns None, so it must not be
# wrapped in print() (that is what produced the stray "duplicates None" line).
print('duplicates')
duplicates.withColumn('NPI', F.concat_ws(' ', duplicates['NPI'])).show()
print('duplicate count', duplicates.count())

# Full match rows for every duplicated NPI; the join also carries the `count`
# column from `duplicates`.
joined = hcp_matches.join(duplicates, on='NPI', how='inner')
joined.show(truncate=True)

# coalesce(1) makes each export a single CSV part file. mode("overwrite")
# replaces the output of a previous run; with the default save mode the write
# raises once the directory exists, so the cell could not be re-run.
joined.coalesce(1).write.mode("overwrite").option("header", "true") \
    .csv(BASE_DIR + "data_processing/duplicates_out", sep=',')

hcp_matches.coalesce(1).write.mode("overwrite").option("header", "true") \
    .csv(BASE_DIR + "data_processing/filtered_out", sep=',')

+----------+-----+
|       NPI|count|
+----------+-----+
|1982613865|    2|
|1659786481|    2|
|1588922215|    2|
|1285603423|    2|
|1154484657|    2|
|1396795720|    2|
|1699782524|    2|
|1417009242|    2|
|1710072491|    2|
|1952460917|    2|
|1174520126|    2|
|1528453438|    2|
|1922186410|    2|
|1558351320|    2|
|1588640981|    2|
|1104997162|    2|
|1295129294|    2|
|1114412384|    2|
|1093815003|    2|
|1750575163|    2|
+----------+-----+
only showing top 20 rows

duplicates None
duplicate count 2807
+----------+---------+---------+-------+----------+--------------------+----------+--------------------+--------------------+--------------+--------------------+--------------------+--------------+--------------------+-------------+---------------+---------------+-----+
|       NPI|FirstName| LastName|    PPI|     Score|             NatAddr| AddrScore|           SupplAddr|        NatStateCity|StateCityScore|      SupplStateCity|        NatSpecialty|SpecialtyScore|      SupplSp