# Check that the C++ propensity score matcher produces the same results as the Python version

Last updated: 2019-10-22

In [1]:
import pandas as pd
import numpy as np

from collections import defaultdict
from sortedcontainers import SortedList

## Read data

In [2]:
# Load the per-person scores. Because `names=` is given without `header=`,
# pandas reads the first line of the TSV as data rather than as a header row.
data = pd.read_csv("../data/scores.tsv", sep='\t', names=["MMI_ID", "pred_prob", "used_drug"])

In [3]:
data.shape

(1000000, 3)

In [4]:
data.head()

Unnamed: 0,MMI_ID,pred_prob,used_drug
0,0,0.746445,1
1,1,0.401794,1
2,2,0.207886,1
3,3,0.551139,1
4,4,0.522224,1


---

## This version matches positive MMI_IDs to negative scores

The output pairs each positive MMI_ID with the score of its matched negative, because this makes the C++ code simpler.

In [5]:
def match_pairs_test(people, caliper=0.05):
    """Greedily pair each positive person with the nearest unused negative score.

    Scores are compared as integers: ``pred_prob`` is multiplied by 1,000,000
    and floored. Positive rows (``used_drug == 1``) are visited in table
    order; each one takes the closest integer score still available among the
    negative rows (``used_drug == 0``), provided the gap does not exceed the
    caliper. A negative score that has been taken is removed from the pool,
    so it cannot be handed out twice. Processing stops as soon as the pool is
    empty.

    Parameters
    ----------
    people : pandas.DataFrame
        Must provide the columns ``MMI_ID``, ``pred_prob`` and ``used_drug``.
    caliper : float, default 0.05
        Largest allowed gap between matched scores on the ``pred_prob``
        scale; converted to integer units with ``int(caliper * 1000000)``.
        A gap exactly equal to the caliper is still accepted.

    Returns
    -------
    pandas.DataFrame
        One row per successful match, with columns ``MMI_ID`` (the positive
        person's ID) and ``score`` (the integer score of the negative it was
        paired with). Positive rows without a negative inside the caliper
        are omitted.
    """
    scale = 1000000
    max_gap = int(caliper * scale)

    scored = people.assign(
        prop_score=lambda frame: np.floor(frame["pred_prob"].mul(scale)).astype(int)
    )

    # Pool of still-available negative scores, kept in sorted order.
    pool = SortedList(scored.query("used_drug == 0")["prop_score"])

    pairs = []
    for person in scored.query("used_drug == 1").itertuples():
        # Nothing left to hand out, so no later positive can be matched either.
        if len(pool) == 0:
            break

        wanted = person.prop_score

        # First pool position holding a score >= wanted, clamped to the last
        # element when every remaining score is smaller.
        pos = min(pool.bisect_left(wanted), len(pool) - 1)

        # The nearest score sits either at `pos` or directly before it. `min`
        # returns the first of equally distant candidates, so a tie goes to
        # the score at `pos`.
        candidates = [pool[pos]]
        if pos > 0:
            candidates.append(pool[pos - 1])
        nearest = min(candidates, key=lambda score: abs(wanted - score))

        if abs(wanted - nearest) <= max_gap:
            pool.remove(nearest)
            pairs.append((person.MMI_ID, nearest))

    return pd.DataFrame(pairs, columns=["MMI_ID", "score"])

In [6]:
# Run the Python matcher on the full table with the default caliper (0.05).
py_res = match_pairs_test(data)

In [7]:
py_res.shape

(222338, 2)

In [8]:
py_res.head()

Unnamed: 0,MMI_ID,score
0,0,746446
1,1,401793
2,2,207887
3,3,551136
4,4,522226


In [9]:
%%time

py_res = match_pairs_test(data)

CPU times: user 5.61 s, sys: 71.7 ms, total: 5.68 s
Wall time: 5.68 s


---

## Read C++ results

In [10]:
# Load the C++ output under the same column names the Python result uses, so
# the two frames can be compared directly. `names=` without `header=` means
# the first line of the file is read as data.
c_res = pd.read_csv("../pipeline/cpp_orig_output.txt", sep='\t', names=["MMI_ID", "score"])

In [11]:
c_res.shape

(222338, 2)

In [12]:
c_res.head()

Unnamed: 0,MMI_ID,score
0,0,746446
1,1,401793
2,2,207887
3,3,551136
4,4,522226


---

## Check that C++ results are the same as Python

In [13]:
# DataFrame.equals is True only when both frames have the same shape, the same
# elements, and matching column dtypes, so this is a strict element-by-element
# comparison of the two result tables (row order included).
py_res.equals(c_res)

True

The C++ version gives the same result as the Python version and is around 2x faster (the Python version takes about 5.7 s in the timing cell above).