# Setup

In [1]:
import pandas as pd

# TRANSACTION_INDEX = ['RC#', 'Category', 'Vendor#']
TRANSACTION_INDEX = ['TPN_ID']

In [2]:
original = pd.read_csv("./data/original_sorted.csv")

In [3]:
original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5568 entries, 0 to 5567
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   RC#          5568 non-null   int64 
 1   RC Name      5568 non-null   object
 2   Category     5568 non-null   int64 
 3   Vendor#      5568 non-null   int64 
 4   Vendor Name  5568 non-null   object
 5   Address 1    5544 non-null   object
 6   Address 2    1290 non-null   object
 7   City         5469 non-null   object
 8   State        5371 non-null   object
 9   ZIP code     5557 non-null   object
 10  Amount       5568 non-null   int64 
 11  TPN_ID       5568 non-null   object
dtypes: int64(4), object(8)
memory usage: 522.1+ KB


In [4]:
updated = pd.read_csv("./data/updated_sorted.csv").set_index(TRANSACTION_INDEX)

In [5]:
updated.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5920 entries, 1-6000-100877-1 to 94-6900-328199-1
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   RC#          5920 non-null   int64 
 1   RC Name      5920 non-null   object
 2   Category     5920 non-null   int64 
 3   Vendor#      5920 non-null   int64 
 4   Vendor Name  5920 non-null   object
 5   Address 1    5896 non-null   object
 6   Address 2    1359 non-null   object
 7   City         5821 non-null   object
 8   State        5716 non-null   object
 9   ZIP code     5909 non-null   object
 10  Amount       5920 non-null   int64 
dtypes: int64(4), object(7)
memory usage: 555.0+ KB


# Top-line numbers

In [6]:
original_sum = original.Amount.sum()
updated_sum = updated.Amount.sum()

updated_sum - original_sum

30599050

In [7]:
new_transactions = len(updated) - len(original)
new_transactions

352

In [8]:
diff = pd.concat([original, updated]).drop_duplicates(keep=False)

In [9]:
total_transactions_changed = len(diff)
total_transactions_changed

11488

In [10]:
original_transactions_changed = total_transactions_changed - new_transactions
original_transactions_changed

11136

# Merge transactions

In [11]:
merged_transactions = original.merge(updated, on=TRANSACTION_INDEX, suffixes=('_o', '_u'))
merged_transactions['AmountDiff'] = merged_transactions.Amount_u - merged_transactions.Amount_o

# Largest amount change

In [12]:
merged_transactions.sort_values(by=['AmountDiff'], ascending=False).head(5)

Unnamed: 0,RC#_o,RC Name_o,Category_o,Vendor#_o,Vendor Name_o,Address 1_o,Address 2_o,City_o,State_o,ZIP code_o,...,Category_u,Vendor#_u,Vendor Name_u,Address 1_u,Address 2_u,City_u,State_u,ZIP code_u,Amount_u,AmountDiff
4767,85,SOMD Administration,7300,424204,BPA II LTD,1468 WEST 9TH STREET SUITE 835,,CLEVELAND,OH,44113,...,7300,424204,BPA II LTD,1468 WEST 9TH STREET SUITE 835,,CLEVELAND,OH,44113,10951439,2584599
5292,92,Business and Auxiliary Services,7300,505445,WYNDHAM PITTSBURGH UNIVERSITY CENTER,545 E JOHN CARPENTER FREEWAY STE 1300,,IRVING,TX,75062,...,7300,505445,WYNDHAM PITTSBURGH UNIVERSITY CENTER,545 E JOHN CARPENTER FREEWAY STE 1300,,IRVING,TX,75062,11657790,2023060
4413,83,General University,7700,204776,"MARSH USA, INC",P O BOX 281915,,ATLANTA,GA,30384,...,7700,204776,"MARSH USA, INC",P O BOX 281915,,ATLANTA,GA,30384,6071082,1735802
5298,92,Business and Auxiliary Services,7300,687995,MWK FORBES II LLC,3341 FORBES AVENUE,,PITTSBURGH,PA,15213,...,7300,687995,MWK FORBES II LLC,3341 FORBES AVENUE,,PITTSBURGH,PA,15213,6280388,1112046
3798,67,Facilities Management,7000,103658,DUQUESNE LIGHT COMPANY,PO BOX 830012,,BALTIMORE,MD,21283-0012,...,7000,103658,DUQUESNE LIGHT COMPANY,PO BOX 830012,,BALTIMORE,MD,21283-0012,24578216,747194


In [13]:
merged_transactions.loc[
    (merged_transactions['RC Name_u'] == 'Kenneth P. Dietrich School of Arts and Sciences') &
    (merged_transactions['Vendor#_u'] == 'MEDLINE INDUSTRIES INC')
]

Unnamed: 0,RC#_o,RC Name_o,Category_o,Vendor#_o,Vendor Name_o,Address 1_o,Address 2_o,City_o,State_o,ZIP code_o,...,Category_u,Vendor#_u,Vendor Name_u,Address 1_u,Address 2_u,City_u,State_u,ZIP code_u,Amount_u,AmountDiff


In [14]:
mg2 = original.merge(updated.drop_duplicates(), on=TRANSACTION_INDEX, how='right', indicator=True)
mg2[mg2._merge == "right_only"]

Unnamed: 0,RC#_x,RC Name_x,Category_x,Vendor#_x,Vendor Name_x,Address 1_x,Address 2_x,City_x,State_x,ZIP code_x,...,Category_y,Vendor#_y,Vendor Name_y,Address 1_y,Address 2_y,City_y,State_y,ZIP code_y,Amount_y,_merge
32,,,,,,,,,,,...,6400,248552,TEAM BEANS LLC,11 ELKINS RD,,EAST BRUNSWICK,NJ,08816-2006,8208,right_only
79,,,,,,,,,,,...,6400,683094,FOUR KITCHENS LLC,4300 SPEEDWAY STE 49019,,AUSTIN,TX,78765,5652,right_only
255,,,,,,,,,,,...,6400,115138,LAWSON PRODUCTS INC,135 S LASALLE DEPT 2689,,CHICAGO,IL,60674-2689,4401,right_only
256,,,,,,,,,,,...,6400,163005,MEDLINE INDUSTRIES INC,ONE MEDLINE PLACE,,MUNDELEIN,IL,60060-4486,1534,right_only
263,,,,,,,,,,,...,6400,521322,SUPRA OFFICE SOLUTIONS INC,PO BOX 201,,BALA CYNWYD,PA,19004,1151,right_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5877,,,,,,,,,,,...,6900,160423,IACLEA,PO BOX 849076,,BOSTON,MA,02284-9076,1650,right_only
5910,,,,,,,,,,,...,6400,100877,APPLE COMPUTER INC,PO BOX 281877,,ATLANTA,GA,30384-1877,12580,right_only
5911,,,,,,,,,,,...,6400,103356,DELL MARKETING LP,DEPT AT 40275,,ATLANTA,GA,31192-0001,12650,right_only
5912,,,,,,,,,,,...,6400,185133,CDW GOVERNMENT INC,75 REMITTANCE DR,SUITE 1515,CHICAGO,IL,60675-1515,1056,right_only
