# Setup

In [1]:
import pandas as pd

TRANSACTION_INDEX = ['RC#', 'Category', 'Vendor#']

def get_tpn_id(i, df):
    rc = df.at[i, 'RC#']
    category = df.at[i, 'Category']
    vendor = df.at[i, 'Vendor#']

    relevant_transactions = df[
        (df['RC#'] == rc) &
        (df['Category'] == category) &
        (df['Vendor#'] == vendor)
    ]

    counter = 0

    if len(relevant_transactions) == 0 or len(relevant_transactions) == 1:
        counter = 1
    else:
        if relevant_transactions.at[0, 'Amount'] == df.at[i, 'Amount']:
            counter = 1
        else:
            counter = int(df.at[i - 1, 'TPN_ID'].split('-')[3]) + 1

    tpn_id = (
        rc.astype(str),
        category.astype(str),
        vendor.astype(str),
        str(counter)
    )
    
    return '-'.join(tpn_id)

In [2]:
original = pd.read_csv("./data/original.csv").replace({ 'RC#': { 2: 1 }, 'RC Name': { 'SVC Engagement': 'Chancellor' } })

In [3]:
# If multiple RC transactions have the same category + vendor, merge them

original = original.groupby(['RC#', 'Category', 'Vendor#']).agg({
    # 'RC#': 'first',
    'RC Name': 'first',
    # 'Category': 'first',
    # 'Vendor#': 'first',
    'Vendor Name': 'first',
    'Address 1': 'first',
    'Address 2': 'first',
    'City': 'first',
    'State': 'first',
    'ZIP code': 'first',
    'Amount': 'sum',
}).reset_index()

In [4]:
original = original[[
    'RC#', 'RC Name', 'Category', 'Vendor#', 'Vendor Name', 'Address 1', 'Address 2', 'City', 'State', 'ZIP code', 'Amount'
]]

original = original.sort_values(by=TRANSACTION_INDEX)

for i in original.index:
    original.at[i, 'TPN_ID'] = get_tpn_id(i, original)

In [5]:
original.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5568 entries, 0 to 5567
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   RC#          5568 non-null   int64 
 1   RC Name      5568 non-null   object
 2   Category     5568 non-null   int64 
 3   Vendor#      5568 non-null   int64 
 4   Vendor Name  5568 non-null   object
 5   Address 1    5544 non-null   object
 6   Address 2    1290 non-null   object
 7   City         5469 non-null   object
 8   State        5371 non-null   object
 9   ZIP code     5557 non-null   object
 10  Amount       5568 non-null   int64 
 11  TPN_ID       5568 non-null   object
dtypes: int64(4), object(8)
memory usage: 694.5+ KB


In [6]:
updated = pd.read_csv("./data/updated.csv")
updated = updated.sort_values(by=TRANSACTION_INDEX)

In [7]:
for i in updated.index:
    updated.at[i, 'TPN_ID'] = get_tpn_id(i, updated)

In [8]:
updated.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5920 entries, 4352 to 1519
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   RC#          5920 non-null   int64 
 1   RC Name      5920 non-null   object
 2   Category     5920 non-null   int64 
 3   Vendor#      5920 non-null   int64 
 4   Vendor Name  5920 non-null   object
 5   Address 1    5896 non-null   object
 6   Address 2    1359 non-null   object
 7   City         5821 non-null   object
 8   State        5716 non-null   object
 9   ZIP code     5909 non-null   object
 10  Amount       5920 non-null   int64 
 11  TPN_ID       5920 non-null   object
dtypes: int64(4), object(8)
memory usage: 730.3+ KB


In [9]:
original.to_csv("./data/original_sorted.csv", index=False)

In [10]:
updated.to_csv("./data/updated_sorted.csv", index=False)