In [1]:
import pandas as pd

In [2]:
# Build the de-duplicated union of the Harvard and UCSD ASIN lists and save it.
# Per-source membership flags are not added here; a later cell adds the
# 'harvard' and 'ucsd' columns to the combined table.


def _as_asin_frame(asins):
    """Return `asins` as a DataFrame whose first column is named 'asin'.

    Parameters
    ----------
    asins : pd.Series or pd.DataFrame
        Object loaded from one of the source pickles.

    Returns
    -------
    pd.DataFrame
        A new frame; a Series is converted with ``to_frame()`` first, then the
        first column is renamed to 'asin'. Other columns are left untouched.
    """
    frame = asins.to_frame() if isinstance(asins, pd.Series) else asins
    return frame.rename(columns={frame.columns[0]: 'asin'})


combined_asins_path = './data/pfw/02_combined_unique_asins.pkl'

# Load the data and normalise both sources to the same single-key layout
harvard_asins = _as_asin_frame(pd.read_pickle('./data/pfw/01_unique_harvard_asins.pkl'))
ucsd_asins = _as_asin_frame(pd.read_pickle('./data/pfw/01_unique_ucsd_asins.pkl'))

# Combine the DataFrames (row labels from each source are kept as-is)
all_asins = pd.concat([harvard_asins, ucsd_asins])

# Remove duplicates based on the 'asin' column; the first occurrence wins
unique_asins = all_asins.drop_duplicates(subset='asin')

# Save the result to a new pkl file
unique_asins.to_pickle(path=combined_asins_path)

# Report the path that was actually written so the message cannot drift from it
print(f"Combined unique ASINs have been written to '{combined_asins_path}'.")


Combined unique ASINs have been written to 'combined_unique_asins.pkl'.


In [3]:
# Flag, for every ASIN in the combined table, which source list(s) contain it.


def _asin_values(asins):
    """Return the ASIN values held by a Series or a DataFrame.

    A Series is used as-is. For a DataFrame the 'asin' column is used when it
    exists; otherwise the first column is used, matching the "first column is
    the ASIN" convention applied when the combined file is built.
    """
    if isinstance(asins, pd.Series):
        return asins.values
    column = "asin" if "asin" in asins.columns else asins.columns[0]
    return asins[column].values


# Load the data. The paths match the files read and written by the cell that
# builds the combined table, so a fresh Run All consumes its own output.
harvard_asins = pd.read_pickle("./data/pfw/01_unique_harvard_asins.pkl")
ucsd_asins = pd.read_pickle("./data/pfw/01_unique_ucsd_asins.pkl")
pfw_asins = pd.read_pickle(filepath_or_buffer="./data/pfw/02_combined_unique_asins.pkl")

# Normalise to a DataFrame BEFORE adding columns: assigning pfw_asins["harvard"]
# on a Series would add an index entry instead of a column.
if isinstance(pfw_asins, pd.Series):
    pfw_asins = pfw_asins.to_frame(name="asin")

# Convert to sets for efficient lookup
harvard_set = set(_asin_values(harvard_asins))
ucsd_set = set(_asin_values(ucsd_asins))

# NOTE(review): the recorded tail() output shows digit-only ASINs shorter than
# 10 characters (e.g. 764120964). That suggests some ASINs may be stored as
# integers or have lost leading zeros -- confirm both sources use the same
# dtype, because isin() will not match '0764120964' against 764120964.

# 1 when the ASIN appears in the corresponding source list, otherwise 0
pfw_asins["harvard"] = pfw_asins["asin"].isin(harvard_set).astype("int64")
pfw_asins["ucsd"] = pfw_asins["asin"].isin(ucsd_set).astype("int64")

# Sanity-check the flag counts
in_both = (pfw_asins["harvard"] == 1) & (pfw_asins["ucsd"] == 1)
print("Total ASINs:", len(pfw_asins))
print("Harvard matches:", pfw_asins["harvard"].sum())
print("UCSD matches:", pfw_asins["ucsd"].sum())
print("ASINs in both:", int(in_both.sum()))


Total ASINs: 5335156
Harvard matches: 939083
UCSD matches: 4446065
ASINs in both: 49992


In [4]:
# Preview the first five rows of the flagged ASIN table.
pfw_asins.head(n=5)


Unnamed: 0,asin,harvard,ucsd
0,B0143RTB1E,1,0
1,B01MA1MJ6H,1,0
2,B078JZTFN3,1,0
3,B06XWF9HML,1,0
4,B00837ZOI0,1,0


In [5]:
# Preview the last five rows of the flagged ASIN table.
pfw_asins.tail(n=5)

Unnamed: 0,asin,harvard,ucsd
28977252,1853672831,0,1
28977399,764120964,0,1
29017781,71455531,0,1
29089791,609806483,0,1
29113822,596514522,0,1


In [6]:
# Persist the flagged ASIN table (asin, harvard, ucsd) as a pickle.
# CSV alternative, currently disabled:
# pfw_asins.to_csv('./data/pfw/02_combined_unique_asins_final.csv', index=False)
pfw_asins.to_pickle(path='./data/pfw/02_combined_unique_asins_final.pkl')