In [1]:
# This script finds all duplicate resources and their parent (item) or children (asset).
# This is based on a combination of find_child_pid.ipynb and datacite_dupe_doi.ipynb.
#
# Main Processes:
# 1) Import (a) the complete data exported from AC, and (b) a list of current duplicates in AC.
# 2) Select items from (b) that are marked as duplicates (column 'YES dupe' is True).
# 3) Look up bulk AC data for duplicates' child assets.
# 4) Output as 2 CSV, one for Hyacinth, the other one for DataCite. See descriptions below.
#
# First created: 2022-11-10
# Finalized: 2022-11-22
import pandas as pd

In [2]:
# **Load the full dataset exported from AC**
# Every column is read with the pandas 'string' dtype.

df = pd.read_csv(
    'ac_export_data.csv',
    dtype='string',
)

# Optional inspection (left disabled):
# df.head()                      # Sample data
# print(df.columns.tolist())     # Total 8092 columns

In [3]:
# **Load the PID list from "Duplicates in AC"; it is compared with the main list later on.**

currentACDupe = pd.read_csv(
    'currentPIDList.csv',
)

# Optional inspection (left disabled):
# currentACDupe.head()       # Sample data.

In [4]:
# Keep only the relevant columns of the full AC data to speed up the later steps.
# The row labelled 0 holds the element keys rather than a record, so it is dropped.

trimmedACData = df[[
    'PID',
    '_doi',
    'Digital Object Type > String Key',
    'Title 1 > Sort Portion',
    'Parent Digital Object 1 > PID',
]].drop(index=0)

trimmedACData.head(3)        # Sample data

Unnamed: 0,PID,_doi,Digital Object Type > String Key,Title 1 > Sort Portion,Parent Digital Object 1 > PID
1,ac:ttdz08kpzn,doi:10.7916/d8-38h8-y303,item,!DNP - DUPLICATE 'Red Listing’ Heritage: Endan...,
2,ac:110981,doi:10.7916/D83X8CXZ,asset,!DNP - DUPLICATE 555.pdf,ac:110983
3,ac:66t1g1jwwx,doi:10.7916/d8-9t26-by02,asset,!DNP - DUPLICATE 617-Article Text-12318-2-10-2...,ac:xksn02v726


In [5]:
# From the "Duplicate in AC" CSV, keep only the rows whose 'YES dupe' flag is True
# and renumber them from 0.

currentACDupe = (
    currentACDupe
    .loc[currentACDupe['YES dupe'].eq(True)]
    .reset_index(drop=True)
)

# PIDs of those duplicates, as a plain Python list.

currentDupePID = currentACDupe['delete--PID'].to_list()

currentACDupe.head(3)
# print(len(currentDupePID))       # Check how many PID in total need to work on. (106 as of 11/19)

Unnamed: 0,Test row?,YES dupe,Ignore,Further review,delete--PID,delete--DOI,delete--first_published,other--PID,other--DOI,Remark,keep--PID,keep--DOI,OpenRefine Dupe,NotDupes,ConfirmedDupe,OR Digital Object Type > String Key,OR Title 1 > Sort Portion,OR Internal Note 1 > Value,OR Note 1 > Type,OR Note 1 > Value
0,True,True,False,False,ac:4f4qrfj6t4,doi:10.7916/d8-3b81-q793,2020-08-10T17:13:28Z,,,,ac:kd51c5b01p,doi:10.7916/d8-vynf-tk42,True,,,item,!DNP - DUPLICATE Red Listing’ Heritage: Endang...,,,
1,True,True,False,False,ac:r2280gb5rw,doi:10.7916/d8-mrbt-wp10,2020-08-10T17:14:24Z,,,,ac:kd51c5b01p,doi:10.7916/d8-vynf-tk42,True,,,item,!DNP - DUPLICATE Red Listing’ Heritage: Endang...,,,
2,False,True,False,False,ac:ttdz08kpzn,doi:10.7916/d8-38h8-y303,Mon Aug 10 2020 13:15:13 GMT-0400 (Eastern Day...,,,,ac:kd51c5b01p,doi:10.7916/d8-vynf-tk42,,,,item,!DNP - DUPLICATE Red Listing’ Heritage: Endang...,,,


In [6]:
# For every duplicate that is a parent item, collect its children: the rows of the bulk
# AC data whose parent PID appears in currentDupePID.
# These newly found children join the current dupe list to be worked on.
# Resulting columns: PID - DOI - Object Type - Title - Parent PID
# A parent PID repeats on several rows when that parent has more than one child.

childofDupeParent = (
    trimmedACData
    # isin() flags the rows whose parent-object value is one of the duplicate PIDs.
    .loc[lambda acRows: acRows['Parent Digital Object 1 > PID'].isin(currentDupePID)]
    # Shorter column names for the rest of the notebook ('PID' keeps its name).
    .rename(columns={
        '_doi': 'DOI',
        'Digital Object Type > String Key': 'Object Type',
        'Title 1 > Sort Portion': 'Title',
        'Parent Digital Object 1 > PID': 'Parent PID',
    })
)

childofDupeParent.head()
# print(len(childofDupeParent))     # Number of child rows found. (96 as of 11/19)

Unnamed: 0,PID,DOI,Object Type,Title,Parent PID
5,ac:hx3ffbg7f7,doi:10.7916/d8-17e7-gx14,asset,!DNP - DUPLICATE 719-Article Text-16086-1-10-2...,ac:f1vhhmgqrw
35,ac:149517,doi:10.7916/D8HH6VDV,asset,!DNP - DUPLICATE hotdep.pdf,ac:149516
63,ac:h9w0vt4bf0,doi:10.7916/d8-dky4-9y46,asset,!DNP Duplicate 5402-Article Text-9484-1-10-202...,ac:3bk3j9kd83
109,ac:110917,doi:10.7916/D8BC46CV,asset,!DNP Duplicate cucs-018-08.pdf,ac:110919
535,ac:138654,doi:10.7916/D8T443JS,asset,01798.pdf,ac:138653


In [7]:
# Attach to each child row found in the previous step the PID and DOI of the record
# that replaces its duplicate parent.
# The new column 'DOI to Map to' holds the DOI that the duplicate should be redirected to;
# 'New PID' holds the matching PID. Both come from the current AC duplicate list
# ('keep--PID' / 'keep--DOI'), looked up through the parent's 'delete--PID'.

parentNewDOI = currentACDupe[['delete--PID', 'keep--PID', 'keep--DOI']]

childofDupeParent = (
    childofDupeParent
    # This cell overwrites its own input. Without this drop, running the cell a second
    # time would merge again and leave two 'New PID' / 'DOI to Map to' columns.
    # errors='ignore' makes the drop a no-op on the first run.
    .drop(columns=['New PID', 'DOI to Map to'], errors='ignore')
    # Left join keeps every child row; sort=True orders the result by the join key.
    .merge(parentNewDOI, how='left', left_on='Parent PID', right_on='delete--PID', sort=True)
    # 'delete--PID' duplicates 'Parent PID' after the join.
    .drop(columns=['delete--PID'])
    .rename(columns={'keep--PID': 'New PID', 'keep--DOI': 'DOI to Map to'})
)

childofDupeParent.head()

Unnamed: 0,PID,DOI,Object Type,Title,Parent PID,New PID,DOI to Map to
0,ac:107680,doi:10.7916/D8VM4KQZ,asset,WP_222.pdf,ac:107682,ac:115911,doi:10.7916/D8G452GR
1,ac:110417,doi:10.7916/D88D07CG,asset,358.pdf,ac:110419,ac:127054,doi:10.7916/D8XS623S
2,ac:137983,doi:10.7916/D84M9GPN,asset,cucs-036-05.pdf,ac:110419,ac:127054,doi:10.7916/D8XS623S
3,ac:110684,doi:10.7916/D8NZ8GF7,asset,cucs-011-07.pdf,ac:110686,ac:125654,doi:10.7916/D8MS40GB
4,ac:110917,doi:10.7916/D8BC46CV,asset,!DNP Duplicate cucs-018-08.pdf,ac:110919,ac:125632,doi:10.7916/D8NS11MQ


In [8]:
# Combine the duplicates from currentACDupe with the children of duplicate parents,
# so that parents and children are listed together.
# Resulting columns:
# DOI - PID - Object Type - Title - New PID - DOI to Map to

currentACDupSubset = currentACDupe[[
    'delete--DOI',
    'delete--PID',
    'OR Digital Object Type > String Key',
    'OR Title 1 > Sort Portion',
    'keep--PID',
    'keep--DOI',
]].rename(columns={
    'delete--DOI': 'DOI',
    'delete--PID': 'PID',
    'OR Digital Object Type > String Key': 'Object Type',
    'OR Title 1 > Sort Portion': 'Title',
    'keep--PID': 'New PID',
    'keep--DOI': 'DOI to Map to',
})
# currentACDupSubset

final = (
    pd.concat([currentACDupSubset, childofDupeParent], ignore_index=True)
    .drop(columns=['Parent PID'])              # Only the child rows carry this column
    .sort_values(by='PID')
    .reset_index(drop=True)
    .replace(r'\r', r'', regex=True)           # Strip stray carriage returns (\r) from values
)

# Rewrite the 'doi:' prefix of the kept DOI into an actual URL

final['DOI to Map to'] = final['DOI to Map to'].str.replace(
    r'doi:',
    'https://academiccommons.columbia.edu/doi/',
    regex=True,
)

#final.to_csv('all_dupe_and_related.csv')
final

Unnamed: 0,DOI,PID,Object Type,Title,New PID,DOI to Map to
0,,ac:05qfttdz2r,asset,mets.xml,ac:x3ffbg79jr,https://academiccommons.columbia.edu/doi/10.79...
1,doi:10.7916/d8-ppp7-1s08,ac:08kprr4xkn,item,!DNP DUPLICATE “Addis Ababa Bete (Home)”: Cont...,ac:jq2bvq83gg,https://academiccommons.columbia.edu/doi/10.79...
2,,ac:0gb5mkkwkn,asset,Opening Ceremony 2007 Photo only.pdf,ac:h70rxwdbwx,https://academiccommons.columbia.edu/doi/10.79...
3,,ac:0gb5mkkwmb,asset,mets.xml,ac:05qfttdz2s,https://academiccommons.columbia.edu/doi/10.79...
4,doi:10.7916/D8VM4KQZ,ac:107680,asset,WP_222.pdf,ac:115911,https://academiccommons.columbia.edu/doi/10.79...
...,...,...,...,...,...,...
197,,ac:wdbrv15dzq,item,Evidence of recent volcanic activity on the ul...,ac:pc866t1g4v,https://academiccommons.columbia.edu/doi/10.79...
198,,ac:xd2547d82k,item,Cognitive training and neuroplasticity in mild...,ac:x3ffbg79jr,https://academiccommons.columbia.edu/doi/10.79...
199,,ac:zcrjdfn33g,asset,mets.xml,ac:h70rxwdbwx,https://academiccommons.columbia.edu/doi/10.79...
200,,ac:zgmsbcc2m2,asset,Graduation 1999 Academic Dress.pdf,ac:h70rxwdbwx,https://academiccommons.columbia.edu/doi/10.79...


In [9]:
# Arrange the columns for the Hyacinth work list and export it.
# PID is the first data column. to_csv() is called with its defaults, so the
# DataFrame index is also written, as an unnamed leading column of the file.

hyacinthList = final.loc[:, [
    'PID',
    'DOI',
    'Title',
    'Object Type',
    'New PID',
    'DOI to Map to',
]]

# Export as CSV file
hyacinthList.to_csv('all_dupe_Hyacinth.csv')

hyacinthList.head()

Unnamed: 0,PID,DOI,Title,Object Type,New PID,DOI to Map to
0,ac:05qfttdz2r,,mets.xml,asset,ac:x3ffbg79jr,https://academiccommons.columbia.edu/doi/10.79...
1,ac:08kprr4xkn,doi:10.7916/d8-ppp7-1s08,!DNP DUPLICATE “Addis Ababa Bete (Home)”: Cont...,item,ac:jq2bvq83gg,https://academiccommons.columbia.edu/doi/10.79...
2,ac:0gb5mkkwkn,,Opening Ceremony 2007 Photo only.pdf,asset,ac:h70rxwdbwx,https://academiccommons.columbia.edu/doi/10.79...
3,ac:0gb5mkkwmb,,mets.xml,asset,ac:05qfttdz2s,https://academiccommons.columbia.edu/doi/10.79...
4,ac:107680,doi:10.7916/D8VM4KQZ,WP_222.pdf,asset,ac:115911,https://academiccommons.columbia.edu/doi/10.79...


In [10]:
# Build and export the list to work on in DataCite.
# Rows without a DOI are removed. DOI is placed ahead of PID, and only the
# DOI, PID, Object Type and 'DOI to Map to' columns are kept.

final_for_DataCite = (
    final
    .dropna(subset=['DOI'])
    .reindex(columns=['DOI', 'PID', 'Object Type', 'DOI to Map to'])
    .reset_index(drop=True)      # Renumber the remaining rows from 0
)

# Export as CSV file
final_for_DataCite.to_csv('all_dupe_Datacite.csv')
# len(final_for_DataCite)     # Should be less than the list for Hyacinth