# Data Deduplication

Remove duplicate descriptions from the exported data for manual review.

In [1]:
# Libraries for data analysis
import pandas as pd
import re

# Custom variables
import config

# For reading and writing data
from pathlib import Path
import os

### Get the Files and Directory

In [2]:
unclf_dir = "data/extracted/for_review/unclassified/"
file_names = os.listdir(unclf_dir)
print(file_names)

['THS_unclassified.csv', 'OBR_unclassified.csv', '.DS_Store', 'BXB_unclassified.csv', 'GB_unclassified.csv', 'WCT_unclassified.csv', 'deduplicated', 'CPT_unclassified.csv', 'SW_unclassified.csv', 'CHE_unclassified.csv', 'SH_unclassified.csv', 'HL_unclassified.csv', 'BP_unclassified.csv']


In [6]:
# Use the dataset of extracted, standardized data to create a new, deduplicated file of Gertude Bell Archive data 
unclf_data_file = "data/extracted/nusc_archival_descs_for_clf.csv"

Create a directory for the deduplicated, unclassified descriptions.

In [4]:
dedup_dir = "data/extracted/for_review/unclassified/deduplicated"
Path(dedup_dir).mkdir(parents=True, exist_ok=True)

### Deduplicate the descriptions

In [5]:
def implodeDataFrame(df, cols_to_groupby):
    cols_to_agg = list(df.columns)
    for col in cols_to_groupby:
        cols_to_agg.remove(col)
    agg_dict = dict.fromkeys(cols_to_agg, lambda x: x.tolist())
    return df.groupby(cols_to_groupby).agg(agg_dict).reset_index().set_index(cols_to_groupby)

First, deduplicate the EAD-formatted descriptions.

In [26]:
row_count = 0
all_descs = []
for f in file_names:
    if "_unclassified.csv" in f:
        print("Reading", f)
        # Read the file as a DataFrame
        df = pd.read_csv(unclf_dir+f, index_col=0)

        # Remove the empty column (where manual reviewers will make notes)
        df = df.drop(columns=["gender_bias?"])
        imploded = implodeDataFrame(df, ["doc","eadid"]).reset_index()
        print(df.shape, imploded.shape)
        row_count += imploded.shape[0]
        all_descs += list(imploded["doc"])
        assert imploded.shape[0] < df.shape[0], "There should be fewer rows in the DataFrame after imploding it."

        # Add a new empty column for manual reviewers to record their notes
        empty_col = [""]*imploded.shape[0]
        imploded.insert(len(imploded.columns), "gender_bias?", empty_col)
        
        # Reorder the columns
        imploded = imploded[["doc", "gender_bias?", "eadid", "description_id", "rowid", "field"]]

        # Write the imploded DataFrame into the newly created directory
        imploded.to_csv(dedup_dir+"/"+f)
print("Total rows:", row_count)

Reading THS_unclassified.csv
(10851, 5) (4865, 5)
Reading OBR_unclassified.csv
(532, 5) (455, 5)
Reading BXB_unclassified.csv
(10884, 5) (6137, 5)
Reading GB_unclassified.csv
(428, 5) (32, 5)
Reading WCT_unclassified.csv
(1122, 5) (1065, 5)
Reading CPT_unclassified.csv
(6885, 5) (4378, 5)
Reading SW_unclassified.csv
(2114, 5) (1642, 5)
Reading CHE_unclassified.csv
(268, 5) (266, 5)
Reading SH_unclassified.csv
(360, 5) (290, 5)
Reading HL_unclassified.csv
(219, 5) (211, 5)
Reading BP_unclassified.csv
(562, 5) (508, 5)
Total rows: 19849


In [23]:
print(os.listdir(dedup_dir))

['THS_unclassified.csv', 'OBR_unclassified.csv', 'BXB_unclassified.csv', 'GB_unclassified.csv', 'WCT_unclassified.csv', 'CPT_unclassified.csv', 'SW_unclassified.csv', 'CHE_unclassified.csv', 'SH_unclassified.csv', 'HL_unclassified.csv', 'BP_unclassified.csv']


In [None]:
word_count = 0
for desc in all_descs:
    word_count += len(desc)
print("Total words:", word_count)  # about 2 million

Total words: 2188536


Next, deduplicate the custom formatted descriptions (for the Gertrude Bell Archive).

In [10]:
df = pd.read_csv(unclf_data_file, index_col=0)
df = df.loc[df.eadid == "GB"]
print(df.shape)
df.head()


(24150, 5)


Unnamed: 0,description_id,eadid,rowid,field,doc
18599,18599,GB,69571,extent-and-medium,"1 letter, paper"
18600,18600,GB,69571,title,"Letter from Gertrude Bell to her stepmother, D..."
18601,18601,GB,69572,extent-and-medium,"1 letter plus envelope, paper"
18602,18602,GB,69572,title,"Letter from Gertrude Bell to her stepmother, D..."
18603,18603,GB,69573,extent-and-medium,"1 letter, paper"


In [None]:
# Implode the DataFrame of Bell descriptions, grouping the data by doc (description)
imploded = implodeDataFrame(df, ["doc","eadid"]).reset_index()
print(df.shape, imploded.shape)
row_count = imploded.shape[0]
all_descs = list(imploded["doc"])
assert imploded.shape[0] < df.shape[0], "There should be fewer rows in the DataFrame after imploding it."

# Add a new empty column for manual reviewers to record their notes
empty_col = [""]*imploded.shape[0]
imploded.insert(len(imploded.columns), "gender_bias?", empty_col)

# Reorder the columns
imploded = imploded[["doc", "gender_bias?", "eadid", "description_id", "rowid", "field"]]

print("Total rows:", row_count)

(24150, 5) (7418, 5)
Total rows: 7418


In [13]:
word_count = 0
for desc in all_descs:
    word_count += len(desc)
print("Total words:", word_count)  # about 60k, making the grand total about 2.7 million words (2188536 + 601452 = 2789988)

Total words: 601452


In [16]:
imploded.head()

Unnamed: 0,doc,gender_bias?,eadid,description_id,rowid,field
0,"""Charonion,"" Head Of A Sphinx - Rock Cut Sculp...",,GB,"[39609, 39611, 39661]","[116041, 116042, 116067]","[description, description, description]"
1,"""Gate Of The Winds"" Triumphal Arch. Nejib And ...",,GB,[36388],[114427],[description]
2,"""The Spring Of The Deer"". Men, Camels And Hors...",,GB,[34903],[113682],[description]
3,"""Throne Of Nimrud"" - Corinthian Column Thought...",,GB,[38925],[115698],[description]
4,"""Throne Of Nimrud"" - Two Corinthian Columns Th...",,GB,[38921],[115696],[description]


In [20]:
# Write the imploded DataFrame into the newly created directory
imploded.to_csv(dedup_dir+"/"+"GB_unclassified.csv")