# Data Deduplication

Remove duplicate descriptions from the exported data for manual review.

In [1]:
# Libraries for data analysis
import pandas as pd
import re

# Custom variables
import config

# For reading and writing data
from pathlib import Path
import os

In [2]:
unclf_dir = "data/extracted/for_review/unclassified/"
file_names = os.listdir(unclf_dir)
print(file_names)

['THS_unclassified.csv', 'OBR_unclassified.csv', '.DS_Store', 'BXB_unclassified.csv', 'GB_unclassified.csv', 'WCT_unclassified.csv', 'deduplicated', 'CPT_unclassified.csv', 'SW_unclassified.csv', 'CHE_unclassified.csv', 'SH_unclassified.csv', 'HL_unclassified.csv', 'BP_unclassified.csv']


Create a directory for the deduplicated, unclassified descriptions.

In [3]:
dedup_dir = "data/extracted/for_review/unclassified/deduplicated"
Path(dedup_dir).mkdir(parents=True, exist_ok=True)

Deduplicate the descriptions.

In [4]:
def implodeDataFrame(df, cols_to_groupby):
    cols_to_agg = list(df.columns)
    for col in cols_to_groupby:
        cols_to_agg.remove(col)
    agg_dict = dict.fromkeys(cols_to_agg, lambda x: x.tolist())
    return df.groupby(cols_to_groupby).agg(agg_dict).reset_index().set_index(cols_to_groupby)

In [22]:
row_count = 0
all_descs = []
for f in file_names:
    if "_unclassified.csv" in f:
        print("Reading", f)
        # Read the file as a DataFrame
        df = pd.read_csv(unclf_dir+f, index_col=0)

        # Remove the empty column (where manual reviewers will make notes)
        df = df.drop(columns=["gender_bias?"])
        imploded = implodeDataFrame(df, ["doc","eadid"]).reset_index()
        print(df.shape, imploded.shape)
        row_count += imploded.shape[0]
        all_descs += list(imploded["doc"])
        assert imploded.shape[0] < df.shape[0], "There should be fewer rows in the DataFrame after imploding it."

        # Add a new empty column for manual reviewers to record their notes
        empty_col = [""]*imploded.shape[0]
        imploded.insert(len(imploded.columns), "gender_bias?", empty_col)
        # imploded.head()

        # Write the imploded DataFrame into the newly created directory
        imploded.to_csv(dedup_dir+"/"+f)
print("Total rows:", row_count)

Reading THS_unclassified.csv
(10851, 5) (4865, 5)
Reading OBR_unclassified.csv
(532, 5) (455, 5)
Reading BXB_unclassified.csv
(10884, 5) (6137, 5)
Reading GB_unclassified.csv
(428, 5) (32, 5)
Reading WCT_unclassified.csv
(1122, 5) (1065, 5)
Reading CPT_unclassified.csv
(6885, 5) (4378, 5)
Reading SW_unclassified.csv
(2114, 5) (1642, 5)
Reading CHE_unclassified.csv
(268, 5) (266, 5)
Reading SH_unclassified.csv
(360, 5) (290, 5)
Reading HL_unclassified.csv
(219, 5) (211, 5)
Reading BP_unclassified.csv
(562, 5) (508, 5)
Total rows: 19849


In [23]:
print(os.listdir(dedup_dir))

['THS_unclassified.csv', 'OBR_unclassified.csv', 'BXB_unclassified.csv', 'GB_unclassified.csv', 'WCT_unclassified.csv', 'CPT_unclassified.csv', 'SW_unclassified.csv', 'CHE_unclassified.csv', 'SH_unclassified.csv', 'HL_unclassified.csv', 'BP_unclassified.csv']


In [None]:
word_count = 0
for desc in all_descs:
    word_count += len(desc)
print("Total words:", word_count)  # about 2 million

Total words: 2188536
