In [2]:
# Example Notebook file demonstrating how to use the file structure
from utils.preprocess_util_lib_example import save_random_dataframe
from pathlib import Path

save_random_dataframe(Path("../output"), Path("test.csv"))

In [1]:
def determine_comma_role(name: str) -> str:
    """Given a string (someone's name), attempts to determine the role of the
    comma in the name and where it ought to belong.

    Some assumptions are made:
        * If a suffix is included in the name and the name is not just the last
          name(i.e "Doe, Jr), the format is
          (last_name suffix, first and middle name) i.e Doe iv, Jane Elisabeth

        * If a comma is used anywhere else, it is in the format of
          (last_name, first and middle name) i.e Doe, Jane Elisabeth
    Args:
        name: a string representing a name/names of individuals
    Returns:
        the name with or without a comma based on some conditions
    """
    suffixes = [
        "sr",
        "jr",
        "i",
        "ii",
        "iii",
        "iv",
        "v",
        "vi",
        "vii",
        "viii",
        "ix",
        "x",
    ]
    name_parts = name.lower().split(",")
    # if the comma is just in the end as a typo:
    if len(name_parts[1]) == 0:
        return name_parts[0].title()
    # if just the suffix in the end, leave the name as it is
    if name_parts[1].strip() in suffixes:
        return name.title()
    # at this point either it's just poor name placement, or the suffix is
    # in the beginning of the name. Either way, the first part of the list is
    # the true last name.
    last_part = name_parts.pop(0)
    first_part = " ".join(name_parts)
    return first_part.title() + " " + last_part.title()

In [2]:
determine_comma_role("DOe, Jane, Jr")

' Jane  Jr Doe'

In [3]:
def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
    """Given name related columns, return a person's likely name

    Given different formatting used accross states, errors in data entry
    and missing data, it can be difficult to determine someone's actual
    name. For example, some states have a last name column with values like
    "Doe, Jane", where the person's first name appears to have been erroneously
    included.

    Args:
        first_name: raw value of first name column
        last_name: raw value last name column
        full_name: raw value of name or full_name column
    Returns:
        The most likely full name of the person listed

    Sample Usage:
    >>> get_likely_name("Jane", "Doe", "")
    'Jane Doe'
    >>> get_likely_name("", "", "Jane Doe")
    'Jane Doe'
    >>> get_likely_name("", "Doe, Jane", "")
    'Jane Doe'
    >>> get_likely_name("Jane Doe", "Doe", "Jane Doe")
    'Jane Doe'
    >>> get_likely_name("Jane","","Doe, Sr")
    'Jane Doe, Sr'
    >>> get_likely_name("Jane Elisabeth Doe, IV","Elisabeth","Doe, IV")
    'Jane Elisabeth Doe, Iv'
    >>> get_likely_name("","","Jane Elisabeth Doe, IV")
    'Jane Elisabeth Doe Iv'
    """
    # first ensure clean input by deleting spaces:
    first_name, last_name, full_name = list(
        map(lambda x: x.lower().strip(), [first_name, last_name, full_name])
    )

    # if data is clean:
    if first_name + " " + last_name == full_name:
        return full_name

    # some names have titles or professions associated with the name. We need to
    # remove those from the name.
    titles = [
        "mr",
        "ms",
        "mrs",
        "miss",
        "prof",
        "dr",
        "doctor",
        "sir",
        "madam",
        "professor",
    ]
    names = [first_name, last_name, full_name]

    for i in range(len(names)):
        # if there is a ',' deal with it accordingly
        if "," in names[i]:
            names[i] = determine_comma_role(names[i])

        names[i] = names[i].replace(".", "").split(" ")
        names[i] = [
            name_part for name_part in names[i] if name_part not in titles
        ]
        names[i] = " ".join(names[i])

    # one last check to remove any pieces that might add extra whitespace
    names = list(filter(lambda x: x != "", names))
    names = " ".join(names)
    names = names.title().replace("  "," ").split(" ")
    final_name = []
    [final_name.append(x) for x in names if x not in final_name]
    return " ".join(final_name).strip()

In [4]:
import pandas as pd
orgs = pd.read_csv("../output/complete_organizations_table.csv")
type(orgs.id.tolist()[1000])

str

In [5]:
data = {'id':['50c7d9a1-b448-46a5-8e2d-cd15b3097360','50c7d9a1-b448-46a5-8e2d-cd15b3097360','50c7d9a1-b448-46a5-8e2d-cd15b3097360',
              '62ea1e9c-ac12-400c-b3dc-519389c0f7d3','62ea1e9c-ac12-400c-b3dc-519389c0f7d3','62ea1e9c-ac12-400c-b3dc-519389c0f7d3',
              'd31df1ca-714e-4a82-9e88-1892c0451a71','d31df1ca-714e-4a82-9e88-1892c0451a71','62ea1e9c-ac12-400c-b3dc-519389c0f7d3',
              '4db76e6e-f0d5-40eb-82de-6dbcdb562dd7','f71341d7-d27e-47eb-9b66-903af39d6cb5','c875d7de-94be-42f1-b994-dd89b114d51e',
              '910c4d36-b036-469e-aa2a-ea4ff8855a6c','60d454d1-3773-4d88-80e9-132c161da0f0','1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd',
              '1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd','1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe','1d2b5bc0-9385-4cd7-ac48-df43b3eca6ff',
              '1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd'],
        'name':['REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN PAC','REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN PAC',
                'REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN PAC','UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALLOT CLUB',
                'UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALLOT CLUB','UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALLOT CLUB',
                'COMMITTEE TO ELECT DR PATRICIA BERNARD','COMMITTEE TO ELECT DR PATRICIA BERNARD','UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALLOT CLUB',
                'Ugi Utilities Inc/Ugi Energy Services Llc Pac','Pabar Pac (Pa Bar Assn)','Pa Fraternal Order Of Police Pac','Citizens For Kail',
                'Paa Pac','MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC','MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC',
                'MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC','MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC','Paa Pac'],
        'state':['MI','MI','MI','MI','MI','MI','MI','MI','MI','PA','PA','PA','PA','PA','MI','MI','MI','MI','PA'],
        'entity_type':['committee','committee','committee','committee','committee','committee','committee','committee','committee',
                       'Organization','Organization','Organization','Organization','Organization','committee','committee','committee','committee','Organization']}

sample_df = pd.DataFrame(data)
sample_df

Unnamed: 0,id,name,state,entity_type
0,50c7d9a1-b448-46a5-8e2d-cd15b3097360,REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...,MI,committee
1,50c7d9a1-b448-46a5-8e2d-cd15b3097360,REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...,MI,committee
2,50c7d9a1-b448-46a5-8e2d-cd15b3097360,REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...,MI,committee
3,62ea1e9c-ac12-400c-b3dc-519389c0f7d3,UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...,MI,committee
4,62ea1e9c-ac12-400c-b3dc-519389c0f7d3,UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...,MI,committee
5,62ea1e9c-ac12-400c-b3dc-519389c0f7d3,UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...,MI,committee
6,d31df1ca-714e-4a82-9e88-1892c0451a71,COMMITTEE TO ELECT DR PATRICIA BERNARD,MI,committee
7,d31df1ca-714e-4a82-9e88-1892c0451a71,COMMITTEE TO ELECT DR PATRICIA BERNARD,MI,committee
8,62ea1e9c-ac12-400c-b3dc-519389c0f7d3,UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...,MI,committee
9,4db76e6e-f0d5-40eb-82de-6dbcdb562dd7,Ugi Utilities Inc/Ugi Energy Services Llc Pac,PA,Organization


In [6]:
def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:
    '''Given a dataframe, remove rows that have identical entry data beyond
    UUIDs, and output a file mapping an entry to other the UUIDs of the
    deduplicated rows
    
    Args:
        a pandas dataframe containing contribution data
    Returns:
        a deduplicated pandas dataframe containing contribution data
    '''
    #first remove all duplicate entries:
    new_df = df.drop_duplicates()

    # now find the duplicates along all columns but the ID
    cols = new_df.columns[1:]
    duplicates = new_df[new_df.duplicated(cols)]        
    new_df = new_df.drop(index=duplicates.index.tolist())
    #for index in duplicates.index:

    return new_df

In [7]:
x = deduplicate_perfect_matches(sample_df)
for i in range(len(x)):
    curr_row = x.loc[i]
    sample_df.loc[(sample_df.name == 'MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC') &
#              (sample_df.state == 'MI') &
#               (sample_df.entity_type == 'committee')]
x


Unnamed: 0,id,name,state,entity_type
16,1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe,MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC,MI,committee
17,1d2b5bc0-9385-4cd7-ac48-df43b3eca6ff,MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC,MI,committee
18,1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd,Paa Pac,PA,Organization


In [13]:
x = sample_df.groupby(sample_df.columns[1:].tolist()).count().reset_index()
x.loc[x.id >1]

Unnamed: 0,name,state,entity_type,id
0,COMMITTEE TO ELECT DR PATRICIA BERNARD,MI,committee,2
2,MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC,MI,committee,4
4,Paa Pac,PA,Organization,2
6,REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...,MI,committee,3
7,UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...,MI,committee,4


['name', 'state', 'entity_type']

In [None]:
from utils.constants import repo_root
entities.to_csv(repo_root / "output" / "deduplicated_UUIDs.csv", index=False)
