In [2]:
# Example Notebook file demonstrating how to use the file structure
from utils.preprocess_util_lib_example import save_random_dataframe
from pathlib import Path

save_random_dataframe(Path("../output"), Path("test.csv"))

In [1]:
def determine_comma_role(name: str) -> str:
    """Given a string (someone's name), attempts to determine the role of the
    comma in the name and where it ought to belong.

    Some assumptions are made:
        * If a suffix is included in the name and the name is not just the last
          name(i.e "Doe, Jr), the format is
          (last_name suffix, first and middle name) i.e Doe iv, Jane Elisabeth

        * If a comma is used anywhere else, it is in the format of
          (last_name, first and middle name) i.e Doe, Jane Elisabeth
    Args:
        name: a string representing a name/names of individuals
    Returns:
        the name with or without a comma based on some conditions
    """
    suffixes = [
        "sr",
        "jr",
        "i",
        "ii",
        "iii",
        "iv",
        "v",
        "vi",
        "vii",
        "viii",
        "ix",
        "x",
    ]
    name_parts = name.lower().split(",")
    # if the comma is just in the end as a typo:
    if len(name_parts[1]) == 0:
        return name_parts[0].title()
    # if just the suffix in the end, leave the name as it is
    if name_parts[1].strip() in suffixes:
        return name.title()
    # at this point either it's just poor name placement, or the suffix is
    # in the beginning of the name. Either way, the first part of the list is
    # the true last name.
    last_part = name_parts.pop(0)
    first_part = " ".join(name_parts)
    return first_part.title() + " " + last_part.title()

In [2]:
def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
    """Given name related columns, return a person's likely name

    Given different formatting used accross states, errors in data entry
    and missing data, it can be difficult to determine someone's actual
    name. For example, some states have a last name column with values like
    "Doe, Jane", where the person's first name appears to have been erroneously
    included.

    Args:
        first_name: raw value of first name column
        last_name: raw value last name column
        full_name: raw value of name or full_name column
    Returns:
        The most likely full name of the person listed

    Sample Usage:
    >>> get_likely_name("Jane", "Doe", "")
    'Jane Doe'
    >>> get_likely_name("", "", "Jane Doe")
    'Jane Doe'
    >>> get_likely_name("", "Doe, Jane", "")
    'Jane Doe'
    >>> get_likely_name("Jane Doe", "Doe", "Jane Doe")
    'Jane Doe'
    >>> get_likely_name("Jane","","Doe, Sr")
    'Jane Doe, Sr'
    >>> get_likely_name("Jane Elisabeth Doe, IV","Elisabeth","Doe, IV")
    'Jane Elisabeth Doe, Iv'
    >>> get_likely_name("","","Jane Elisabeth Doe, IV")
    'Jane Elisabeth Doe Iv'
    """
    # first ensure clean input by deleting spaces:
    first_name, last_name, full_name = list(
        map(lambda x: x.lower().strip(), [first_name, last_name, full_name])
    )

    # if data is clean:
    if first_name + " " + last_name == full_name:
        return full_name

    # some names have titles or professions associated with the name. We need to
    # remove those from the name.
    titles = [
        "mr",
        "ms",
        "mrs",
        "miss",
        "prof",
        "dr",
        "doctor",
        "sir",
        "madam",
        "professor",
    ]
    names = [first_name, last_name, full_name]

    for i in range(len(names)):
        # if there is a ',' deal with it accordingly
        if "," in names[i]:
            names[i] = determine_comma_role(names[i])

        names[i] = names[i].replace(".", "").split(" ")
        names[i] = [
            name_part for name_part in names[i] if name_part not in titles
        ]
        names[i] = " ".join(names[i])

    # one last check to remove any pieces that might add extra whitespace
    names = list(filter(lambda x: x != "", names))
    names = " ".join(names)
    names = names.title().replace("  "," ").split(" ")
    final_name = []
    [final_name.append(x) for x in names if x not in final_name]
    return " ".join(final_name).strip()

In [3]:
import pandas as pd
orgs_sample = pd.read_csv("../output/complete_organizations_table.csv",index_col=0).sample(10000)
inds_sample = pd.read_csv("../output/complete_individuals_table.csv",index_col=0, low_memory=False).sample(10000)


In [4]:
data = {'id':['50c7d9a1-b448-46a5-8e2d-cd15b3097360','50c7d9a1-b448-46a5-8e2d-cd15b3097360','50c7d9a1-b448-46a5-8e2d-cd15b3097360',
              '62ea1e9c-ac12-400c-b3dc-519389c0f7d3','62ea1e9c-ac12-400c-b3dc-519389c0f7d3','62ea1e9c-ac12-400c-b3dc-519389c0f7d3',
              'd31df1ca-714e-4a82-9e88-1892c0451a71','d31df1ca-714e-4a82-9e88-1892c0451a71','62ea1e9c-ac12-400c-b3dc-519389c0f7d3',
              '4db76e6e-f0d5-40eb-82de-6dbcdb562dd7','f71341d7-d27e-47eb-9b66-903af39d6cb5','c875d7de-94be-42f1-b994-dd89b114d51e',
              '910c4d36-b036-469e-aa2a-ea4ff8855a6c','60d454d1-3773-4d88-80e9-132c161da0f0','1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd',
              '1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd','1d2b5bc0-9385-4cd7-ac48-df43b3eca6fe','1d2b5bc0-9385-4cd7-ac48-df43b3eca6ff',
              '1d2b5bc0-9385-4cd7-ac48-df43b3eca6fd'],
        'name':['REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN PAC','REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN PAC',
                'REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN PAC','UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALLOT CLUB',
                'UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALLOT CLUB','UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALLOT CLUB',
                'COMMITTEE TO ELECT DR PATRICIA BERNARD','COMMITTEE TO ELECT DR PATRICIA BERNARD','UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALLOT CLUB',
                'Ugi Utilities Inc/Ugi Energy Services Llc Pac','Pabar Pac (Pa Bar Assn)','Pa Fraternal Order Of Police Pac','Citizens For Kail',
                'Paa Pac','MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC','MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC',
                'MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC','MICHIGAN ASSOCIATION OF NURSE ANESTHETISTS PAC','Paa Pac'],
        'state':['MI','MI','MI','MI','MI','MI','MI','MI','MI','PA','PA','PA','PA','PA','MI','MI','MI','MI','PA'],
        'entity_type':['committee','committee','committee','committee','committee','committee','committee','committee','committee',
                       'Organization','Organization','Organization','Organization','Organization','committee','committee','committee','committee','Organization']}

sample_df = pd.DataFrame(data)
sample_df

Unnamed: 0,id,name,state,entity_type
0,50c7d9a1-b448-46a5-8e2d-cd15b3097360,REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...,MI,committee
1,50c7d9a1-b448-46a5-8e2d-cd15b3097360,REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...,MI,committee
2,50c7d9a1-b448-46a5-8e2d-cd15b3097360,REPUBLICAN STATE LEADERSHIP COMMITTEE MICHIGAN...,MI,committee
3,62ea1e9c-ac12-400c-b3dc-519389c0f7d3,UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...,MI,committee
4,62ea1e9c-ac12-400c-b3dc-519389c0f7d3,UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...,MI,committee
5,62ea1e9c-ac12-400c-b3dc-519389c0f7d3,UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...,MI,committee
6,d31df1ca-714e-4a82-9e88-1892c0451a71,COMMITTEE TO ELECT DR PATRICIA BERNARD,MI,committee
7,d31df1ca-714e-4a82-9e88-1892c0451a71,COMMITTEE TO ELECT DR PATRICIA BERNARD,MI,committee
8,62ea1e9c-ac12-400c-b3dc-519389c0f7d3,UNITED FOOD AND COMMERCIAL WORKERS ACTIVE BALL...,MI,committee
9,4db76e6e-f0d5-40eb-82de-6dbcdb562dd7,Ugi Utilities Inc/Ugi Energy Services Llc Pac,PA,Organization


In [5]:
from utils.constants import repo_root
def convert_duplicates_to_dict(df: pd.DataFrame)->pd.DataFrame:
    '''Takes a dataframe whose indexes are UUIDs, and a column that is a list of
    all other UUIDs that have duplicate values. The function then outputs a
    dictionary file where the deduped UUIDs map to the dataframe main UUID
    
    Args:
        A pandas dataframe with UUIDs as indexes and deduplicated UUIDs
        matching up to the index in the same row
        
    Returns
        None. However it outputs a dictionary to the output directory, with 2
        columns. The first, which indicates the deduplicated UUIDs, is labeled
        'duplicated_uuids', and the 2nd, which shows the uuids to which the
        deduplicated entries match two, is labeled 'mapped_uuids'.
    '''
    deduped_dict = {}
    for i in range(len(df)):
        deduped_uudis = df.iloc[i]['duplicated']
        for j in range(len(deduped_uudis)):
            deduped_dict.update({deduped_uudis[j]:df.iloc[i]['id']})
    
    # now convert dictionary into a csv file
    deduped_df = pd.DataFrame.from_dict(deduped_dict,'index') 
    deduped_df = deduped_df.reset_index().rename(columns={"index":"duplicated_uuids", 0:"mapped_uuids"})
    deduped_df.to_csv(repo_root / "output" / "deduplicated_UUIDs.csv", index=False, mode='a')


def deduplicate_perfect_matches(df: pd.DataFrame) -> pd.DataFrame:
    '''Given a dataframe, remove rows that have identical entry data beyond
    UUIDs, and output a file mapping an entry to other the UUIDs of the
    deduplicated rows
    
    Args:
        a pandas dataframe containing contribution data
    Returns:
        a deduplicated pandas dataframe containing contribution data
    '''
    #first remove all duplicate entries:
    new_df = df.drop_duplicates()

    # now find the duplicates along all columns but the ID
    new_df=new_df.groupby(df.columns[1:].tolist(),dropna=False)["id"].agg(list).reset_index().rename(columns={"id": "duplicated"})
    new_df.index=new_df["duplicated"].str[0].tolist()
    new_df["duplicated"]=new_df["duplicated"].str[1:]

    # now convert the duplicated column into a dictionary that can will be
    # an output by only feeding the entries with duplicates
    new_df = new_df.reset_index().rename(columns = {'index':'id'})
    convert_duplicates_to_dict(new_df[new_df['duplicated'].apply(lambda x: len(x))>0][['id','duplicated']])
    new_df = new_df.drop(['duplicated'], axis=1)
    return new_df

In [6]:
x = deduplicate_perfect_matches(orgs_sample)
#len(x.iloc[2]['duplicated'])
x

Unnamed: 0,id,name,state,entity_type
0,3246120d-45fc-4d19-adee-d2aa2c5be6db,1 BOLD STEP,MI,corporation
1,8fc7e5d5-558d-42ea-bd9a-8e48a4a9a4bd,12CDRC,MI,corporation
2,a5379930-7324-4f1d-b216-84d9e9ddea40,303 MANAGEMENT INC.,MI,corporation
3,9064112f-ef40-4690-9d0a-782a2375feb0,314 ACTION FUND,MI,corporation
4,9e11e7ae-ee29-4a50-9720-41c6ac556a1f,A T AND T MICHIGAN PAC,MI,corporation
...,...,...,...,...
2149,d79f9729-c9af-4347-868a-ae6e6814a295,Zach Kirk,PA,Organization
2150,fbfea472-e183-4479-b869-90eddfa5198c,Zest Kitchen,PA,Organization
2151,c105a4af-9fd4-4a5b-a7b8-1e8738ff39c6,Zoom Us,PA,Organization
2152,59cc8db9-607e-4e1b-ba41-0850b6019360,Zoom Video Communications Inc.,PA,Organization


In [7]:
y=deduplicate_perfect_matches(inds_sample)
y

Unnamed: 0,id,first_name,last_name,full_name,entity_type,state,party,company
0,f6df631a-e626-4861-b62b-e09512887bd3,A SCOTT,PARIS,A SCOTT PARIS ...,Individual,MI,,NOT EMPLOYED
1,075fb1c6-6c70-4ec6-a439-fcebb76c4e0a,A. MARK,GLICKSTEIN,A. MARK GLICKSTEIN ...,Individual,CA,,PARTNERSHIP HEALTH PLAN OF CA
2,4a3968f5-7f5e-4ed1-8f39-bfc70bc67af8,A. MICHAEL,PALIZZI,A. MICHAEL PALIZZI ...,Individual,MI,,MILLER CANFIELD
3,bb952efc-3dba-4449-9405-ea65202fbbea,AARON,ALDRICH,AARON ALDRICH ...,Individual,MI,,MILLER PIPELINE CORP.
4,79ec4a73-f688-479a-a4e3-0b0a3813188a,AARON,BLAND,AARON BLAND ...,Individual,MI,,
...,...,...,...,...,...,...,...,...
7122,a1a6ff3b-cfa4-4b84-bf8c-20984f9871f0,,,"Trone, Robert",Individual,MN,,
7123,37ab55f5-3613-469c-8b66-ac8888f5bcae,,,"Wark, Mary Ann",Individual,MN,,
7124,92d5ac7c-4702-420c-97a7-656111677f5a,,,"Wenstrom, Gene",Individual,MN,,
7125,fa934bf1-f611-4cd3-9bff-451bdf2e5bd2,,,"Wika, Kevin",Individual,MN,,


In [17]:
a = inds_sample.drop_duplicates()
len(a)

7207

In [3]:
import numpy as np
import pandas as pd
df = pd.DataFrame({'Max Speed': [380., 370., np.nan, np.nan],
                   'Animal': ['None', 'Falcon', 'None', 'Parrot'],
                   'Color':['green',None,'yellow','blue'],
                   'Age':[2,np.nan,5,6]})
df

Unnamed: 0,Max Speed,Animal,Color,Age
0,380.0,,green,2.0
1,370.0,Falcon,,
2,,,yellow,5.0
3,,Parrot,blue,6.0


In [2]:
df= df.groupby(df.columns[1:].tolist(), dropna=False)["Max Speed"]#.agg(list)#.reset_index()
df

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fa2f9d5bb50>

In [4]:
df = df.groupby((df.columns.difference(['Max Speed'])).tolist(),dropna=False)['Max Speed'].agg(list).reset_index()
df

Unnamed: 0,Age,Animal,Color,Max Speed
0,2.0,,green,[380.0]
1,5.0,,yellow,[nan]
2,6.0,Parrot,blue,[nan]
3,,Falcon,,[370.0]


In [24]:
df

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f59594f81d0>