# ULAN | check active dates
- 2024-06-07
- Date check on Getty ULAN, data acquired via SPARQL-endpoint
- V. Martens

# Import

## Import packages

In [None]:
# handling jsons
import json
from json.decoder import JSONDecodeError

# spell check module
import language_tool_python

# creating time stamps
# from datetime import datetime
import time

# importing files
import glob
import os

# progress bar
from tqdm.notebook import tqdm

# regex module
import re

# for multi-threading
from concurrent import futures
from concurrent.futures import ThreadPoolExecutor
import multiprocessing as mp

# data wrangling
import pandas as pd

# back up files
import pickle

# viaf package
from viapy import api

# preferences
# adjust pandas to show all cols
# pd.set_option('display.max_colwidth', None)

In [None]:
# query VIAF for a person name and keep the returned results in `x`
viaf_client = api.ViafAPI()
x = viaf_client.find_person('vincent van gogh')

# show the second entry of the results
print(x[1])

In [None]:
# `ViafAPI` is only reachable through the imported `api` module.
# The client gets its own name so that `x` (the result list of the previous
# cell, indexed as x[3] in the next cell) is not overwritten.
viaf = api.ViafAPI()
y = viaf.find_person('vincent')
y


In [None]:
from pprint import pprint

from attrmap import attrmap
from attrmap import AttrMap
import attrmap.utils as au

# wrap the fourth VIAF result in an AttrMap and convert it with au.todict
# (presumably into a plain nested dict - confirm against the attrmap docs)
configs = AttrMap(x[3])
configs_dict = au.todict(configs)

# print a handful of fields from the 'recordData' part of the result
record_data = configs_dict['recordData']
for field in ('viafID', 'birthDate', 'deathDate'):
    pprint(record_data[field])
# pprint(record_data['nationalityOfEntity'])
pprint(record_data['mainHeadings']['data'][1]['text'])

# pprint(record_data.keys())

## Functions

In [None]:
def load_latest_file(filepath:str) -> str:
    '''
    finds the newest file among all files matching a glob pattern
    args: glob pattern (e.g. 'data_dumps/*ulan.pickle')
    returns: path of the match with the highest os.path.getctime value
    raises: ValueError when nothing matches the pattern
    '''

    # look up the ctime of every match once
    ctime_by_path = {path: os.path.getctime(path) for path in glob.glob(filepath)}

    # max() over an empty mapping raises ValueError, same as an empty list
    latest_file = max(ctime_by_path, key=ctime_by_path.get)
    print(latest_file)

    return latest_file

def create_ulan_weblink(string:str) -> str:
    '''from a lod-url creates a regular ulan webpage link
    args: ulan lod landing page (a string containing the numeric ulan id)
    returns: regular human readable webpage, built from the first run of digits
    raises: ValueError when the string contains no digits
    '''
    # raw string: '\d' in a plain literal is an invalid escape sequence
    match_class_object = re.search(r'\d+', string)
    if match_class_object is None:
        # explicit error instead of an AttributeError on None.group
        raise ValueError(f"no ULAN id found in {string!r}")
    ulan_id = match_class_object.group(0)
    return f"https://www.getty.edu/vow/ULANFullDisplay?find=&role=&nation=&subjectid={ulan_id}"
    

## Constants

In [None]:
# one shared time stamp, so all files written by this run carry the same prefix
# (three separate strftime calls could straddle a second boundary)
time_stamp = time.strftime('%Y%m%d-%H%M%S')

# back up filename for the pickled spell-check errors
filename_df_errors = f'{time_stamp}_df_errors_TGN.pickle'

# back up filename for the pickled lod results
filename_df_lod_results = f'{time_stamp}_df_lod_results_TGN.pickle'

# filename for the excel export of the found inconsistencies
filename_excel_export = f'{time_stamp}_found_inconsistencies_TGN.xlsx'

# NOTE(review): the names end in 'TGN' while this notebook handles ULAN data and
# the load cells further down glob for '*_aat.pickle' - confirm the intended suffix
print(f"{filename_df_errors}, {filename_df_lod_results}, {filename_excel_export}")

## Import data

In [None]:
# newest ULAN dump in the data_dumps directory
latest_picke_file = load_latest_file('data_dumps/*ulan.pickle')

# pickles have to be read in binary mode
with open(latest_picke_file, 'rb') as pickle_handle:
    # deserialize the stored object
    df_ulan = pickle.load(pickle_handle)

df_ulan.shape

# Manipulate data

In [None]:
# the datatype column of the death date is dropped, it is not used below
df_ulan.drop(columns=['death.datatype'], inplace=True)

# remove '.value' from every column name in a single rename
df_ulan.rename(columns=lambda col: col.replace('.value', ''), inplace=True)

# human readable ULAN page for every record
df_ulan['web_link'] = df_ulan['ulan_id'].apply(create_ulan_weblink)

# keep only the text in front of the first '('
df_ulan['nationality'] = df_ulan['nationality'].str.split('(').str[0]
df_ulan['type'] = df_ulan['type'].str.split('(').str[0]

# extract the date part of the biography; three patterns, tried in this order.
# raw strings: '\d', '\s' and '\,' are invalid escape sequences in plain literals
df_ulan['active'] = df_ulan['bio'].str.extract(r'(\d{2}th-\d{2}th[\s]+centuries|\d{4}\,[\s]+died[\s]+\d{4}|\d{4}-\d{4}|\d{4}-|\d{4})', expand=False)
df_ulan['active2'] = df_ulan['bio'].str.extract(r'active(.*\d{2,}[t][h])', expand=False)
df_ulan['active3'] = df_ulan['bio'].str.extract(r'(\d{1,2}th[\s]+century)', expand=False)

# fall back to the second and then the third pattern where the first found nothing
df_ulan['active'] = df_ulan['active'].fillna(df_ulan['active2']).fillna(df_ulan['active3']).str.strip()
df_ulan.drop(columns=['active2', 'active3'], inplace=True)

# biographies that contain digits but matched none of the patterns
unchecked = df_ulan[df_ulan['active'].isnull() & df_ulan['bio'].str.contains(r'\d+', na=False)]

# continue with the records for which a date part was found
df_ulan = df_ulan[df_ulan['active'].notnull()]

In [None]:
# preview the first two rows of df_ulan
df_ulan.head(2)

In [None]:
# display df_ulan
df_ulan

In [None]:
# preview the records in `unchecked`
unchecked.head()

In [None]:
%%time

from tqdm import tqdm

# instance of language_tool_python
tool = language_tool_python.LanguageTool('en-US')

# Start pool
thread_pool = ThreadPoolExecutor(max_workers=cpu_count, thread_name_prefix = 'Thread')

# reate futures
futures = [thread_pool.submit(check_df_row, results_df, 'ScopeNote.value', i) for i in tqdm(range(len(results_df)), total=len(results_df), desc='building futures')]

# submit tasks
results = [future.result() for future in tqdm(futures, total=len(futures), desc='spell check data')]

# # Changed the func to add a df[col], to make it more compatible
# # instance of language_tool_python
# tool = language_tool_python.LanguageTool('en-US')

# # Start pool
# thread_pool = ThreadPoolExecutor(max_workers=cpu_count, thread_name_prefix = 'Thread')

# # create futures
# futures = [thread_pool.submit(check_df_row, results_df, i) for i in tqdm(range(len(results_df)) , total=len(results_df), desc='building futures')]

# # submit tasks
# results = [future.result() for future in tqdm(futures, total=len(futures), desc='spell check data')]

# results = []
# for future in tqdm(futures, total=len(futures), desc='spell check data'):
#     try:
#         results.append(future.result())
#     except (JSONDecodeError, NameError) as e:
#         print(e)
#         pass

# # close tool
# tool.close()

In [None]:
# parse the collected spell-check results into df_errors
# NOTE(review): parse_list_of_jsons is not defined in the visible part of this
# notebook - confirm it is defined or imported before this cell runs
df_errors = parse_list_of_jsons(results)

In [None]:
# preview the first five rows of df_errors
df_errors.head(5)

# Back up as a pickle

##  Write

In [None]:
def _backup_as_pickle(obj, filename):
    '''pickles obj to data_dumps/<filename> with the highest pickle protocol'''
    with open(f'data_dumps/{filename}', 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)


# errors first, then the lod results
_backup_as_pickle(df_errors, filename_df_errors)
_backup_as_pickle(results_df, filename_df_lod_results)

## Load

In [None]:
# newest back-up that matches the lod-results pattern
latest_picke_file = load_latest_file('data_dumps/*lod_results_aat.pickle')

# pickles have to be read in binary mode
with open(latest_picke_file, 'rb') as pickle_handle:
    # deserialize the stored object
    df_lod_results_backup = pickle.load(pickle_handle)

df_lod_results_backup.shape

In [None]:
# newest back-up that matches the errors pattern
latest_picke_file = load_latest_file('data_dumps/*errors_aat.pickle')

# pickles have to be read in binary mode
with open(latest_picke_file, 'rb') as pickle_handle:
    # deserialize the stored object
    df_error_backup = pickle.load(pickle_handle)

df_error_backup.shape

In [None]:
# turn every lod link into the matching human readable ulan page
lod_urls = df_error_backup['url']
df_error_backup['url_ulan'] = lod_urls.apply(create_ulan_weblink)

In [None]:
# value_counts sorts by descending frequency, so tail(50) shows the 50 least frequent misspelled words
df_error_backup['misspelledWord'].value_counts().tail(50)

In [None]:
# rule ids that are treated as punctuation errors
punctuation_rule_ids = ['UPPERCASE_SENTENCE_START', 'COMMA_PARENTHESIS_WHITESPACE', 'WHITESPACE_RULE']
is_punctuation_error = df_error_backup['ruleId'].isin(punctuation_rule_ids)

# punctuation errors go into their own frame ...
df_punctuation = df_error_backup[is_punctuation_error]

# ... and everything else stays in df_error_backup
df_error_backup = df_error_backup[~is_punctuation_error]

# Filter out spelling mistakes

In [None]:
# The punctuation errors are already split off into df_punctuation by the
# previous cell. Repeating that split here would rebuild df_punctuation from the
# already filtered df_error_backup and leave it empty, so the punctuation rows
# would be missing from the export.

# checked errors: the misspelledWord values to keep (exact match, trailing
# spaces are significant)
# NOTE(review): ') | ' looks like a search/replace artefact - confirm it is intended
checked_misspellings = [
    'stained glass', 'Brazillian', 'in United States', 'multi-media ',
    'Britsh', 'architec', 'Dusseldorf', 'architecht',
    'lanscape', 'Ialian', 'Brazlian', 'Beligian',
    'deigner', 'Ameican', 'archtect', 'artistan ',
    'comtemporary', 'cetnury', 'Neterlandish', 'aarchitect',
    'Kosovan', 'Oregan', 'acitve ', 'photogapher',
    'Scotish', 'cenury', 'German born ', 'lithpgrapher',
    'eaqrly', ') | ', 'dminsitrator', 'stuccotist',
    'Amerian', 'U.S', 'in 1890s', 'enameller',
    'borm', 'activeca', 'terra-cotta', 'Malasian',
    'Spanis', 'architet', 'veduta', 'painterand',
    'contemporay', 'scuptor', 'cenutury', 'paintier',
    'mother of pearl ', 'tex', 'laten', 'medallist',
    'Vir', 'baptised', 'chromolithographer', 'Ameerican',
    'arist', 'actove', 'Japenese', 'scultptor',
    'during 1940s', 'Brisith', 'in 1920s', 'cermaicist',
    'boatbuillder', 'nterior', 'in 1590s', 'Moroccon',
    'illlustrator', 'central Europe', 'Belgan', 'Piemont',
    'photographert', 'Fernch', 'aritst', 'Austrialian',
    "1680's ", 'cenutry', 'administator', 'archtiect',
    'drafsman', 'Canda', 'M ', 'late late',
    'jewlery', 'photolithographer', 'Chiliean', '20 century',
    'comteporary', 'Luxumbourgian', 'Amercian', 'horiculturist',
    'Ventian', 'glass-blower', 'achitect', 'Argentinine',
    'lanscapist', 'Mayasian', 'armourer', 'counsellor',
    'Enlgand', 'phototgrapher', 'paitner', 'Hong kong',
    'jeweller', 'bronzeworker', 'photograper', 'actve',
    'dutchess ', 'Berkely', 'baptised ', 'Japansese',
    'scupltor', 'contempory ', 'archiect', 'copyistr',
    'active active', 'writre', 'ironsmith', 'Italian Italian',
    'gardner', 'Geman', 'Austalian', 'paintemaker ',
    'coppersmith', 'architerct', 'and and', 'copyest',
    'evironmental ', 'articet', 'landscsape ', 'BagHdad',
    'Hoston', 'craftsperson', 'Austrial', 'Japanse',
    'eldest', 'Portugese', 'intsallation', 'amatuer',
    'archictect', 'Britian', 'draftsmann', 'tilemaker',
    'scenograper', 'modelmaker', 'anf', 'borrn',
    'photograher', 'Bazil', 'metal worker', 'installlation',
    'photgrapher', 'garderner ', 'Nertherlands', 'draftman',
    'Jamiacan', 'Michegan', 'deisgner', 'Dutch born',
]

# extract checked errors: one isin() lookup instead of a chain of == comparisons
df_checked_errors = df_error_backup[df_error_backup['misspelledWord'].isin(checked_misspellings)]

print(f"punctuation: {df_punctuation.shape}, \n errors_left: {df_error_backup.shape}, \n checked_errors: {df_checked_errors.shape}")

In [None]:
# rows whose biography mentions 'Holland'; na=False treats missing biographies
# as no match, .copy() gives an independent frame so the column added below is
# not written onto a slice of results_df
df_holland = results_df[results_df['bio.value'].str.contains('Holland', na=False)].copy()

# creates human readable ulan link, based on lod link
df_holland['url_ulan'] = df_holland['x.value'].apply(create_ulan_weblink)

# the one erroneous holland
df_holland = df_holland[df_holland['url_ulan'].str.contains('500256837', na=False)].copy()

In [None]:
# work on an independent copy so the assignments below do not target a slice
df_holland = df_holland.copy()

# add cols to align with the columns of the other error frames
df_holland['ruleId'] = 'PREFERRED SPELLING'
df_holland['message'] = 'Preferred spelling is the Netherlands.'
# scalar assignment works for any number of rows; the former one-element list
# raised a length mismatch unless exactly one row was left, and for one row it
# stored this same string
# NOTE(review): confirm whether a list per row was intended here
df_holland['replacements'] = 'The Netherlands'
df_holland['offsetInContext'] = 0
df_holland['context'] = df_holland['bio.value']
df_holland['offset'] = 0
df_holland['errorLength'] = 0
df_holland['category'] = 'PREFERRED SPELLING'
df_holland['ruleIssueType'] = 'typographical'
df_holland['sentence'] = df_holland['bio.value']
df_holland['misspelledWord'] = 'Holland'

# rename cols
df_holland = df_holland.rename(columns={'x.value':'url'})

# select cols
df_holland = df_holland[['url', 'ruleId', 'message', 'replacements', 'offsetInContext',
       'context', 'offset', 'errorLength', 'category', 'ruleIssueType',
       'sentence', 'misspelledWord', 'url_ulan']]

# Merge all found errors and create export

In [None]:
# columns of the export, in output order ('context' is deliberately left out)
export_columns = [
    'url', 'url_ulan', 'sentence', 'ruleId', 'message', 'replacements',
    'offsetInContext', 'offset', 'errorLength', 'category', 'ruleIssueType',
    'misspelledWord',
]

# stack the three error frames, keep the export columns, rename the lod url
# column and renumber the rows from 0
df_spell_check = (
    pd.concat([df_checked_errors, df_punctuation, df_holland])[export_columns]
    .rename(columns={'url' : 'url_lod'})
    .reset_index(drop=True)
)

# Export

In [None]:
# display the filename used for the excel export
filename_excel_export

In [None]:
# os.path.join picks the separator of the running platform; the hard-coded
# backslash only produced a path inside data_dumps on Windows
df_spell_check.to_excel(os.path.join('data_dumps', filename_excel_export), index=False)

# Check for similar names

In [None]:
# https://stackoverflow.com/questions/52631291/vectorizing-or-speeding-up-fuzzywuzzy-string-matching-on-pandas-column

import pandas as pd 
import numpy as np


# df = pd.DataFrame([['cliftonlarsonallen llp minneapolis MN'],
#         ['loeb and troper llp newyork NY'],
#         ["dauby o'connor and zaleski llc carmel IN"],
#         ['wegner cpas llp madison WI']],
#         columns=['org_name'])

# all values of the 'name.value' column as a plain list; find_match compares against this list
name_vals = results_df['name.value'].to_list()

# name_vals = name_vals[0:10]

# similarity threshold; only referenced by the commented-out line inside find_match
threshold = 90

def find_match(x):
    '''looks up the best fuzzy matches for x in the global name_vals list
    args: the string to compare
    returns: the result of process.extract limited to 2 entries, scored with
             fuzz.partial_token_sort_ratio
    '''
    # imported here because the notebook imports fuzzywuzzy only in a later cell
    from fuzzywuzzy import process, fuzz

    match = process.extract(x, name_vals, limit=2, scorer=fuzz.partial_token_sort_ratio)
#     match = match if match[1] > threshold else np.nan
    return match

# results_df['match_found'] = results_df['name.value'].progress_apply(find_match)

In [None]:
from fuzzywuzzy import process, fuzz

In [None]:
%%timeit

# create list
name_vals = results_df['name.value'].to_list()

name_vals = name_vals[0:5]

#Create tuples of brand names, matched brand names, and the score
score_sort = [(x,) + i
             for x in tqdm(name_vals)
             for i in process.extract(x, name_vals, scorer=fuzz.token_sort_ratio)]

In [None]:
# Create a dataframe from the (name, matched name, score) tuples
df_similarity = pd.DataFrame(score_sort, columns=['artist','match_sort','similarity_score'])

# NOTE: the score column is called 'similarity_score' (see the columns above)
# df_similarity = df_similarity[(df_similarity['similarity_score'] > 91) &
#                               (df_similarity['similarity_score'] != 100)]

# # create back up filename for a pickle
# time_stamp = time.strftime('%Y%m%d-%H%M%S')
# filename_df_similarity = f'{time_stamp}_df_similarity.xlsx'

# # export
# df_similarity.to_excel('data_dumps\\' + filename_df_similarity, index=False)

# Other options for spell-checks

In [None]:
# two-letter abbreviations to look for
states = ['IA', 'KS', 'UT', 'VA', 'NC', 'NE', 'SD', 'AL', 'ID', 'FM', 'DE', 'AK', 'CT', 'PR', 'NM', 'MS', 'PW', 'CO', 'NJ', 'FL', 'MN', 'VI', 'NV', 'AZ', 'WI', 'ND', 'PA', 'OK', 'KY', 'RI', 'NH', 'MO', 'ME', 'VT', 'GA', 'GU', 'AS', 'NY', 'CA', 'HI', 'IL', 'TN', 'MA', 'OH', 'MD', 'MI', 'WY', 'WA', 'OR', 'MH', 'SC', 'IN', 'LA', 'MP', 'DC', 'MT', 'AR', 'WV', 'TX']

# whole-word match of a two-letter abbreviation
# NOTE(review): with re.IGNORECASE this also matches ordinary words such as
# 'in', 'or' and 'me' - confirm that this is wanted
regex = re.compile(r'\b(' + '|'.join(states) + r')\b', re.IGNORECASE)

# the same abbreviations in dotted form: 'IA' -> 'I.A.'
states2 = [f'{abbreviation[0]}.{abbreviation[1]}.' for abbreviation in states]

# built from states2 (it was built from states, making it a copy of `regex`).
# re.escape makes the dots literal; lookarounds replace \b because \b cannot
# match after a trailing '.' that is followed by a space or the end of the text
regex2 = re.compile(r'(?<!\w)(' + '|'.join(re.escape(abbreviation) for abbreviation in states2) + r')(?!\w)', re.IGNORECASE)


In [None]:
# Dotted two-letter codes that state_finder looks for, e.g. 'N.Y.' or 'T.X.'
_DOTTED_STATE_CODES = [
    'I.A.', 'K.S.', 'U.T.', 'V.A.', 'N.C.', 'N.E.', 'S.D.', 'A.L.', 'I.D.',
    'F.M.', 'D.E.', 'A.K.', 'C.T.', 'P.R.', 'N.M.', 'M.S.', 'P.W.', 'C.O.',
    'N.J.', 'F.L.', 'M.N.', 'V.I.', 'N.V.', 'A.Z.', 'W.I.', 'N.D.', 'P.A.',
    'O.K.', 'K.Y.', 'R.I.', 'N.H.', 'M.O.', 'M.E.', 'V.T.', 'G.A.', 'G.U.',
    'A.S.', 'N.Y.', 'C.A.', 'H.I.', 'I.L.', 'T.N.', 'M.A.', 'O.H.', 'M.D.',
    'M.I.', 'W.Y.', 'W.A.', 'O.R.', 'M.H.', 'S.C.', 'I.N.', 'L.A.', 'M.P.',
    'D.C.', 'M.T.', 'A.R.', 'W.V.', 'T.X.',
]

# Compiled once instead of on every call. The dots are escaped so they match
# a literal '.' only, and look-arounds replace \b because \b cannot match
# between the closing '.' and a following space or the end of the text.
_DOTTED_STATE_RE = re.compile(
    r'(?<!\w)(' + '|'.join(map(re.escape, _DOTTED_STATE_CODES)) + r')(?!\w)',
    re.IGNORECASE,
)


def state_finder(string:str) -> list:
    '''
    finds dotted two-letter state abbreviations (e.g. 'N.Y.') in a text
    args: text to search; other values are converted with str() first
    returns: list with every abbreviation found, spelled as in the text and
             in order of appearance; empty list when nothing is found
    '''
    return _DOTTED_STATE_RE.findall(str(string))

In [None]:

# register `progress_apply` on pandas objects; the label names this job
# (the former text was left over from an unrelated example)
tqdm.pandas(desc="finding state abbreviations")
# store the result of state_finder for every biography
results_df['test'] = results_df['bio.value'].progress_apply(state_finder)

In [None]:
# rows whose 'test' entry has a length above 2
results_df.loc[results_df['test'].str.len().gt(2)]

In [None]:
# Rows whose biography mentions California in one of four spellings.
# The pattern is a raw string so that '\.' is a literal dot; without the
# r-prefix Python warns about an invalid escape sequence. na=False counts
# missing biographies as non-matches, as the former `== True` checks did.
results_df[
    results_df['bio.value'].str.contains(
        r'Calif\.|CA|C\.A\.|California',
        na=False,
    )
]
# (results_df['bio.value'].str.contains('K.S.') == True) |
# (results_df['bio.value'].str.contains('U.T.') == True) |
# (results_df['bio.value'].str.contains('V.A.') == True) |
# (results_df['bio.value'].str.contains('N.C.') == True) |
# (results_df['bio.value'].str.contains('N.E.') == True) |
# (results_df['bio.value'].str.contains('S.D.') == True) |
# (results_df['bio.value'].str.contains('A.L.') == True) |
# (results_df['bio.value'].str.contains('I.D.') == True) |
# (results_df['bio.value'].str.contains('F.M.') == True) |
# (results_df['bio.value'].str.contains('D.E.') == True) |
# (results_df['bio.value'].str.contains('A.K.') == True) |
# (results_df['bio.value'].str.contains('C.T.') == True) |
# (results_df['bio.value'].str.contains('P.R.') == True) |
# (results_df['bio.value'].str.contains('N.M.') == True) |
# (results_df['bio.value'].str.contains('M.S.') == True) |
# (results_df['bio.value'].str.contains('P.W.') == True) |
# (results_df['bio.value'].str.contains('C.O.') == True) |
# (results_df['bio.value'].str.contains('N.J.') == True) |
# (results_df['bio.value'].str.contains('F.L.') == True) |
# (results_df['bio.value'].str.contains('M.N.') == True) |
# (results_df['bio.value'].str.contains('V.I.') == True) |
# (results_df['bio.value'].str.contains('N.V.') == True) |
# (results_df['bio.value'].str.contains('A.Z.') == True) |
# (results_df['bio.value'].str.contains('W.I.') == True) |
# (results_df['bio.value'].str.contains('N.D.') == True) |
# (results_df['bio.value'].str.contains('P.A.') == True) |
# (results_df['bio.value'].str.contains('O.K.') == True) |
# (results_df['bio.value'].str.contains('K.Y.') == True) |
# (results_df['bio.value'].str.contains('R.I.') == True) |
# (results_df['bio.value'].str.contains('N.H.') == True) |
# (results_df['bio.value'].str.contains('M.O.') == True) |
# (results_df['bio.value'].str.contains('M.E.') == True) |
# (results_df['bio.value'].str.contains('V.T.') == True) |
# (results_df['bio.value'].str.contains('G.A.') == True) |
# (results_df['bio.value'].str.contains('G.U.') == True) |
# (results_df['bio.value'].str.contains('A.S.') == True) |
# (results_df['bio.value'].str.contains('N.Y.') == True) |
# (results_df['bio.value'].str.contains('C.A.') == True) |
# (results_df['bio.value'].str.contains('H.I.') == True) |
# (results_df['bio.value'].str.contains('I.L.') == True) |
# (results_df['bio.value'].str.contains('T.N.') == True) |
# (results_df['bio.value'].str.contains('M.A.') == True) |
# (results_df['bio.value'].str.contains('O.H.') == True) |
# (results_df['bio.value'].str.contains('M.D.') == True) |
# (results_df['bio.value'].str.contains('M.I.') == True) |
# (results_df['bio.value'].str.contains('W.Y.') == True) |
# (results_df['bio.value'].str.contains('W.A.') == True) |
# (results_df['bio.value'].str.contains('O.R.') == True) |
# (results_df['bio.value'].str.contains('M.H.') == True) |
# (results_df['bio.value'].str.contains('S.C.') == True) |
# (results_df['bio.value'].str.contains('I.N.') == True) |
# (results_df['bio.value'].str.contains('L.A.') == True) |
# (results_df['bio.value'].str.contains('M.P.') == True) |
# (results_df['bio.value'].str.contains('D.C.') == True) |
# (results_df['bio.value'].str.contains('M.T.') == True) |
# (results_df['bio.value'].str.contains('A.R.') == True) |
# (results_df['bio.value'].str.contains('W.V.') == True) |
# (results_df['bio.value'].str.contains('T.X.') == True)]


In [None]:
# Rows whose biography mentions Texas as 'TX', 'T.X.' or ' Tex.'.
#
# The dots are escaped so that they match a literal '.' only; unescaped,
# 'T.X.' and ' Tex.' accept any character in place of each dot. na=False
# counts missing biographies as non-matches, as the former `== True` checks did.
#
# To probe another state, swap the alternatives in the pattern. Spellings
# that were listed here as candidates:
#   - the full state and territory names ('Alabama' ... 'Virgin Islands')
#   - the two-letter codes ('AL' ... 'VI')
#   - abbreviations with a leading space:
#       Ala., Alaska, Ariz., Ark., Calif., Color., Conn., Del., Fla., Ga.,
#       Hawaii, Idaho, Ill., Ind., Iowa, Kan., Ky., La., Maine, Md., Mass.,
#       Mich., Minn., Miss., Mo., Mont., Neb., Nev., N.H., N.J., N.M., N.Y.,
#       N.C., N.D., Ohio, Okla., Ore., Pa., R.I., S.C., S.Dak., Tenn., Tex.,
#       Utah, V.T., Va., Wash., W.Va., Wis., Wyo., D.C., Guam, M.I., CNMI,
#       P.R. or PUR, V.I.
results_df[
    results_df['bio.value'].str.contains(r'TX|T\.X\.| Tex\.', na=False)
]

In [None]:
# Pull the first standalone two-letter uppercase code out of each biography,
# skipping the era markers BC and CE.
# NOTE(review): the former pattern ended in '(A-Z)', which only matches the
# literal text "A-Z"; a two-letter character class is presumably what was
# meant - confirm.
results_df['test'] = results_df['bio.value'].str.extract(
    r'\b(?!(?:BC|CE)\b)([A-Z]{2})\b',
    expand=False,  # one group: return a Series, not a one-column DataFrame
)

In [None]:
# rows for which the extraction produced a value
results_df[results_df['test'].notna()]

In [None]:
# first two rows whose biography contains 'Tex'; missing biographies count as no match
results_df[results_df['bio.value'].str.contains('Tex', na=False)].head(2)

In [None]:
# peek at the first five rows
results_df.head(n=5)

In [None]:
# frequency of each distinct value in the 'misspelledWord' column
no_upper.loc[:, 'misspelledWord'].value_counts()

In [None]:
# the 'misspelledWord' entry at position 26 (empty result if there are fewer rows)
no_upper.iloc[26:27]['misspelledWord']

In [None]:
# Capture the first hyphen that has whitespace on both sides of it.
# The pattern is a raw string so that '\s' reaches the regex engine without
# Python warning about an invalid escape sequence.
df_errors['test'] = (
    df_errors['context']
    .astype(str)
    .str.extract(r'(\s+-\s+)', expand=False)  # one group: return a Series
)

In [None]:
# first ten rows in which a spaced hyphen was captured
df_errors[df_errors['test'].notna()].head(10)