In [15]:
import requests
import pandas as pd
import numpy as np

In [6]:
import requests, zipfile, io
url = "https://s3.amazonaws.com/dl.ncsbe.gov/data/ncvoter90.zip"
r = requests.get(url)
r.ok

True

In [7]:
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall("App_Data/")

In [8]:
newest_vreg = pd.read_table("App_Data/ncvoter90.txt", encoding='ISO-8859-1')
newest_vreg.head()

Unnamed: 0,county_id,county_desc,voter_reg_num,status_cd,voter_status_desc,reason_cd,voter_status_reason_desc,absent_ind,name_prefx_cd,last_name,...,munic_dist_desc,dist_1_abbrv,dist_1_desc,dist_2_abbrv,dist_2_desc,confidential_ind,birth_year,ncid,vtd_abbrv,vtd_desc
0,90,UNION,346446,A,ACTIVE,AV,VERIFIED,,,AALAEI,...,WAXHAW,30.0,30TH PROSECUTORIAL,,,N,1974,EF230133,020A,020A
1,90,UNION,348004,A,ACTIVE,AV,VERIFIED,,,AALAEI,...,WAXHAW,30.0,30TH PROSECUTORIAL,,,N,1974,EF231096,020A,020A
2,90,UNION,281016,I,INACTIVE,IN,CONFIRMATION NOT RETURNED,,,AALBORG,...,INDIAN TRAIL,30.0,30TH PROSECUTORIAL,,,N,1994,AL261840,029A,029A
3,90,UNION,288489,A,ACTIVE,AV,VERIFIED,,,AALBORG,...,INDIAN TRAIL,30.0,30TH PROSECUTORIAL,,,N,1958,AL178177,029A,029A
4,90,UNION,279308,A,ACTIVE,AV,VERIFIED,,,AALBORG,...,INDIAN TRAIL,30.0,30TH PROSECUTORIAL,,,N,1958,AL193958,029A,029A


In [9]:
newest_vreg['ncid'].nunique()

179675

In [16]:
def get_birth_reg_census(state):
    
    if state in ['AS', 'GU', 'MP', 'PR', 'VI', 'OC']:
        return 'Other'
    
    # the rest of the categories are based on U.S. Census Bureau regions
    elif state in ['CT', 'ME', 'MA', 'NH', 'RI', 'VT',
                     'NJ', 'NY', 'PA']:
        return 'Northeast'
    
    elif state in ['DE', 'FL', 'GA', 'MD', 'NC', 'SC', 'VA', 
                     'DC', 'WV', 'AL', 'KY', 'MS', 'TN', 'AR',
                     'LA', 'OK', 'TX']:
        return 'South'
    
    elif state in ['IL', 'IN', 'MI', 'OH', 'WI',
                     'IA', 'KS', 'MN', 'MO', 'NE', 'ND', 'SD']:
        return 'Midwest'
    
    elif state in ['AZ', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT',
                     'WY', 'AK', 'CA', 'HI', 'OR', 'WA']:
        return 'West'
    
    else:
        return 'Missing'

In [17]:
def get_gen_grp(birth_year):
    
    if birth_year < 1946:
        return 'Greatest-Silent'
    
    elif (birth_year > 1945) & (birth_year < 1965):
        return 'Boomer'
    
    elif (birth_year > 1964) & (birth_year < 1981):
        return 'GenX'
    
    elif (birth_year > 1980) & (birth_year < 1997):
        return 'Millennial'
    
    elif birth_year > 1996:
        return 'GenZ'
    
    else:
        return 'Missing'

In [18]:
def clean_vreg(df):
    ## Recast registr_dt as datetime variable
    df['registr_dt'] = pd.to_datetime(df['registr_dt'])

    ## Fill null values in birth_state with 'Missing'
    df['birth_state'].fillna(value='Missing', inplace=True)

    # Recast drivers_lic for sake of clarity in figures
    df['drivers_lic'] = np.where(df['drivers_lic']=='Y',
                                 'License',
                                 'No License')

    ## Create new column grouping most infrequent party categories (<5% of voters)
      ## into same group as those who are unaffiliated
    df['party_grp'] = np.where(df['party_cd'].isin(['REP', 'DEM']),
                               df['party_cd'].str.title(),
                               'Other')

    ## Create new column grouping most infrequent race categories (<5% of voters)
    df['race_grp'] = np.where(df['race_code'].isin(['W', 'B', 'U']),
                              df['race_code'],
                              'O')
    
    race_grp_map = {'W': 'White',
                    'B': 'Black',
                    'U': 'Undesig.',
                    'O': 'Other'}
    
    df['race_grp'] = df['race_grp'].map(race_grp_map)

    ## Create new column grouping most infrequent cities (<5% of voters)
    df['res_city_desc'].fillna('Missing', inplace=True)
    df['city_grp'] = np.where(df['res_city_desc'].isin(['MONROE', 
                                                        'WAXHAW',
                                                        'INDIAN TRAIL',
                                                        'MATTHEWS',
                                                        'Missing']),
                              df['res_city_desc'].str.title(),
                              'Other')

    ## Create a new column grouping birth_state into U.S. Census regions,
      ## lumping territories and out of country into 'Other'
    df['birth_reg_other'] = df['birth_state'].apply(get_birth_reg_census)

    ## Create a new column grouping birth_year into generations, 
      ## also lumping Silent in with Greatest
    df['gen_grp'] = df['birth_year'].apply(get_gen_grp)

    ## Reformat voter_status_desc labels
    df['voter_status_desc'] = np.where(
        df['voter_status_desc']=='TEMPORARY',
        'Temp',
        df['voter_status_desc'].str.title())

    ## Select only the necessary columns
    cleaned_df = df[['voter_status_desc', 'reason_cd', 'city_grp', 
             'race_grp', 'party_grp', 'gen_grp', 'gender_code', 
             'birth_age', 'birth_reg_other', 'drivers_lic',
             'registr_dt']].copy()
    
    return cleaned_df

In [19]:
cleaned_vreg = clean_vreg(newest_vreg)
cleaned_vreg.head()

Unnamed: 0,voter_status_desc,reason_cd,city_grp,race_grp,party_grp,gen_grp,gender_code,birth_age,birth_reg_other,drivers_lic,registr_dt
0,Active,AV,Waxhaw,White,Other,GenX,M,46,Other,License,2019-04-25
1,Active,AV,Waxhaw,Other,Dem,GenX,F,46,Other,License,2019-06-24
2,Inactive,IN,Indian Trail,White,Other,Millennial,F,27,Midwest,License,2013-11-04
3,Active,AV,Indian Trail,White,Other,Boomer,M,62,South,License,2014-09-17
4,Active,AV,Indian Trail,White,Rep,Boomer,F,62,Northeast,License,2013-07-24


In [20]:
cleaned_vreg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179675 entries, 0 to 179674
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   voter_status_desc  179675 non-null  object        
 1   reason_cd          179675 non-null  object        
 2   city_grp           179675 non-null  object        
 3   race_grp           179675 non-null  object        
 4   party_grp          179675 non-null  object        
 5   gen_grp            179675 non-null  object        
 6   gender_code        179675 non-null  object        
 7   birth_age          179675 non-null  int64         
 8   birth_reg_other    179675 non-null  object        
 9   drivers_lic        179675 non-null  object        
 10  registr_dt         179675 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(9)
memory usage: 15.1+ MB


In [2]:
import schedule
import time

In [4]:
def some_job():
    print("working")
    
schedule.every(10).seconds.do(some_job)

while 1:
    schedule.run_pending()
    time.sleep(1)

working
working
working


KeyboardInterrupt: 