In [80]:
import pandas as pd
import io
from tqdm import tqdm
from itertools import islice
from time import sleep
# Can probably get the census tract based on an api: https://geo.fcc.gov/api/census/#!/area/get_area
# but this is different based on the census year

In [40]:
filepath = '../data/NAD_r11.txt'

In [67]:
first_line = None

with open(filepath) as f:
    first_line = f.readline()

print(first_line)

OID,State,County,Inc_Muni,Uninc_Comm,Nbrhd_Comm,Post_Comm,Zip_Code,Plus_4,Bulk_Zip,Bulk_Plus4,StN_PreMod,StN_PreDir,StN_PreTyp,StN_PreSep,StreetName,StN_PosTyp,StN_PosDir,StN_PosMod,AddNum_Pre,Add_Number,AddNum_Suf,LandmkPart,LandmkName,Building,Floor,Unit,Room,Addtl_Loc,Milepost,Longitude,Latitude,NatGrid_Coord,GUID,Addr_Type,Placement,Source,AddAuth,UniqWithin,LastUpdate,Effective,Expired



In [68]:
total_lines = sum(1 for line in open(filepath))
print('Total number of lines: %s' % total_lines)

Total number of lines: 67357690


In [76]:
import warnings
warnings.filterwarnings('ignore')

In [103]:
step = 1000
dfs = []

with tqdm(total=total_lines-1) as pbar:
    with open(filepath) as f:
        f.readline() # skip first line
        i = 0
        lines = first_line
        for line in f:
            pbar.update(1)
            i += 1
            lines += line
            if i > 1000:
                i = 0
                df = pd.read_csv(io.StringIO(lines), low_memory=False)
                pdf = df[['State', 'County', 'Zip_Code', 'Add_Number', 'StreetName', 'Longitude', 'Latitude']]
                pdf = pdf.astype(str)                
                pdf['Address'] = pdf['Add_Number'] + ' ' + pdf['StreetName'] + ' ' + pdf['State'] + ' ' + pdf['Zip_Code']
                dfs.append(pdf)
                lines = first_line
                

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 67357689/67357689 [07:26<00:00, 150884.01it/s]


In [44]:
# Column descriptors: https://www.transportation.gov/sites/dot.gov/files/docs/mission/gis/national-address-database/308816/nad-schema-v1.pdf

In [104]:
len(dfs)

67290

In [105]:
df = pd.concat(dfs)

In [106]:
df

Unnamed: 0,State,County,Zip_Code,Add_Number,StreetName,Longitude,Latitude,Address
0,AZ,Coconino,86336,95,LYNX,-111.771770637,34.8280627940001,95 LYNX AZ 86336
1,AZ,Coconino,86336,33,EAGLE,-111.769395183,34.828455976,33 EAGLE AZ 86336
2,AZ,Coconino,86336,141,VISTA BONITA,-111.777281707,34.828623958,141 VISTA BONITA AZ 86336
3,AZ,Coconino,86336,109,BADGER,-111.775765885,34.8291778600001,109 BADGER AZ 86336
4,AZ,Coconino,86336,30,FAWN,-111.773383133,34.824991696,30 FAWN AZ 86336
...,...,...,...,...,...,...,...,...
996,PA,Westmoreland,,3073,Audrey,-79.57448066349987,40.27125643079627,3073 Audrey PA nan
997,PA,Westmoreland,,3075,Audrey,-79.5743142038811,40.27121039055359,3075 Audrey PA nan
998,PA,Westmoreland,,3079,Audrey,-79.57414191689105,40.27118391184811,3079 Audrey PA nan
999,PA,Westmoreland,,3081,Audrey,-79.57395784490284,40.27119169192235,3081 Audrey PA nan


In [110]:
len(df[df['Zip_Code'] == 'nan']['State'].unique())

33

In [111]:
tdf = df[df['Zip_Code'] != 'nan']
tdf

Unnamed: 0,State,County,Zip_Code,Add_Number,StreetName,Longitude,Latitude,Address
0,AZ,Coconino,86336,95,LYNX,-111.771770637,34.8280627940001,95 LYNX AZ 86336
1,AZ,Coconino,86336,33,EAGLE,-111.769395183,34.828455976,33 EAGLE AZ 86336
2,AZ,Coconino,86336,141,VISTA BONITA,-111.777281707,34.828623958,141 VISTA BONITA AZ 86336
3,AZ,Coconino,86336,109,BADGER,-111.775765885,34.8291778600001,109 BADGER AZ 86336
4,AZ,Coconino,86336,30,FAWN,-111.773383133,34.824991696,30 FAWN AZ 86336
...,...,...,...,...,...,...,...,...
104,PA,Westmoreland,15601.0,605,Farview,-79.52236881714968,40.290077006924825,605 Farview PA 15601.0
105,PA,Westmoreland,15601.0,607,Farview,-79.52241451804144,40.29008212001564,607 Farview PA 15601.0
106,PA,Westmoreland,15601.0,597,Farview,-79.52211734366738,40.290106697381,597 Farview PA 15601.0
553,PA,Westmoreland,15601.0,598,Farview,-79.52401315183624,40.29069422586329,598 Farview PA 15601.0


In [107]:
len(df['State'].unique())

42

In [112]:
len(tdf['State'].unique())

42

In [115]:
tdf.rename(columns={
    'State': 'state', 
    'County': 'county', 
    'Zip_Code': 'zip',
    'Add_Number': 'add_number',
    'StreetName': 'street_name',
    'Longitude': 'longitude',
    'Latitude': 'latitude',
    'Address': 'address',
}, inplace=True)

In [116]:
tdf

Unnamed: 0,state,county,zip,add_number,street_name,longitude,latitude,address
0,AZ,Coconino,86336,95,LYNX,-111.771770637,34.8280627940001,95 LYNX AZ 86336
1,AZ,Coconino,86336,33,EAGLE,-111.769395183,34.828455976,33 EAGLE AZ 86336
2,AZ,Coconino,86336,141,VISTA BONITA,-111.777281707,34.828623958,141 VISTA BONITA AZ 86336
3,AZ,Coconino,86336,109,BADGER,-111.775765885,34.8291778600001,109 BADGER AZ 86336
4,AZ,Coconino,86336,30,FAWN,-111.773383133,34.824991696,30 FAWN AZ 86336
...,...,...,...,...,...,...,...,...
104,PA,Westmoreland,15601.0,605,Farview,-79.52236881714968,40.290077006924825,605 Farview PA 15601.0
105,PA,Westmoreland,15601.0,607,Farview,-79.52241451804144,40.29008212001564,607 Farview PA 15601.0
106,PA,Westmoreland,15601.0,597,Farview,-79.52211734366738,40.290106697381,597 Farview PA 15601.0
553,PA,Westmoreland,15601.0,598,Farview,-79.52401315183624,40.29069422586329,598 Farview PA 15601.0


In [None]:
tdf.to_csv('../data/nad.csv.xz')