In [1]:
import pandas as pd
import io
from tqdm import tqdm
from time import sleep
import numpy as np

In [2]:
filepath = '../data/NAD_r11.txt' # too large to push onto Git, but is part of the download from here: https://nationaladdressdata.s3.amazonaws.com/NAD_r12_TXT.zip

In [3]:
first_line = None

with open(filepath) as f:
    first_line = f.readline()

print(first_line)

OID,State,County,Inc_Muni,Uninc_Comm,Nbrhd_Comm,Post_Comm,Zip_Code,Plus_4,Bulk_Zip,Bulk_Plus4,StN_PreMod,StN_PreDir,StN_PreTyp,StN_PreSep,StreetName,StN_PosTyp,StN_PosDir,StN_PosMod,AddNum_Pre,Add_Number,AddNum_Suf,LandmkPart,LandmkName,Building,Floor,Unit,Room,Addtl_Loc,Milepost,Longitude,Latitude,NatGrid_Coord,GUID,Addr_Type,Placement,Source,AddAuth,UniqWithin,LastUpdate,Effective,Expired



In [4]:
total_lines = sum(1 for line in open(filepath))
print('Total number of lines: %s' % total_lines)

Total number of lines: 67357690


In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
step = 1000
dfs = []

with tqdm(total=total_lines-1) as pbar:
    with open(filepath) as f:
        f.readline() # skip first line
        i = 0
        lines = ''
        for line in f:
            pbar.update(1)
            i += 1
            lines += line
            if i > 1000:

                df = pd.read_csv(io.StringIO(first_line + lines), low_memory=False)
                pdf = df[['State', 'County', 'Zip_Code', 'Add_Number', 'StreetName', 'Longitude', 'Latitude']]
                pdf = pdf.astype(str)                
                pdf['Address'] = pdf['Add_Number'] + ' ' + pdf['StreetName'] + ' ' + pdf['State'] + ' ' + pdf['Zip_Code']
                dfs.append(pdf)
                i = 0
                lines = ''
                

100%|███████████████████████████| 67357689/67357689 [07:08<00:00, 157052.86it/s]


In [7]:
# Column descriptors: https://www.transportation.gov/sites/dot.gov/files/docs/mission/gis/national-address-database/308816/nad-schema-v1.pdf

In [8]:
len(dfs)

67290

In [9]:
df = pd.concat(dfs)

In [10]:
df

Unnamed: 0,State,County,Zip_Code,Add_Number,StreetName,Longitude,Latitude,Address
0,AZ,Coconino,86336,95,LYNX,-111.771770637,34.8280627940001,95 LYNX AZ 86336
1,AZ,Coconino,86336,33,EAGLE,-111.769395183,34.828455976,33 EAGLE AZ 86336
2,AZ,Coconino,86336,141,VISTA BONITA,-111.777281707,34.828623958,141 VISTA BONITA AZ 86336
3,AZ,Coconino,86336,109,BADGER,-111.775765885,34.8291778600001,109 BADGER AZ 86336
4,AZ,Coconino,86336,30,FAWN,-111.773383133,34.824991696,30 FAWN AZ 86336
...,...,...,...,...,...,...,...,...
996,PA,Westmoreland,,3073,Audrey,-79.57448066349987,40.27125643079627,3073 Audrey PA nan
997,PA,Westmoreland,,3075,Audrey,-79.5743142038811,40.27121039055359,3075 Audrey PA nan
998,PA,Westmoreland,,3079,Audrey,-79.57414191689105,40.27118391184811,3079 Audrey PA nan
999,PA,Westmoreland,,3081,Audrey,-79.57395784490284,40.27119169192235,3081 Audrey PA nan


In [11]:
len(df[df['Zip_Code'] == 'nan']['State'].unique())

33

In [19]:
tdf = df[df['Zip_Code'] != 'nan']
tdf

Unnamed: 0,State,County,Zip_Code,Add_Number,StreetName,Longitude,Latitude,Address
0,AZ,Coconino,86336,95,LYNX,-111.771770637,34.8280627940001,95 LYNX AZ 86336
1,AZ,Coconino,86336,33,EAGLE,-111.769395183,34.828455976,33 EAGLE AZ 86336
2,AZ,Coconino,86336,141,VISTA BONITA,-111.777281707,34.828623958,141 VISTA BONITA AZ 86336
3,AZ,Coconino,86336,109,BADGER,-111.775765885,34.8291778600001,109 BADGER AZ 86336
4,AZ,Coconino,86336,30,FAWN,-111.773383133,34.824991696,30 FAWN AZ 86336
...,...,...,...,...,...,...,...,...
104,PA,Westmoreland,15601.0,605,Farview,-79.52236881714968,40.290077006924825,605 Farview PA 15601.0
105,PA,Westmoreland,15601.0,607,Farview,-79.52241451804144,40.29008212001564,607 Farview PA 15601.0
106,PA,Westmoreland,15601.0,597,Farview,-79.52211734366738,40.290106697381,597 Farview PA 15601.0
553,PA,Westmoreland,15601.0,598,Farview,-79.52401315183624,40.29069422586329,598 Farview PA 15601.0


In [26]:
tdf['zip_num'] = tdf['Zip_Code'].apply(lambda x: np.isreal(x)) #

In [29]:
l = []

for zip in tqdm(list(tdf['Zip_Code'])):
    try:
        num = str(int(float(zip)))
    except:
        print(zip)
        num = None
    l.append(num)

  5%|█▍                         | 3456764/62897908 [00:01<00:41, 1416385.87it/s]

IA


  7%|█▊                         | 4361685/62897908 [00:02<00:28, 2024327.34it/s]

5-156


 18%|████▌                     | 11111315/62897908 [00:04<00:18, 2844637.95it/s]

MO
MO
MO
MO
MO


 30%|███████▋                  | 18578491/62897908 [00:07<00:15, 2775249.10it/s]

42718`
9-971
ALBAN
SBURG
42718`


 31%|████████▏                 | 19773362/62897908 [00:07<00:15, 2853055.91it/s]

MD


 33%|████████▌                 | 20653436/62897908 [00:08<00:14, 2824062.37it/s]

RKPT
RKPT
RKPT
RKPT
RKPT
RKPT
RKPT
RKPT
RKPT


 36%|█████████▎                | 22428962/62897908 [00:08<00:14, 2809156.52it/s]

LAMES
TEXAS


 38%|█████████▊                | 23644080/62897908 [00:09<00:13, 2873443.57it/s]

TX


 40%|██████████▍               | 25145571/62897908 [00:09<00:13, 2867491.33it/s]

766HH
766HH
KEENE
KEENE
KEENE
KEENE
MABAN
KAUFM
<Null
TERRE
MABAN
KEMP
78/60
78/60


 41%|██████████▊               | 26037725/62897908 [00:09<00:12, 2953733.63it/s]

TILDE
SUNSE
NOCON
AZLE
AZLE
AZLE
AZLE
AZLE
AZLE
AZLE
AZLE
AZLE
AZLE
AZLE
AZLE
AZLE
AZLE
AZLE
AZLE
AZLE
AZLE
AZLE
AZLE
AZLE
MILLS
AZLE
AZLE
WHITT
ALEDO
ALEDO
ALEDO
TX


 42%|███████████               | 26628097/62897908 [00:10<00:12, 2866715.07it/s]

TX
GILME
EDGEW


 44%|███████████▍              | 27797773/62897908 [00:10<00:12, 2875303.21it/s]

WINK
WINK
WISE
BOYD
BRIDG
PARAD
PARAD
QUITM
LOVIN
TEXAS


 67%|█████████████████▎        | 41990926/62897908 [00:15<00:07, 2853127.88it/s]

**!)!
Dulce
C


 68%|█████████████████▋        | 42849011/62897908 [00:15<00:07, 2813406.25it/s]

C
NM
NM
NM
87OO6
VABEL
VABEL
VABEL
VABEL
VABEL
VABEL
VABEL
VABEL
VABEL
VABEL
VABEL
VABEL
VABEL
VABEL
VABEL
VABEL
VABEL
VABEL
VABEL
VABEL
VABEL
VABEL
VABEL
VABEL


 79%|████████████████████▍     | 49409181/62897908 [00:18<00:04, 2870718.85it/s]

RD
RD
MAPLE
RD
Y
Y
TN
LAKE
ROGER
FIRE
383/5


 80%|████████████████████▊     | 50297545/62897908 [00:18<00:04, 2841668.26it/s]

CHURC
TN
TN
38-06


 81%|█████████████████████▏    | 51183303/62897908 [00:18<00:04, 2869461.28it/s]

38-68
TN
TN
TN
383 5
37-58


 82%|█████████████████████▍    | 51761288/62897908 [00:18<00:03, 2847429.65it/s]

NONE
NONE
NONE
NONE
NONE
NONE
NONE
NONE
NONE
NONE
NONE
NONE
NONE
NONE
NONE
NONE
NONE
NONE
NONE
SOMME
TN
-
-
RD
ST
LOBEL
Q
KINGS
ZIP
37/82
378/2


 83%|█████████████████████▋    | 52358586/62897908 [00:19<00:03, 2897366.91it/s]

385O6
TN
373O3
TN
TN
TN
CLAIB
VILLA
NEW T


 99%|█████████████████████████▋| 62256396/62897908 [00:22<00:00, 2852570.04it/s]

Acme
Acme
Acme
Acme


100%|██████████████████████████| 62897908/62897908 [00:22<00:00, 2756049.59it/s]


OUT
OUT


In [30]:
tdf['Zip_Code'] = l

In [32]:
tdf = tdf[tdf['Zip_Code']!= None] # removing all incorrect zip codes from the NAD

In [33]:
tdf['Address'] = tdf['Add_Number'] + ' ' + tdf['StreetName'] + ' ' + tdf['State'] + ' ' + tdf['Zip_Code']

In [None]:
len(df['State'].unique())

In [None]:
len(tdf['State'].unique())

In [34]:
tdf.rename(columns={
    'State': 'state', 
    'County': 'county', 
    'Zip_Code': 'zip',
    'Add_Number': 'add_number',
    'StreetName': 'street_name',
    'Longitude': 'longitude',
    'Latitude': 'latitude',
    'Address': 'address',
}, inplace=True)

In [36]:
tdf

Unnamed: 0,state,county,zip,add_number,street_name,longitude,latitude,address,zip_num
0,AZ,Coconino,86336,95,LYNX,-111.771770637,34.8280627940001,95 LYNX AZ 86336,False
1,AZ,Coconino,86336,33,EAGLE,-111.769395183,34.828455976,33 EAGLE AZ 86336,False
2,AZ,Coconino,86336,141,VISTA BONITA,-111.777281707,34.828623958,141 VISTA BONITA AZ 86336,False
3,AZ,Coconino,86336,109,BADGER,-111.775765885,34.8291778600001,109 BADGER AZ 86336,False
4,AZ,Coconino,86336,30,FAWN,-111.773383133,34.824991696,30 FAWN AZ 86336,False
...,...,...,...,...,...,...,...,...,...
104,PA,Westmoreland,15601,605,Farview,-79.52236881714968,40.290077006924825,605 Farview PA 15601,False
105,PA,Westmoreland,15601,607,Farview,-79.52241451804144,40.29008212001564,607 Farview PA 15601,False
106,PA,Westmoreland,15601,597,Farview,-79.52211734366738,40.290106697381,597 Farview PA 15601,False
553,PA,Westmoreland,15601,598,Farview,-79.52401315183624,40.29069422586329,598 Farview PA 15601,False


In [2]:
# tdf = pd.read_csv('../data/nad.csv.xz')

In [3]:
del tdf['zip_num']

In [6]:
counties = list(tdf['county'].unique())

In [8]:
for county in tqdm(counties):
    pdf = tdf[tdf['county'] == county]
    state = list(pdf['state'].unique())[0]
    pdf.to_csv('../data/%s_%s.csv.xz' % (state, county), index=False)

100%|█████████████████████████████████████| 1082/1082 [3:03:55<00:00, 10.20s/it]
