In [1]:
import pandas as pd
import numpy as np
from uszipcode import SearchEngine, SimpleZipcode, Zipcode

# Create the uszipcode lookup engine; a later cell reuses `search` for bulk lookups.
search = SearchEngine()
# Smoke test: look up one ZIP code and inspect a couple of fields on the result.
zipcode = search.by_zipcode(10030)
zipcode.zipcode # access attributes
# Last expression of the cell, so its value is what the notebook displays.
zipcode.state

'NY'

In [2]:
# Names of every ZIP-code column: DC_ZIPCODE first, followed by the ten
# Monthly_Top_<n>_Customer_Zip columns for n = 1..10.
zip_columns = ["DC_ZIPCODE", *(f"Monthly_Top_{rank}_Customer_Zip" for rank in range(1, 11))]

### Build a map from ZIP codes to states

In [3]:
# Read data/X_train.csv, using its first column as the DataFrame index.
train = pd.read_csv("data/X_train.csv", index_col=0)

  mask |= (ar1 == a)


In [4]:
# Read data/X_test.csv, using its first column as the DataFrame index.
test = pd.read_csv("data/X_test.csv", index_col=0)

In [16]:
# Preview only the ZIP-code columns of the first few training rows.
train.head().loc[:, zip_columns]

Unnamed: 0,DC_ZIPCODE,Monthly_Top_1_Customer_Zip,Monthly_Top_2_Customer_Zip,Monthly_Top_3_Customer_Zip,Monthly_Top_4_Customer_Zip,Monthly_Top_5_Customer_Zip,Monthly_Top_6_Customer_Zip,Monthly_Top_7_Customer_Zip,Monthly_Top_8_Customer_Zip,Monthly_Top_9_Customer_Zip,Monthly_Top_10_Customer_Zip
0,11717,11101.0,11756.0,11520.0,11746.0,11358.0,11368.0,11223.0,11203.0,11580.0,11901.0
1,11717,11101.0,11756.0,11520.0,11746.0,11358.0,11223.0,11735.0,11229.0,11368.0,11230.0
2,11717,11101.0,11756.0,11520.0,11746.0,11358.0,11223.0,11735.0,11229.0,11368.0,11230.0
3,11717,11520.0,11101.0,11746.0,11756.0,11358.0,11223.0,11520.0,11229.0,11230.0,10801.0
4,11717,11101.0,11520.0,11746.0,11368.0,11358.0,11756.0,11223.0,11580.0,11229.0,11230.0


In [6]:
# Distinct values of every ZIP column: all train columns first, then all test
# columns (same order as building the two lists separately and concatenating).
zipsets = [set(frame[column]) for frame in (train, test) for column in zip_columns]

# Keep only non-missing values inside 601..99950 (presumably the span of
# assigned US ZIP codes -- TODO confirm). Casting to int collapses float
# values such as 11717.0 onto the integer key 11717.
all_zips = {
    int(code)
    for codes in zipsets
    for code in codes
    if not np.isnan(code) and 601 <= code <= 99950
}

# Map each ZIP code to its state abbreviation. Depending on the uszipcode
# version, by_zipcode may return None for a code it does not know; getattr's
# default keeps such a code in the map with a None state instead of raising
# AttributeError. Downstream lookups already turn a None state into NaN.
zip_states = {
    code: getattr(search.by_zipcode(code), "state", None)
    for code in all_zips
}

In [7]:
# Number of distinct ZIP codes that ended up in the lookup table.
len(zip_states)

2014

In [8]:
# Column-oriented view of the lookup: keys and values of a dict iterate in the
# same order, so row i of 'zip' lines up with row i of 'state'.
data = {
    'zip': list(zip_states),
    'state': list(zip_states.values()),
}
zip_df = pd.DataFrame(data)

In [11]:
# Persist the ZIP-to-state table. With to_csv defaults the row index is
# written too, as an unnamed first column.
zip_df.to_csv("zip_states.csv")
# Display the table as the cell output.
zip_df

Unnamed: 0,zip,state
0,65536,MO
1,40962,KY
2,32773,FL
3,57350,SD
4,32778,FL
5,24592,VA
6,32796,FL
7,24605,VA
8,32807,FL
9,32809,FL


### Augment the data with states

In [56]:
# Dry run of the state mapping on the first 200 rows only.
chunk = pd.read_csv("data/train_augmented_2.csv", index_col=0, nrows=200)

# DC_ZIPCODE is left intact and its state goes into a new DC_STATE column;
# every customer ZIP column is overwritten in place with the state.
column_pairs = [("DC_ZIPCODE", "DC_STATE")] + [(name, name) for name in zip_columns[1:]]
for source, column in column_pairs:
    as_float = chunk[source].astype('float64')
    # Codes missing from zip_states (including NaN) or with no state become NaN.
    chunk[column] = as_float.map(lambda code: zip_states.get(code) or np.nan)

chunk[zip_columns]

Unnamed: 0,DC_ZIPCODE,Monthly_Top_1_Customer_Zip,Monthly_Top_2_Customer_Zip,Monthly_Top_3_Customer_Zip,Monthly_Top_4_Customer_Zip,Monthly_Top_5_Customer_Zip,Monthly_Top_6_Customer_Zip,Monthly_Top_7_Customer_Zip,Monthly_Top_8_Customer_Zip,Monthly_Top_9_Customer_Zip,Monthly_Top_10_Customer_Zip
0,11717,NY,NY,NY,NY,NY,NY,NY,NY,NY,NY
1,11717,NY,NY,NY,NY,NY,NY,NY,NY,NY,NY
2,11717,NY,NY,NY,NY,NY,NY,NY,NY,NY,NY
3,11717,NY,NY,NY,NY,NY,NY,NY,NY,NY,NY
4,11717,NY,NY,NY,NY,NY,NY,NY,NY,NY,NY
5,11717,NY,NY,NY,NY,NY,NY,NY,NY,NY,NY
6,11717,NY,NY,NY,NY,NY,NY,NY,NY,NY,NY
7,11717,NY,NY,NY,NY,NY,NY,NY,NY,NY,NY
8,11717,NY,NY,NY,NY,NY,NY,NY,NY,NY,NY
9,11717,NY,NY,NY,NY,NY,NY,NY,NY,NY,NY


In [58]:
out_file = "data/train_augmented_3.csv"


def _zip_to_state(code):
    """Return the state for `code` from zip_states, or NaN when the code is
    missing from the table (NaN input included) or has no state recorded."""
    return zip_states.get(code) or np.nan


# Stream the input in chunks of 100,000 rows and write each chunk to the
# output as soon as it is processed.
#
# Mode 'w' truncates any previous output on open, so a separate
# truncate-then-append step is unnecessary. newline='' hands line-ending
# control to pandas, which is what to_csv expects for an open file object
# (otherwise Windows output gets doubled line endings).
with open(out_file, 'w', newline='') as f:
    reader = pd.read_csv("data/train_augmented_2.csv", index_col=0, chunksize=10 ** 5)
    for chunk_number, chunk in enumerate(reader):
        # DC_ZIPCODE is kept; its state is added as a new DC_STATE column.
        chunk["DC_STATE"] = chunk["DC_ZIPCODE"].astype('float64').map(_zip_to_state)
        # Customer ZIP columns are replaced in place by their states.
        for column in zip_columns[1:]:
            chunk[column] = chunk[column].astype('float64').map(_zip_to_state)
        # Only the first chunk carries the CSV header row.
        chunk.to_csv(f, header=(chunk_number == 0))