In [3]:
import json
import os
import re
import time

import bs4
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

from collections import defaultdict
# import jupyterthemes as jt 
# from jupyterthemes.stylefx import set_nb_theme
# from jupyterthemes import get_themes 

%matplotlib inline
pd.options.display.max_columns = 99
os.chdir("/Users/yenchenchou/Documents/RMDS_YC/RiskScore/RMDS_COVID19_riskgenerator")

In [4]:
class ZipCommunityMapper:
    """Build a ZIP code -> community name lookup from a scraped HTML table.

    The pipeline (see ``get_mapper``) downloads a page, reads its ``<td>``
    cells as alternating (community, ZIP codes) pairs, normalises the
    community names against a reference CSV, inverts the pairs into a
    ZIP -> communities table and finally applies manual corrections read
    from a JSON file.

    Parameters
    ----------
    reference_csv : str
        CSV file with a ``geo_merge`` column holding the reference
        community names.
    correction_json : str
        JSON file mapping ZIP codes (as string keys) to the community name
        that should override the scraped result.

    Attributes
    ----------
    obj : the last HTTP response returned by ``requests.get`` (or None).
    community_ls : list of community names, one per scraped table row.
    zipcode_ls : list parallel to ``community_ls``; raw strings after the
        scrape, lists of ints after ``_clean_postal``.
    com_zip_dict : ZIP code -> collection of community names.
    final_dict : ZIP code -> single community name (the final result).
    """

    def __init__(
        self,
        reference_csv="data/LA_County_Covid19_CSA_testing_table.csv",
        correction_json="data/zipcode_correction.json",
    ):
        self.reference_csv = reference_csv
        self.correction_json = correction_json
        self.obj = None
        self.community_ls = list()
        self.zipcode_ls = list()
        self.com_zip_dict = defaultdict(set)
        self.final_dict = dict()

    def _get_reference_community(self):
        """Return the reference community names with their prefixes removed.

        Reads the ``geo_merge`` column of ``self.reference_csv`` and strips
        the "City of ", "Los Angeles - " and "Unincorporated - " prefixes.
        """
        community_names = pd.read_csv(self.reference_csv, usecols=["geo_merge"])
        return [
            re.sub("(^City.of.|Los Angeles - |Unincorporated - )", "", val)
            for val in community_names["geo_merge"]
        ]

    def _get_url_connect(self, url, timeout=30):
        """Download ``url`` and store the response in ``self.obj``.

        Parameters
        ----------
        url : str
            Page to download.
        timeout : float
            Seconds to wait for the server before giving up.

        Raises
        ------
        Exception
            "Client Error" for a 4xx status, "Server Error" for a 5xx
            status, and "Not Client or Server Error, please update the
            code" when the request itself fails (the underlying requests
            exception is chained as the cause).
        """
        try:
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as err:
            raise Exception(
                "Not Client or Server Error, please update the code") from err

        # Keep the response available for inspection even when it is an error.
        self.obj = response
        # requests.get does not raise on HTTP error statuses, so they have to
        # be checked explicitly on the returned response.
        if 400 <= response.status_code <= 499:
            raise Exception("Client Error")
        if 500 <= response.status_code <= 599:
            raise Exception("Server Error")

    def _get_postal_community(self, url):
        """Scrape ``url`` into ``self.community_ls`` and ``self.zipcode_ls``.

        The ``<td>`` cells are read pairwise: the first cell of each pair is
        the community name, the second holds the ZIP codes (any text from
        the first "(" onwards is dropped). Both lists are rebuilt from
        scratch on every call, so scraping twice does not duplicate rows.
        A trailing unpaired cell is ignored.
        """
        self._get_url_connect(url)
        web_content = bs4.BeautifulSoup(self.obj.text, "html.parser")
        cells = web_content.find_all("td")

        communities = []
        zipcodes = []
        for name_cell, zip_cell in zip(cells[0::2], cells[1::2]):
            communities.append(name_cell.text.strip())
            zipcodes.append(zip_cell.text.split("(")[0].strip())
        self.community_ls = communities
        self.zipcode_ls = zipcodes

    def _clean_community(self):
        """Replace each scraped community name by its reference name.

        The scraped name is first normalised with a fixed sequence of regex
        substitutions; the first reference name that occurs as a substring
        of the normalised name is kept. Names without any match become the
        placeholder "missing".
        """
        reference_com = self._get_reference_community()

        normalised = list()
        for val in self.community_ls:
            v = re.sub(r"(^Los.Angeles.|\(Los Angeles\)|PO Boxes|\/.*)", "", val.strip())
            v = re.sub(r"(^Pasadena.*)", "Pasadena", v)
            v = re.sub(r"(^Alhambra.*)", "Alhambra", v)
            v = re.sub(r"(^Downtown.*)", "Downtown", v)
            v = re.sub(r"(.*Long Beach.*)", "Long Beach", v)
            v = re.sub(r"(Santa Clarita )", "", v)
            v = re.sub(r"(\(|\))", "", v.strip())
            normalised.append(v)

        self.community_ls = [
            next((ref for ref in reference_com if ref in name), "missing")
            for name in normalised
        ]

    def _clean_postal(self):
        """Convert each comma-separated ZIP string into a list of ints.

        Blank tokens (for example from a trailing comma or an empty cell)
        are skipped; any other non-numeric token still raises ValueError.
        """
        cleaned = list()
        for zip_sublist in self.zipcode_ls:
            tokens = (token.strip() for token in zip_sublist.split(","))
            cleaned.append([int(token) for token in tokens if token])
        self.zipcode_ls = cleaned

    def _init_mapper(self):
        """Invert the parallel lists into ZIP code -> set of communities.

        The table is rebuilt from scratch so the method can be re-run after
        ``_clean_mapper`` has replaced it with a plain dict.
        """
        self.com_zip_dict = defaultdict(set)
        for community, zip_sub_ls in zip(self.community_ls, self.zipcode_ls):
            for zipcode in zip_sub_ls:
                self.com_zip_dict[zipcode].add(community)

    def _clean_mapper(self):
        """Turn each community set into a sorted list without "missing".

        Sorting makes the later "take the first community" choice
        reproducible: iteration order of a set of strings can change between
        interpreter sessions. The "missing" placeholder is dropped whenever
        a ZIP code also has at least one real community.
        """
        map_table = dict()
        for zipcode, communities in self.com_zip_dict.items():
            names = sorted(communities)
            if "missing" in names and len(names) > 1:
                names.remove("missing")
            map_table[zipcode] = names
        self.com_zip_dict = map_table

    def _correct_mapper(self):
        """Build ``self.final_dict`` with one community per ZIP code.

        A ZIP code listed in ``self.correction_json`` gets the corrected
        name; otherwise the first community of its cleaned list is used.
        JSON object keys are strings, hence the ``str(zipcode)`` lookup.
        """
        with open(self.correction_json) as json_file:
            correction_data = json.load(json_file)

        self.final_dict = {
            zipcode: correction_data.get(str(zipcode), communities[0])
            for zipcode, communities in self.com_zip_dict.items()
        }

    def get_mapper(self, url):
        """Run the whole pipeline for ``url`` and return ZIP -> community.

        Returns
        -------
        dict
            Maps each ZIP code (int) to a community name (str); "missing"
            marks ZIP codes for which no community could be resolved.
        """
        self._get_postal_community(url)
        self._clean_postal()
        self._clean_community()
        self._init_mapper()
        self._clean_mapper()
        self._correct_mapper()

        return self.final_dict

In [5]:
url = "http://www.laalmanac.com/communications/cm02_communities.php"
zip_community_mapper = ZipCommunityMapper()
# get_mapper() calls _get_postal_community(url) as its first step, so a
# separate call beforehand only downloaded the page a second time and
# appended every table row to the mapper's lists twice.
map_table = zip_community_mapper.get_mapper(url)

In [7]:
# Persist the ZIP code -> community table as JSON (integer keys are written
# as strings because JSON object keys must be strings).
with open("data/zipcode_mapper.json", "w") as mapper_file:
    mapper_file.write(json.dumps(map_table))

In [None]:
# final_dict maps each ZIP code to a single community name (a string), so
# list-style clean-up such as val.remove("missing") cannot work here: it
# raises AttributeError on the first unresolved entry. Instead, report the
# ZIP codes that are still unresolved so they can be added to the manual
# correction table.
for zipcode, community in zip_community_mapper.final_dict.items():
    if community == "missing":
        print(zipcode, community)

In [None]:
# Manual overrides: ZIP code -> community name that replaces whatever the
# scraped table produced for that ZIP code.
zipcode_correction = {
    90278: "Redondo Beach", 91350: "Santa Clarita",
    91101: "Pasadena", 91103: "Pasadena", 91104: "Pasadena",
    91105: "Pasadena", 91106: "Pasadena", 91107: "Pasadena",
    91123: "Pasadena", 91129: "Pasadena", 91125: "Pasadena",
    91126: "Pasadena", 91199: "Pasadena", 91182: "Pasadena",
    91189: "Pasadena", 91188: "Pasadena", 91185: "Pasadena",
    91102: "Pasadena", 91109: "Pasadena", 91114: "Pasadena",
    91115: "Pasadena", 91116: "Pasadena", 91117: "Pasadena",
    91124: "Pasadena", 91110: "Pasadena", 91184: "Pasadena",
    91355: "Valencia", 91380: "Santa Clarita", 91385: "Santa Clarita",
    90275: "Rancho Palos Verdes", 90277: "Redondo Beach",
    91744: "La Puente", 91748: "Rowland Heights", 90304: "Lennox",
    93543: "Sun Village", 93550: "Palmdale", 93591: "Lake Los Angeles",
    93586: "Quartz Hill", 91405: "Van Nuys", 90292: "Marina Peninsula",
    91322: "Santa Clarita", 91321: "Santa Clarita", 90274: "Marina del Rey",
    91789: "Walnut", 90017: "South Park", 90604: "Whittier",
    93532: "Lake Hughes", 90005: "Koreatown", 90020: "Hancock Park",
    93535: "Hi Vista", 90069: "West Hollywood", 90255: "Huntington Park",
    90086: "Chinatown", 93551: "Leona Valley", 90220: "Compton",
    90221: "Compton", 90222: "Rosewood", 90224: "Compton",
    90066: "Mar Vista", 93536: "Del Sur", 90601: "Whittier",
    91390: "Agua Dulce", 90704: "Santa Catalina Island", 91746: "Bassett",
    91010: "Duarte", 91702: "Azusa", 91706: "Baldwin Park",
    # 91386 and 91351 used to be listed twice ("Canyon Country", then
    # "Santa Clarita"). A dict literal keeps only the last value for a
    # repeated key, so the effective value "Santa Clarita" is stated once.
    90201: "Cudahy", 90270: "Maywood", 91386: "Santa Clarita",
    91351: "Santa Clarita", 91303: "Canoga Park",
    91387: "Santa Clarita", 91354: "Valencia",
    91331: "Pacoima", 90019: "Mid-City", 90044: "South Los Angeles",
    90039: "Atwater Village", 90008: "Baldwin Hills", 90049: "Brentwood",
    90077: "Bel Air Estates", 91307: "West Hills", 91304: "West Hills",
    90006: "Koreatown", 91302: "Hidden Hills", 90272: "Pacific Palisades",
    90064: "Rancho Park", 90012: "Chinatown", 90087: "Chinatown",
    90065: "Glassell Park", 90014: "Downtown Historic Core",
    # NOTE(review): 900017 has six digits and can never match a 5-digit ZIP
    # code. It may be a typo for 90017, which is already mapped to
    # "South Park" above -- confirm which name is intended.
    900017: "Downtown City West", 90071: "Downtown Bunker Hill",
    90013: "Central", 90015: "South Park", 90026: "Echo Park",
    90032: "El Sereno", 90036: "Melrose", 90001: "Florence-Firestone",
    90247: "Gardena", 90248: "Gardena", 90027: "Los Feliz",
    90004: "Hancock Park", 90501: "Torrance", 90502: "Harbor Gateway",
    90046: "West Hollywood", 90096: "West Hollywood",
    90043: "Hyde Park", 90018: "Jefferson Park", 91342: "Sylmar",
    91352: "Sun Valley", 91406: "Lake Balboa", 90045: "Westchester",
    90048: "West Hollywood", 91340: "San Fernando", 91601: "Toluca Terrace",
    91602: "Toluca Lake", 91606: "North Hollywood", 91607: "Valley Village",
    91326: "Porter Ranch", 91371: "Woodland Hills", 90731: "San Pedro",
    90732: "Rancho Palos Verdes", 90025: "San Pedro", 91040: "Shadow Hills",
    91403: "Sherman Oaks", 91423: "Sherman Oaks", 91413: "Sherman Oaks",
    90037: "South Los Angeles", 90002: "Watts", 90003: "South Los Angeles",
    90058: "Vernon", 91604: "Studio City", 91610: "Toluca Lake",
    91401: "Valley Glen", 90059: "Willowbrook", 90010: "Wilshire Center",
}

# Save the manual corrections as JSON. This is the file that
# ZipCommunityMapper._correct_mapper loads; the integer ZIP keys are written
# as strings, which is why that method looks entries up with str(key).
with open("data/zipcode_correction.json", "w") as correction_file:
    correction_file.write(json.dumps(zipcode_correction))