In [1]:
import os
from pathlib import Path

import pandas as pd
import pickle
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from pydantic import BaseModel
from enum import Enum, auto
from tqdm import tqdm

from src.constants import PROJECT_ROOT_PATH
from src.ocr import EngineType
from src.region_processing import process_region_segments_ocr, process_region_segments_checkbox

# Limit pandas reprs to 20 rows so printed/displayed frames stay compact.
pd.set_option('display.max_rows', 20)

In [2]:
class SegmentType(str, Enum):
    """Kind of processing applied to a segment folder.

    The driver loop uses this value to choose between the OCR and the
    checkbox region-processing routines. Members are also plain strings
    because the class mixes in ``str``.
    """
    OCR = "ocr"
    CHECKBOX = "checkbox"


class ProcessSegmentParams(BaseModel):
    """Settings for one processing pass over a segment folder."""
    # Folder handed to the region-processing routine; its final path component
    # is also used as the label for this pass in the driver loop.
    base_path: Path
    # Selects which region-processing routine handles this folder.
    segment_type: SegmentType
    # Forwarded unchanged as `skip_folders` to the processing routine
    # (presumably sub-folder names to ignore -- confirm in src.region_processing).
    skip_folders: list[str]
    # Forwarded unchanged as `default_fillna` to the processing routine.
    default_fillna: str = 'SCAN ERROR / NO PREDICTION'


In [3]:
# Read the curated spreadsheet into a DataFrame; per-segment results are
# merged onto this frame further down.
all_data_path = PROJECT_ROOT_PATH / "data/WPA_Bunker_Hill_Curated.xlsx"
all_data = pd.read_excel(all_data_path)

In [4]:
# Path of the checkbox model file handed to the checkbox processing routine.
classifier = PROJECT_ROOT_PATH / 'static/checkbox_model_2.pkl'

_CHECKBOX, _OCR = SegmentType.CHECKBOX, SegmentType.OCR

# One row per processing pass: (segment folder, segment type, keyword overrides).
# Rows are processed in this order; `skip_folders` defaults to an empty list and
# `default_fillna` to the model default unless overridden here.
_segment_specs = [
    ("heating", _CHECKBOX, {}),
    ("race", _CHECKBOX, {}),
    ("rent", _OCR, {}),
    ("roomers", _OCR, {}),
    # business_units is processed twice: once as a checkbox, once via OCR.
    ("business_units", _CHECKBOX, {"skip_folders": ["num"], "default_fillna": "yes"}),
    ("business_units", _OCR, {"skip_folders": ["none"], "default_fillna": "none"}),
    ("basement", _CHECKBOX, {}),
    ("bath", _OCR, {}),
    ("condition", _CHECKBOX, {}),
    ("converted", _OCR, {}),
    ("cooking", _CHECKBOX, {}),
    ("duration", _OCR, {}),
    ("encumbrance", _CHECKBOX, {"default_fillna": ""}),
    ("ext_material", _CHECKBOX, {}),
    ("extra_families", _OCR, {}),
    ("garage", _CHECKBOX, {}),
    ("included_in_rent", _CHECKBOX, {}),
    ("lighting", _CHECKBOX, {}),
    ("major_structures", _OCR, {}),
    ("number_and_age", _OCR, {}),
    ("occupancy", _CHECKBOX, {}),
    # ("office", _OCR, {}),  # disabled
    ("refrig", _CHECKBOX, {}),
    ("stories", _OCR, {}),
    ("structure", _CHECKBOX, {"default_fillna": "other"}),
    ("structure_text", _OCR, {}),
    ("toilets", _OCR, {}),
    ("total_rooms", _OCR, {}),
    ("value", _OCR, {}),
    ("water", _CHECKBOX, {}),
    ("year_built", _OCR, {}),
]

segment_params = [
    ProcessSegmentParams(
        base_path=PROJECT_ROOT_PATH / f"data/06_Segments/{folder}",
        segment_type=kind,
        # A fresh empty list per entry unless the spec overrides skip_folders.
        **{"skip_folders": [], **overrides},
    )
    for folder, kind, overrides in _segment_specs
]

In [5]:
def merge_into_all_data_and_save(all_data, df):
    """Left-join one segment's results onto ``all_data`` by numeric record id.

    The id is parsed from each index label of ``df``, which is expected to
    look like ``<prefix>_<digits>.<ext>`` (e.g. ``p15799coll8_75836.jpeg``).
    Despite its name, this function does not write anything to disk; the
    caller is responsible for saving the returned frame.

    Args:
        all_data: Frame containing a ``match_ids`` column to join on.
        df: Segment results indexed by image file name. It is not modified.

    Returns:
        A new DataFrame: ``all_data`` with the columns of ``df`` attached
        (``how='left'``, so every row of ``all_data`` is kept).
    """
    # Work on a copy so the caller's frame keeps its index and columns.
    segment_df = df.copy()
    segment_df.index = segment_df.index.astype(str)

    # Raw string: "\d", "\." and "\w" are invalid escapes in a plain literal.
    ids = segment_df.index.str.extract(r'_(\d+)\.\w+$')[0].to_numpy()

    # Drop rows without a parsable id. Keeping them with a NaN key would make
    # pandas join them to every all_data row whose key is also NaN, because
    # merge treats null keys as equal to each other.
    has_id = pd.notna(ids)
    segment_df = segment_df.loc[has_id].copy()
    segment_df['match_ids'] = ids[has_id].astype(int)

    print(segment_df.head())

    merged_df = pd.merge(all_data, segment_df, on='match_ids', how='left')
    return merged_df


# Build the join key for the curated sheet from the trailing "<digits>.jp2" of
# each CONTENTdm path. Raw string: "\d" and "\." are invalid escape sequences
# in a plain string literal.
extracted_ids = all_data['CONTENTdm file path'].str.extract(r'\/(\d+)\.jp2$')[0]
match_id_series = pd.Series(data=extracted_ids.values, index=all_data.index)
match_id_series.dropna(inplace=True)
# Index-aligned assignment: rows whose path did not match the pattern get NaN.
all_data['match_ids'] = match_id_series.astype(int)

# NOTE: results are keyed by folder name, so a folder listed more than once in
# segment_params keeps only its last result in this dict.
segment_results = {}
for params in tqdm(segment_params):
    if params.segment_type == SegmentType.CHECKBOX:
        result_df = process_region_segments_checkbox(params.base_path, classifier, skip_folders=params.skip_folders, default_fillna=params.default_fillna)
    elif params.segment_type == SegmentType.OCR:
        result_df = process_region_segments_ocr(params.base_path, EngineType.PADDLEOCR, skip_folders=params.skip_folders, default_fillna=params.default_fillna)
    else:
        # Fail loudly instead of silently reusing the previous iteration's result.
        raise ValueError(f"Unsupported segment type: {params.segment_type!r}")
    segment_results[params.base_path.name] = result_df
    print(f"{params.base_path.name} segment processed data head:")
    print(result_df.head())

    all_data = merge_into_all_data_and_save(all_data, result_df)

    # Checkpoint after every segment so a failure late in the run does not
    # discard the segments already processed.
    output_path = PROJECT_ROOT_PATH / "data/WPA_Bunker_Hill_Curated_Updated.xlsx"
    all_data.to_excel(output_path, index=False)
    print(f"Data saved to {output_path}")

# The join key is a working column only; remove it before the final export.
all_data = all_data.drop(columns=['match_ids'])
output_path = PROJECT_ROOT_PATH / "data/WPA_Bunker_Hill_Curated_Final.xlsx"
all_data.to_excel(output_path, index=False)
print(f"Final data saved to {output_path}")

    


  0%|          | 0/30 [00:00<?, ?it/s]

Processing images: 100%|██████████| 8716/8716 [00:06<00:00, 1352.99it/s]


heating segment processed data head:
                                           heating
p15799coll8_75836.jpeg     cent_steam_or_hot_water
p15799coll8_79403.jpeg                        none
p15799coll8_13025.jpeg                        none
p15799coll8_92298.jpeg     cent_steam_or_hot_water
p15799coll8_44403.jpg   SCAN ERROR / NO PREDICTION
                                           heating  match_ids
p15799coll8_75836.jpeg     cent_steam_or_hot_water      75836
p15799coll8_79403.jpeg                        none      79403
p15799coll8_13025.jpeg                        none      13025
p15799coll8_92298.jpeg     cent_steam_or_hot_water      92298
p15799coll8_44403.jpg   SCAN ERROR / NO PREDICTION      44403


  3%|▎         | 1/30 [00:10<05:17, 10.95s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing images: 100%|██████████| 10697/10697 [00:06<00:00, 1677.35it/s]


race segment processed data head:
                         race
p15799coll8_3710.jpeg   white
p15799coll8_79403.jpeg  white
p15799coll8_13025.jpeg  white
p15799coll8_92298.jpeg  white
p15799coll8_44403.jpg   white
                         race  match_ids
p15799coll8_3710.jpeg   white       3710
p15799coll8_79403.jpeg  white      79403
p15799coll8_13025.jpeg  white      13025
p15799coll8_92298.jpeg  white      92298
p15799coll8_44403.jpg   white      44403


  7%|▋         | 2/30 [00:22<05:10, 11.11s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing OCR: 100%|██████████| 2179/2179 [08:32<00:00,  4.25it/s]


rent segment processed data head:
                       rent.num
p15799coll8_30529.jpeg       23
p15799coll8_75836.jpeg      100
p15799coll8_14426.jpeg      900
p15799coll8_3710.jpeg      4500
p15799coll8_49866.jpg      1700
                       rent.num  match_ids
p15799coll8_30529.jpeg       23      30529
p15799coll8_75836.jpeg      100      75836
p15799coll8_14426.jpeg      900      14426
p15799coll8_3710.jpeg      4500       3710
p15799coll8_49866.jpg      1700      49866


 10%|█         | 3/30 [08:58<1:48:52, 241.95s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing OCR: 100%|██████████| 2179/2179 [09:00<00:00,  4.03it/s]


roomers segment processed data head:
                       roomers.roomers
p15799coll8_14426.jpeg              -0
p15799coll8_75836.jpeg               0
p15799coll8_3710.jpeg               -0
p15799coll8_49866.jpg                0
p15799coll8_3655.jpeg            MULES
                       roomers.roomers  match_ids
p15799coll8_14426.jpeg              -0      14426
p15799coll8_75836.jpeg               0      75836
p15799coll8_3710.jpeg               -0       3710
p15799coll8_49866.jpg                0      49866
p15799coll8_3655.jpeg            MULES       3655


 13%|█▎        | 4/30 [18:03<2:36:38, 361.48s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing images: 100%|██████████| 2179/2179 [00:05<00:00, 426.35it/s] 


business_units segment processed data head:
                       business_units
p15799coll8_49866.jpg            none
p15799coll8_75836.jpeg           none
p15799coll8_3655.jpeg             yes
p15799coll8_30529.jpeg           none
p15799coll8_75973.jpeg           none
                       business_units  match_ids
p15799coll8_49866.jpg            none      49866
p15799coll8_75836.jpeg           none      75836
p15799coll8_3655.jpeg             yes       3655
p15799coll8_30529.jpeg           none      30529
p15799coll8_75973.jpeg           none      75973


 17%|█▋        | 5/30 [18:12<1:37:43, 234.53s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing OCR: 100%|██████████| 2179/2179 [08:59<00:00,  4.04it/s]


business_units segment processed data head:
                       business_units.num
p15799coll8_3710.jpeg                   -
p15799coll8_14426.jpeg                  -
p15799coll8_75836.jpeg                  -
p15799coll8_30529.jpeg                  -
p15799coll8_3655.jpeg                   -
                       business_units.num  match_ids
p15799coll8_3710.jpeg                   -       3710
p15799coll8_14426.jpeg                  -      14426
p15799coll8_75836.jpeg                  -      75836
p15799coll8_30529.jpeg                  -      30529
p15799coll8_3655.jpeg                   -       3655


 20%|██        | 6/30 [27:17<2:15:55, 339.83s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing images: 100%|██████████| 4358/4358 [00:05<00:00, 731.16it/s] 


basement segment processed data head:
                       basement
p15799coll8_30529.jpeg       no
p15799coll8_75973.jpeg       no
p15799coll8_14426.jpeg       no
p15799coll8_87649.jpeg       no
p15799coll8_75836.jpeg       no
                       basement  match_ids
p15799coll8_30529.jpeg       no      30529
p15799coll8_75973.jpeg       no      75973
p15799coll8_14426.jpeg       no      14426
p15799coll8_87649.jpeg       no      87649
p15799coll8_75836.jpeg       no      75836


 23%|██▎       | 7/30 [27:27<1:29:00, 232.21s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing OCR: 100%|██████████| 2179/2179 [08:53<00:00,  4.08it/s]


bath segment processed data head:
                       bath.num
p15799coll8_75836.jpeg       13
p15799coll8_14426.jpeg        4
p15799coll8_87649.jpeg      1it
p15799coll8_3655.jpeg         =
p15799coll8_49866.jpg         L
                       bath.num  match_ids
p15799coll8_75836.jpeg       13      75836
p15799coll8_14426.jpeg        4      14426
p15799coll8_87649.jpeg      1it      87649
p15799coll8_3655.jpeg         =       3655
p15799coll8_49866.jpg         L      49866


 27%|██▋       | 8/30 [36:26<2:00:51, 329.63s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing images: 100%|██████████| 10895/10895 [00:09<00:00, 1147.21it/s]


condition segment processed data head:
                                condition
p15799coll8_30529.jpeg       major_repair
p15799coll8_75973.jpeg       minor_repair
p15799coll8_3655.jpeg   good;minor_repair
p15799coll8_87649.jpeg               good
p15799coll8_75836.jpeg               good
                                condition  match_ids
p15799coll8_30529.jpeg       major_repair      30529
p15799coll8_75973.jpeg       minor_repair      75973
p15799coll8_3655.jpeg   good;minor_repair       3655
p15799coll8_87649.jpeg               good      87649
p15799coll8_75836.jpeg               good      75836


 30%|███       | 9/30 [36:40<1:20:53, 231.12s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing OCR: 100%|██████████| 4351/4351 [18:31<00:00,  3.92it/s]


converted segment processed data head:
                       converted.orig_type converted.year
p15799coll8_3710.jpeg                    -              -
p15799coll8_49866.jpg                    7           1935
p15799coll8_3655.jpeg                    -              -
p15799coll8_13025.jpeg                   2            193
p15799coll8_12634.jpeg                   L           1910
                       converted.orig_type converted.year  match_ids
p15799coll8_3710.jpeg                    -              -       3710
p15799coll8_49866.jpg                    7           1935      49866
p15799coll8_3655.jpeg                    -              -       3655
p15799coll8_13025.jpeg                   2            193      13025
p15799coll8_12634.jpeg                   L           1910      12634


 33%|███▎      | 10/30 [55:16<2:48:07, 504.38s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing images: 100%|██████████| 8653/8653 [00:07<00:00, 1124.60it/s]


cooking segment processed data head:
                                   cooking
p15799coll8_87649.jpeg                 gas
p15799coll8_3710.jpeg   none_installed;gas
p15799coll8_14426.jpeg                 gas
p15799coll8_75973.jpeg                 gas
p15799coll8_79403.jpeg      none_installed
                                   cooking  match_ids
p15799coll8_87649.jpeg                 gas      87649
p15799coll8_3710.jpeg   none_installed;gas       3710
p15799coll8_14426.jpeg                 gas      14426
p15799coll8_75973.jpeg                 gas      75973
p15799coll8_79403.jpeg      none_installed      79403


 37%|███▋      | 11/30 [55:29<1:52:04, 353.92s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing OCR: 100%|██████████| 8716/8716 [39:28<00:00,  3.68it/s]


duration segment processed data head:
                       duration.vacancy_mos duration.vacancy_yrs  \
p15799coll8_75836.jpeg                    -                    -   
p15799coll8_3710.jpeg                     -                    -   
p15799coll8_14426.jpeg                    -                    -   
p15799coll8_3655.jpeg                     -               Y RENT   
p15799coll8_79403.jpeg                    -                    -   

                                duration.lived_yrs duration.lived_mos  
p15799coll8_75836.jpeg  SCAN ERROR / NO PREDICTION                  2  
p15799coll8_3710.jpeg                            L                  6  
p15799coll8_14426.jpeg                           L                  C  
p15799coll8_3655.jpeg                        not v                 cy  
p15799coll8_79403.jpeg  SCAN ERROR / NO PREDICTION                  b  
                       duration.vacancy_mos duration.vacancy_yrs  \
p15799coll8_75836.jpeg                    -          

 40%|████      | 12/30 [1:35:03<4:50:33, 968.54s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing images: 100%|██████████| 4358/4358 [00:05<00:00, 746.36it/s] 


encumbrance segment processed data head:
                           encumbrance
p15799coll8_49866.jpg                 
p15799coll8_3655.jpeg   no_encumbrance
p15799coll8_75973.jpeg                
p15799coll8_87649.jpeg                
p15799coll8_75836.jpeg                
                           encumbrance  match_ids
p15799coll8_49866.jpg                       49866
p15799coll8_3655.jpeg   no_encumbrance       3655
p15799coll8_75973.jpeg                      75973
p15799coll8_87649.jpeg                      87649
p15799coll8_75836.jpeg                      75836


 43%|████▎     | 13/30 [1:35:14<3:12:13, 678.42s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing images: 100%|██████████| 10895/10895 [00:08<00:00, 1284.49it/s]


ext_material segment processed data head:
                       ext_material
p15799coll8_75836.jpeg        brick
p15799coll8_79403.jpeg         wood
p15799coll8_13025.jpeg         wood
p15799coll8_92298.jpeg        brick
p15799coll8_3710.jpeg         brick
                       ext_material  match_ids
p15799coll8_75836.jpeg        brick      75836
p15799coll8_79403.jpeg         wood      79403
p15799coll8_13025.jpeg         wood      13025
p15799coll8_92298.jpeg        brick      92298
p15799coll8_3710.jpeg         brick       3710


 47%|████▋     | 14/30 [1:35:28<2:07:23, 477.73s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing OCR: 100%|██████████| 4319/4319 [18:08<00:00,  3.97it/s]


extra_families segment processed data head:
                       extra_families.num_fam  extra_families.num_persons
p15799coll8_14426.jpeg                      0  SCAN ERROR / NO PREDICTION
p15799coll8_75836.jpeg                      0                           -
p15799coll8_3710.jpeg                       0  SCAN ERROR / NO PREDICTION
p15799coll8_30529.jpeg                      O  SCAN ERROR / NO PREDICTION
p15799coll8_3655.jpeg                       -  SCAN ERROR / NO PREDICTION
                       extra_families.num_fam  extra_families.num_persons  \
p15799coll8_14426.jpeg                      0  SCAN ERROR / NO PREDICTION   
p15799coll8_75836.jpeg                      0                           -   
p15799coll8_3710.jpeg                       0  SCAN ERROR / NO PREDICTION   
p15799coll8_30529.jpeg                      O  SCAN ERROR / NO PREDICTION   
p15799coll8_3655.jpeg                       -  SCAN ERROR / NO PREDICTION   

                        match_ids  
p15799coll8_1

 50%|█████     | 15/30 [1:53:42<2:45:52, 663.50s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing images: 100%|██████████| 4312/4312 [00:05<00:00, 722.72it/s] 


garage segment processed data head:
                       garage
p15799coll8_3710.jpeg      no
p15799coll8_14426.jpeg     no
p15799coll8_75973.jpeg     no
p15799coll8_49866.jpg      no
p15799coll8_30529.jpeg     no
                       garage  match_ids
p15799coll8_3710.jpeg      no       3710
p15799coll8_14426.jpeg     no      14426
p15799coll8_75973.jpeg     no      75973
p15799coll8_49866.jpg      no      49866
p15799coll8_30529.jpeg     no      30529


 53%|█████▎    | 16/30 [1:53:53<1:49:00, 467.16s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing images: 100%|██████████| 17425/17425 [00:10<00:00, 1599.92it/s]


included_in_rent segment processed data head:
                                                         included_in_rent
p15799coll8_3710.jpeg   hot_water;heat;refrig_fuel;refrig;furniture;co...
p15799coll8_79403.jpeg                     hot_water;heat;furniture;light
p15799coll8_13025.jpeg           hot_water;heat;furniture;cook_fuel;light
p15799coll8_92298.jpeg  hot_water;heat;refrig_fuel;refrig;furniture;co...
p15799coll8_44403.jpg                                 furniture;cook_fuel
                                                         included_in_rent  \
p15799coll8_3710.jpeg   hot_water;heat;refrig_fuel;refrig;furniture;co...   
p15799coll8_79403.jpeg                     hot_water;heat;furniture;light   
p15799coll8_13025.jpeg           hot_water;heat;furniture;cook_fuel;light   
p15799coll8_92298.jpeg  hot_water;heat;refrig_fuel;refrig;furniture;co...   
p15799coll8_44403.jpg                                 furniture;cook_fuel   

                        match_ids  
p15799coll8

 57%|█████▋    | 17/30 [1:54:11<1:11:54, 331.87s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing images: 100%|██████████| 6537/6537 [00:05<00:00, 1128.37it/s]


lighting segment processed data head:
                        lighting
p15799coll8_14426.jpeg  electric
p15799coll8_79403.jpeg  electric
p15799coll8_13025.jpeg  electric
p15799coll8_92298.jpeg  electric
p15799coll8_75836.jpeg  electric
                        lighting  match_ids
p15799coll8_14426.jpeg  electric      14426
p15799coll8_79403.jpeg  electric      79403
p15799coll8_13025.jpeg  electric      13025
p15799coll8_92298.jpeg  electric      92298
p15799coll8_75836.jpeg  electric      75836


 60%|██████    | 18/30 [1:54:22<47:07, 235.64s/it]  

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing OCR: 100%|██████████| 2179/2179 [08:56<00:00,  4.06it/s]


major_structures segment processed data head:
                       major_structures.num
p15799coll8_75836.jpeg                    -
p15799coll8_14426.jpeg                    -
p15799coll8_30529.jpeg                    -
p15799coll8_49866.jpg                     -
p15799coll8_87649.jpeg                    -
                       major_structures.num  match_ids
p15799coll8_75836.jpeg                    -      75836
p15799coll8_14426.jpeg                    -      14426
p15799coll8_30529.jpeg                    -      30529
p15799coll8_49866.jpg                     -      49866
p15799coll8_87649.jpeg                    -      87649


 63%|██████▎   | 19/30 [2:03:25<1:00:06, 327.82s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing OCR: 100%|██████████| 17296/17296 [1:31:48<00:00,  3.14it/s]


number_and_age segment processed data head:
                       number_and_age.20_64 number_and_age.5_9  \
p15799coll8_3710.jpeg                     3                  -   
p15799coll8_14426.jpeg                    -                  -   
p15799coll8_75836.jpeg                    O                  -   
p15799coll8_49866.jpg                     2                  -   
p15799coll8_3655.jpeg                     -                  -   

                       number_and_age.65_over      number_and_age.under_1  \
p15799coll8_3710.jpeg                       -  SCAN ERROR / NO PREDICTION   
p15799coll8_14426.jpeg                      -  SCAN ERROR / NO PREDICTION   
p15799coll8_75836.jpeg                      -  SCAN ERROR / NO PREDICTION   
p15799coll8_49866.jpg                       -  SCAN ERROR / NO PREDICTION   
p15799coll8_3655.jpeg                      LC                           -   

                              number_and_age.10_14  \
p15799coll8_3710.jpeg                     

 67%|██████▋   | 20/30 [3:35:22<5:14:18, 1885.84s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing images: 100%|██████████| 6537/6537 [00:07<00:00, 931.82it/s] 


occupancy segment processed data head:
                           occupancy
p15799coll8_3655.jpeg   owner;vacant
p15799coll8_75836.jpeg        tenant
p15799coll8_87649.jpeg        tenant
p15799coll8_30529.jpeg        tenant
p15799coll8_14426.jpeg        tenant
                           occupancy  match_ids
p15799coll8_3655.jpeg   owner;vacant       3655
p15799coll8_75836.jpeg        tenant      75836
p15799coll8_87649.jpeg        tenant      87649
p15799coll8_30529.jpeg        tenant      30529
p15799coll8_14426.jpeg        tenant      14426


 70%|███████   | 21/30 [3:35:35<3:18:33, 1323.67s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing images: 100%|██████████| 8543/8543 [00:06<00:00, 1248.20it/s]


refrig segment processed data head:
                                            refrig
p15799coll8_3710.jpeg                     electric
p15799coll8_79403.jpeg                        none
p15799coll8_13025.jpeg                        none
p15799coll8_92298.jpeg                    electric
p15799coll8_44403.jpg   SCAN ERROR / NO PREDICTION
                                            refrig  match_ids
p15799coll8_3710.jpeg                     electric       3710
p15799coll8_79403.jpeg                        none      79403
p15799coll8_13025.jpeg                        none      13025
p15799coll8_92298.jpeg                    electric      92298
p15799coll8_44403.jpg   SCAN ERROR / NO PREDICTION      44403


 73%|███████▎  | 22/30 [3:35:48<2:04:02, 930.33s/it] 

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing OCR: 100%|██████████| 2179/2179 [09:04<00:00,  4.00it/s]


stories segment processed data head:
                       stories.num
p15799coll8_3710.jpeg            -
p15799coll8_30529.jpeg           4
p15799coll8_14426.jpeg          -3
p15799coll8_49866.jpg            3
p15799coll8_3655.jpeg            b
                       stories.num  match_ids
p15799coll8_3710.jpeg            -       3710
p15799coll8_30529.jpeg           4      30529
p15799coll8_14426.jpeg          -3      14426
p15799coll8_49866.jpg            3      49866
p15799coll8_3655.jpeg            b       3655


 77%|███████▋  | 23/30 [3:44:59<1:35:15, 816.51s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing images: 100%|██████████| 13074/13074 [00:09<00:00, 1424.80it/s]


structure segment processed data head:
                                                                structure
p15799coll8_75836.jpeg                                              other
p15799coll8_30529.jpeg                                              other
p15799coll8_14426.jpeg                                         two_decker
p15799coll8_75973.jpeg                                              other
p15799coll8_3655.jpeg   two_side;single_detached;single_attached;four_...
                                                                structure  \
p15799coll8_75836.jpeg                                              other   
p15799coll8_30529.jpeg                                              other   
p15799coll8_14426.jpeg                                         two_decker   
p15799coll8_75973.jpeg                                              other   
p15799coll8_3655.jpeg   two_side;single_detached;single_attached;four_...   

                        match_ids  
p15799coll8_75836.

 80%|████████  | 24/30 [3:45:15<57:37, 576.23s/it]  

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing OCR: 100%|██████████| 10895/10895 [51:46<00:00,  3.51it/s]


structure_text segment processed data head:
                       structure_text.partially_converted  \
p15799coll8_30529.jpeg                                  -   
p15799coll8_3710.jpeg                                   -   
p15799coll8_75836.jpeg                                  -   
p15799coll8_3655.jpeg                                   -   
p15799coll8_87649.jpeg                                  -   

                       structure_text.business_with_dwel  \
p15799coll8_30529.jpeg                                 -   
p15799coll8_3710.jpeg         SCAN ERROR / NO PREDICTION   
p15799coll8_75836.jpeg        SCAN ERROR / NO PREDICTION   
p15799coll8_3655.jpeg                                  -   
p15799coll8_87649.jpeg                                 -   

                       structure_text.non_converted  \
p15799coll8_30529.jpeg                            -   
p15799coll8_3710.jpeg    SCAN ERROR / NO PREDICTION   
p15799coll8_75836.jpeg   SCAN ERROR / NO PREDICTION   
p15799co

 83%|████████▎ | 25/30 [4:37:09<1:51:28, 1337.66s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing OCR: 100%|██████████| 2179/2179 [09:01<00:00,  4.02it/s]


toilets segment processed data head:
                       toilets.num
p15799coll8_3710.jpeg           CL
p15799coll8_75836.jpeg        E1/3
p15799coll8_14426.jpeg         Fy4
p15799coll8_49866.jpg           CL
p15799coll8_3655.jpeg            1
                       toilets.num  match_ids
p15799coll8_3710.jpeg           CL       3710
p15799coll8_75836.jpeg        E1/3      75836
p15799coll8_14426.jpeg         Fy4      14426
p15799coll8_49866.jpg           CL      49866
p15799coll8_3655.jpeg            1       3655


 87%|████████▋ | 26/30 [4:46:16<1:13:22, 1100.65s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing OCR: 100%|██████████| 2170/2170 [09:02<00:00,  4.00it/s]


total_rooms segment processed data head:
                       total_rooms.num
p15799coll8_3710.jpeg                3
p15799coll8_30529.jpeg              12
p15799coll8_75973.jpeg               -
p15799coll8_49866.jpg                2
p15799coll8_87649.jpeg               T
                       total_rooms.num  match_ids
p15799coll8_3710.jpeg                3       3710
p15799coll8_30529.jpeg              12      30529
p15799coll8_75973.jpeg               -      75973
p15799coll8_49866.jpg                2      49866
p15799coll8_87649.jpeg               T      87649


 90%|█████████ | 27/30 [4:55:26<46:45, 935.20s/it]   

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing OCR: 100%|██████████| 2179/2179 [09:03<00:00,  4.01it/s]


value segment processed data head:
                       value.num
p15799coll8_50386.jpeg      6000
p15799coll8_3317.jpeg          -
p15799coll8_75924.jpeg    H0.000
p15799coll8_82054.jpg     12.000
p15799coll8_87864.jpeg         -
                       value.num  match_ids
p15799coll8_50386.jpeg      6000      50386
p15799coll8_3317.jpeg          -       3317
p15799coll8_75924.jpeg    H0.000      75924
p15799coll8_82054.jpg     12.000      82054
p15799coll8_87864.jpeg         -      87864


 93%|█████████▎| 28/30 [5:04:36<27:19, 819.79s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing images: 100%|██████████| 6537/6537 [00:07<00:00, 929.98it/s] 


water segment processed data head:
                                             water
p15799coll8_75836.jpeg                hot_and_cold
p15799coll8_3710.jpeg                 hot_and_cold
p15799coll8_49866.jpg                 hot_and_cold
p15799coll8_87649.jpeg                hot_and_cold
p15799coll8_3655.jpeg   SCAN ERROR / NO PREDICTION
                                             water  match_ids
p15799coll8_75836.jpeg                hot_and_cold      75836
p15799coll8_3710.jpeg                 hot_and_cold       3710
p15799coll8_49866.jpg                 hot_and_cold      49866
p15799coll8_87649.jpeg                hot_and_cold      87649
p15799coll8_3655.jpeg   SCAN ERROR / NO PREDICTION       3655


 97%|█████████▋| 29/30 [5:04:50<09:37, 577.92s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx


Processing OCR: 100%|██████████| 2179/2179 [08:53<00:00,  4.08it/s]


year_built segment processed data head:
                       year_built.num
p15799coll8_3710.jpeg            1905
p15799coll8_14426.jpeg           1895
p15799coll8_75836.jpeg             90
p15799coll8_30529.jpeg           1914
p15799coll8_49866.jpg             190
                       year_built.num  match_ids
p15799coll8_3710.jpeg            1905       3710
p15799coll8_14426.jpeg           1895      14426
p15799coll8_75836.jpeg             90      75836
p15799coll8_30529.jpeg           1914      30529
p15799coll8_49866.jpg             190      49866


100%|██████████| 30/30 [5:13:50<00:00, 627.69s/it]

Data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Updated.xlsx





Final data saved to /Users/xieewenz/BunkerHill-CardScanToData/data/WPA_Bunker_Hill_Curated_Final.xlsx
