In [1]:
import ast
import uuid
import pandas as pd
from rentradar.db.duckdb import DuckDBManager
from rentradar.process.process_rentcast_data import RentCastData
from rentradar.utils.utils import string_to_uuid

In this notebook I'll normalize and clean all of the raw data and migrate it to the in-memory DuckDB database.

### Setup

- The following cell creates a database at the specified location

In [2]:
db_manager = DuckDBManager('../rentradar/db/rentradar.db')

- Read in the cached data

In [3]:
data = RentCastData(
    properties = pd.read_csv('../data/raw/cville_properties.csv'),
    long_term_rentals = pd.read_csv('../data/raw/cville_long_term_rentals.csv'),
    sale_listings = pd.read_csv('../data/raw/cville_sale_listings.csv'),
    markets_current = pd.read_csv('../data/raw/cville_current_market_stats.csv'),
    markets_history = pd.read_csv('../data/raw/cville_historical_market_stats.csv')
)

In [4]:
data.properties['property_id'] = data.properties['id'].apply(string_to_uuid)

In [5]:
data.zipcodes

array([22904, 22903, 22902, 22901, 22911, 22906, 22942, 22932, 24590,
       22936, 22940, 22947, 22968, 22923])

### Counties table

In [6]:
data.counties

array(['Albemarle', 'Charlottesville City', 'Charlottesville City County',
       'Albemarle County', 'Louisa County', 'Charlottesville',
       'Fluvanna County', nan, 'Fluvanna', 'Greene County'], dtype=object)

In [7]:
county_normalization_map = {
    'Albemarle': 'Albemarle County',
    'Albemarle County': 'Albemarle County',
    'Charlottesville City': 'Charlottesville City',
    'Charlottesville City County': 'Charlottesville City',
    'Charlottesville': 'Charlottesville City',
    'Louisa County': 'Louisa County',
    'Fluvanna County': 'Fluvanna County',
    'Fluvanna': 'Fluvanna County',
    'Greene County': 'Greene County',
}

data.properties['county'] = data.properties['county'].map(county_normalization_map)

In [8]:
data.counties

array(['Albemarle County', 'Charlottesville City', 'Louisa County',
       'Fluvanna County', nan, 'Greene County'], dtype=object)

In [9]:
data.properties[pd.isna(data.properties['county'])]

Unnamed: 0,id,formattedAddress,addressLine1,addressLine2,city,state,zipCode,county,latitude,longitude,...,yearBuilt,assessorID,legalDescription,subdivision,zoning,taxAssessments,propertyTaxes,owner,ownerOccupied,property_id
14815,"211-Cream-St,-Apt-401,-Charlottesville,-VA-22903","211 Cream St, Apt 401, Charlottesville, VA 22903",211 Cream St,Apt 401,Charlottesville,VA,22903,,38.032588,-78.489985,...,2007.0,,,,,"{'2018': {'value': 429000, 'land': 65200, 'imp...","{'2018': {'total': 4102}, '2019': {'total': 38...",,,0a506b15-fce6-5daa-9f2e-5ffdefd4f247
17299,"1211-Rugby-Rd,-Charlottesville,-VA-22903","1211 Rugby Rd, Charlottesville, VA 22903",1211 Rugby Rd,,Charlottesville,VA,22903,,38.048805,-78.492835,...,1955.0,,,,,"{'2018': {'value': 523200, 'land': 230000, 'im...","{'2018': {'total': 5215}, '2019': {'total': 54...",,,c4c20619-6a2c-5496-8903-d1dc046a763f


In [10]:
data.properties[data.properties['zipCode'] == 22903].head(2)

Unnamed: 0,id,formattedAddress,addressLine1,addressLine2,city,state,zipCode,county,latitude,longitude,...,yearBuilt,assessorID,legalDescription,subdivision,zoning,taxAssessments,propertyTaxes,owner,ownerOccupied,property_id
8,"4-University-Cir,-Apt-4,-Charlottesville,-VA-2...","4 University Cir, Apt 4, Charlottesville, VA 2...",4 University Cir,Apt 4,Charlottesville,VA,22903,Charlottesville City,38.041914,-78.500827,...,,,,,,,,,,20533238-e85d-51d4-97e6-712fe2699a84
9,"6-University-Cir,-Apt-4,-Charlottesville,-VA-2...","6 University Cir, Apt 4, Charlottesville, VA 2...",6 University Cir,Apt 4,Charlottesville,VA,22903,Charlottesville City,38.0425,-78.500857,...,,,,,,,,,,d1cce5db-bbc8-599b-b2dc-3accccd73449


In [11]:
data.properties.loc[[14815, 17299], 'county'] = 'Charlottesville City'
data.properties[pd.isna(data.properties['county'])]

Unnamed: 0,id,formattedAddress,addressLine1,addressLine2,city,state,zipCode,county,latitude,longitude,...,yearBuilt,assessorID,legalDescription,subdivision,zoning,taxAssessments,propertyTaxes,owner,ownerOccupied,property_id


In [12]:
unique_zipcodes_per_county = data.properties.groupby('county')['zipCode'].nunique()
unique_zipcodes_per_county

county
Albemarle County        12
Charlottesville City     9
Fluvanna County          3
Greene County            1
Louisa County            1
Name: zipCode, dtype: int64

In [13]:
county_ids = []
for c in data.counties:
    county_ids.append(string_to_uuid(c))

counties_df = pd.DataFrame({"id":county_ids, "county":data.counties})

In [14]:
counties_df

Unnamed: 0,id,county
0,cd1120ac-f345-5b53-baab-74449599067b,Albemarle County
1,52c07453-ee28-5c0d-978d-d403c78b58af,Charlottesville City
2,84328055-b1cf-5279-a09a-e98f63e3d09b,Louisa County
3,ff8d11db-883a-5f59-af56-02105632d7f8,Fluvanna County
4,6780b594-0747-5b1c-adf0-6b8809c8cdf0,Greene County


In [15]:
db_manager.table_from_dataframe(counties_df, 'counties')

  df.to_sql(table_name, self.conn, if_exists="replace", index=False)
2024-02-25 00:04:30,504 - rentradar.db.duckdb - INFO - Table 'counties' created from DataFrame


Confirm it worked

In [16]:
db_manager.execute_query('SELECT * FROM counties')

2024-02-25 00:04:33,801 - rentradar.db.duckdb - INFO - Executed query: SELECT * FROM counties


Unnamed: 0,id,county
0,cd1120ac-f345-5b53-baab-74449599067b,Albemarle County
1,52c07453-ee28-5c0d-978d-d403c78b58af,Charlottesville City
2,84328055-b1cf-5279-a09a-e98f63e3d09b,Louisa County
3,ff8d11db-883a-5f59-af56-02105632d7f8,Fluvanna County
4,6780b594-0747-5b1c-adf0-6b8809c8cdf0,Greene County


### Property types table

In [18]:
data.properties.propertyType.unique()

array([nan, 'Apartment', 'Condo', 'Duplex-Triplex', 'Land',
       'Manufactured', 'Miscellaneous', 'Multi Family', 'Multi-Family',
       'Single Family', 'Townhouse', 'Vacant'], dtype=object)

In [19]:
property_type_map = {
    'Multi Family': 'Multi-Family',
    'Single Family': 'Single-Family',
    'Duplex-Triplex': 'Multi-Family',
}
data.properties['propertyType'] = data.properties['propertyType'].replace(property_type_map)

In [21]:
property_types = data.properties.propertyType.unique()
property_types = [x for x in property_types if not pd.isna(x)]
property_types

['Apartment',
 'Condo',
 'Multi-Family',
 'Land',
 'Manufactured',
 'Miscellaneous',
 'Single-Family',
 'Townhouse',
 'Vacant']

In [22]:
property_type_ids = []
for p in property_types:
    property_type_ids.append(string_to_uuid(p))

property_type_descriptions = [
    "A commercial multi-family building or apartment complex (5+ units)",
    "A single unit in a condominium development or building, which is part of a homeowner’s association (HOA)",
    "A residential multi-family building (2-4 units)",
    "A single parcel of vacant, undeveloped land",
    "A pre-fabricated or mobile home, typically constructed at a factory",
    "A miscellaneous property type",
    "A detached, single-family property",
    "A single-family property that shares walls with other adjacent homes, and is typically part of a homeowner’s association (HOA)",
    "A property that is unoccupied"
]

property_types_df = pd.DataFrame({"id":property_type_ids, "propertyType":property_types, "description":property_type_descriptions})

db_manager.table_from_dataframe(property_types_df, 'property_types')

  df.to_sql(table_name, self.conn, if_exists='replace', index=False)


In [23]:
db_manager.execute_query('SELECT * FROM property_types')

Unnamed: 0,id,propertyType,description
0,696f487f-8604-52ae-b450-2e9fe9075f34,Apartment,A commercial multi-family building or apartmen...
1,e6bd4a8e-d5c3-55ec-8b15-d64daedbb5f2,Condo,A single unit in a condominium development or ...
2,1adb7637-5e3e-5de1-8206-6a5b698af656,Multi-Family,A residential multi-family building (2-4 units)
3,0f366103-9b27-5a94-86e7-e3875d7cb873,Land,"A single parcel of vacant, undeveloped land"
4,fbc1533c-5c67-5893-b9c2-0567f0755cdf,Manufactured,"A pre-fabricated or mobile home, typically con..."
5,277c3ebc-87ce-5991-88b3-c1d264b51d69,Miscellaneous,A miscellaneous property type
6,f34e0b29-6f03-5819-95e3-e96f579d4106,Single-Family,"A detached, single-family property"
7,056ac94a-b8d4-5ba3-a244-740e9b78ac33,Townhouse,A single-family property that shares walls wit...
8,d811e570-3d63-51e4-a59d-5cae303322c3,Vacant,A property that is unoccupied


### Tax assessments table

In [25]:
tax_assessments = data.properties[['property_id', 'taxAssessments']]

In [28]:
tax_assessments = tax_assessments.explode('taxAssessments')
tax_assessments.dropna(subset=['taxAssessments'], inplace=True)
tax_assessments['taxAssessments'] = tax_assessments['taxAssessments'].apply(ast.literal_eval)

rows = []
for idx, row in tax_assessments.iterrows():
    for year, assessment in row['taxAssessments'].items():
        rows.append({
            'property_id': row['property_id'],
            'year': year,
            'total_value': assessment.get('value', None),
            'land_value': assessment.get('land', None),
            'improvements_value': assessment.get('improvements', None)
        })

tax_assessments = pd.DataFrame(rows)

tax_assessments['assessment_id'] = tax_assessments.apply(
    lambda row: uuid.uuid5(uuid.NAMESPACE_DNS, str(row['property_id']) + str(row['year'])),
    axis=1
)

tax_assessments = tax_assessments[['assessment_id', 'property_id', 'year', 'total_value', 'land_value', 'improvements_value']]

db_manager.table_from_dataframe(tax_assessments, 'tax_assessments')

  df.to_sql(table_name, self.conn, if_exists='replace', index=False)


In [30]:
db_manager.execute_query('SELECT * FROM tax_assessments').head()

Unnamed: 0,assessment_id,property_id,year,total_value,land_value,improvements_value
0,98d21fd0-bde0-5880-99fd-d5c57c0553fa,be793931-39ee-5c0c-803b-bde70a082871,2023,281400.0,281300.0,100.0
1,283a33f7-b245-507e-9d8b-16e6da32a55d,f4983b4c-3991-569f-8291-73134d413851,2020,44053100.0,4400000.0,39653100.0
2,c5cd6269-3b51-5d32-808c-21a0740cc912,109dbee5-149b-5856-b87e-1e7961a423a9,2022,57136000.0,4480000.0,52656000.0
3,47aad8b6-23fd-5b41-ab33-e9b9656ad9e1,109dbee5-149b-5856-b87e-1e7961a423a9,2023,73294096.0,5320000.0,67974096.0
4,d8391a7b-6ede-536c-8e99-b51274a5aff2,a8a4e290-72b9-57af-8a45-44030f66d7c6,2019,563300.0,177000.0,386300.0


### Property taxes table

In [31]:
property_taxes = data.properties[['property_id', 'propertyTaxes']]
property_taxes.dropna(subset=['propertyTaxes'], inplace=True)
property_taxes['propertyTaxes'] = property_taxes['propertyTaxes'].apply(ast.literal_eval)
rows = []
for idx, row in property_taxes.iterrows():
    for year, assessment in row['propertyTaxes'].items():
        rows.append({
            'property_id': row['property_id'],
            'year': year,
            'total': assessment.get('total', None),
        })
property_taxes = pd.DataFrame(rows)

property_taxes['property_tax_id'] = property_taxes.apply(
    lambda row: uuid.uuid5(uuid.NAMESPACE_DNS, str(row['property_id']) + str(row['year'])),
    axis=1
)
property_taxes = property_taxes[['property_tax_id', 'property_id', 'year', 'total']]
db_manager.table_from_dataframe(property_taxes, 'property_taxes')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  property_taxes.dropna(subset=['propertyTaxes'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  property_taxes['propertyTaxes'] = property_taxes['propertyTaxes'].apply(ast.literal_eval)
  df.to_sql(table_name, self.conn, if_exists='replace', index=False)


In [32]:
db_manager.execute_query('SELECT * FROM property_taxes').head()

Unnamed: 0,property_tax_id,property_id,year,total
0,a9a2b0c2-f9d0-5bff-a416-d15d06c1bac3,be793931-39ee-5c0c-803b-bde70a082871,2022,2403
1,a55ab43f-4ae0-5fb2-ad31-2e3b919b267f,f4983b4c-3991-569f-8291-73134d413851,2019,366281
2,0b109fe6-1048-5806-a71b-18c91b9f9e7f,109dbee5-149b-5856-b87e-1e7961a423a9,2021,435435
3,c5cd6269-3b51-5d32-808c-21a0740cc912,109dbee5-149b-5856-b87e-1e7961a423a9,2022,487941
4,d8391a7b-6ede-536c-8e99-b51274a5aff2,a8a4e290-72b9-57af-8a45-44030f66d7c6,2019,2748


### Property features table

In [7]:
property_features = data.properties[['property_id', 'features']]
property_features.dropna(subset=['features'], inplace=True)
property_features['features'] = property_features['features'].apply(ast.literal_eval)
property_features = property_features[property_features['features'].apply(lambda x: bool(x))]
property_features_expanded = pd.json_normalize(property_features['features'])
property_features = property_features.join(property_features_expanded)
property_features.drop(columns=['features'], inplace=True)
data.properties.drop(columns=['features'], inplace=True)
properties_subset = data.properties[['property_id', 'bedrooms', 'bathrooms', 'squareFootage', 'lotSize']]
property_features = pd.merge(properties_subset, property_features, on='property_id', how='outer')
db_manager.table_from_dataframe(property_features, 'property_features')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  property_features.dropna(subset=['features'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  property_features['features'] = property_features['features'].apply(ast.literal_eval)
  df.to_sql(table_name, self.conn, if_exists='replace', index=False)


In [8]:
db_manager.execute_query('SELECT * FROM property_features').head()

Unnamed: 0,property_id,bedrooms,bathrooms,squareFootage,lotSize,floorCount,garage,garageType,architectureType,exteriorType,...,unitCount,garageSpaces,roofType,foundationType,roomCount,fireplace,fireplaceType,pool,poolType,viewType
0,0002939e-49d9-5578-b0fd-2bdac3057bae,,,,,,,,,,...,,,,,,,,,,
1,00075623-f6c9-54ba-a326-367830d9d027,,1.0,722.0,,2.0,,,Townhouse,Brick,...,1.0,,Composition Shingle,,6.0,1.0,Prefab,,,
2,00098105-7a85-5d19-9dc5-630ba94b6a4f,3.0,4.0,4228.0,206474.0,,,,,,...,,,,,,,,,,
3,000d73a9-4b7d-5169-8e7f-fcd1fd1440da,,,,,,,,,,...,,,,,,,,,,
4,000dac9e-f13e-5da8-9770-91b5aa0478b9,,,,,,,,,,...,,,,,,,,,,


### Property owners table

In [10]:
property_owners = data.properties[['property_id', 'owner']]
property_owners.dropna(subset=['owner'], inplace=True)
property_owners['owner'] = property_owners['owner'].apply(ast.literal_eval)
property_owners_expanded = pd.json_normalize(property_owners['owner'])
property_owners.reset_index(inplace=True)
property_owners = property_owners.join(property_owners_expanded)
property_owners.drop(columns=['owner', 'index'], inplace=True)
max_owners = property_owners['names'].apply(len).max()

for i in range(max_owners):
    property_owners[f'owner{i+1}'] = property_owners['names'].apply(lambda x: x[i] if i < len(x) else None)

property_owners.owner2.isnull().sum()
property_owners.rename(columns={'owner1':'owner'}, inplace=True)
property_owners['owner_id'] = property_owners.apply(
    lambda row: uuid.uuid5(uuid.NAMESPACE_DNS, str(row['owner'])),
    axis=1
)
property_owners = property_owners[['owner_id', 'property_id', 'owner']]
db_manager.table_from_dataframe(property_owners, 'property_owners')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  property_owners.dropna(subset=['owner'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  property_owners['owner'] = property_owners['owner'].apply(ast.literal_eval)


In [26]:
db_manager.execute_query('SELECT property_id, owner_id FROM property_owners').head()

Unnamed: 0,property_id,owner_id
0,be793931-39ee-5c0c-803b-bde70a082871,548214dc-a57a-5937-8255-a22348e7eb3e
1,f4983b4c-3991-569f-8291-73134d413851,38a0b4d7-c999-543c-a788-fa5b651f2750
2,109dbee5-149b-5856-b87e-1e7961a423a9,5fccf1eb-86f9-566e-a2fb-5d3e98ba3d16
3,a8a4e290-72b9-57af-8a45-44030f66d7c6,4fd50534-6bd2-5182-b4c8-18f962eb44af
4,15fe5def-b66a-580e-a5b3-3994247571f7,06ebb77a-ab8b-50cc-ba4b-18df2bbdbb0d


### Properties table

In [27]:
properties = data.properties

In [29]:
properties.columns

Index(['id', 'formattedAddress', 'addressLine1', 'addressLine2', 'city',
       'state', 'zipCode', 'county', 'latitude', 'longitude', 'lastSaleDate',
       'lastSalePrice', 'bedrooms', 'bathrooms', 'squareFootage',
       'propertyType', 'lotSize', 'yearBuilt', 'assessorID',
       'legalDescription', 'subdivision', 'zoning', 'taxAssessments',
       'propertyTaxes', 'owner', 'ownerOccupied', 'property_id'],
      dtype='object')

In [30]:
properties.drop(columns=['owner', 'taxAssessments', 'propertyTaxes', 'bedrooms', 'bathrooms', 'squareFootage', 'lotSize'], inplace=True)

In [34]:
properties = properties[['property_id', 'id', 'formattedAddress', 'zipCode', 'county', 'subdivision', 'latitude', 'longitude', 'propertyType', 'ownerOccupied', 'yearBuilt', 'lastSaleDate', 'lastSalePrice', 'zoning','assessorID', 'legalDescription']]

In [35]:
db_manager.table_from_dataframe(properties, 'properties')

  df.to_sql(table_name, self.conn, if_exists='replace', index=False)


### Long term rentals

In [37]:
data.long_term_rentals.columns

Index(['id', 'formattedAddress', 'addressLine1', 'addressLine2', 'city',
       'state', 'zipCode', 'county', 'latitude', 'longitude', 'propertyType',
       'bedrooms', 'bathrooms', 'status', 'price', 'listedDate', 'removedDate',
       'createdDate', 'lastSeenDate', 'daysOnMarket', 'squareFootage',
       'yearBuilt', 'lotSize'],
      dtype='object')

In [38]:
ltr = data.long_term_rentals
ltr['property_id'] = ltr['id'].apply(string_to_uuid)

In [40]:
ltr = ltr.drop(columns=['formattedAddress', 'addressLine1', 'addressLine2', 'city',
       'state', 'zipCode', 'county', 'latitude', 'longitude', 'propertyType',
       'bedrooms', 'bathrooms', 'squareFootage','yearBuilt', 'lotSize'], axis=1)

In [42]:
ltr = ltr[['property_id', 'id', 'price', 'status', 'daysOnMarket', 'listedDate', 'createdDate', 'lastSeenDate', 'removedDate']]

In [43]:
db_manager.table_from_dataframe(ltr, 'long_term_rentals')

  df.to_sql(table_name, self.conn, if_exists='replace', index=False)


### Sale listings

In [45]:
data.sale_listings.columns

Index(['id', 'formattedAddress', 'addressLine1', 'addressLine2', 'city',
       'state', 'zipCode', 'county', 'latitude', 'longitude', 'propertyType',
       'bedrooms', 'bathrooms', 'squareFootage', 'lotSize', 'yearBuilt',
       'status', 'price', 'listedDate', 'removedDate', 'createdDate',
       'lastSeenDate', 'daysOnMarket'],
      dtype='object')

In [46]:
sl = data.sale_listings
sl['property_id'] = sl['id'].apply(string_to_uuid)
sl = sl[['property_id', 'id', 'status', 'price', 'listedDate', 'removedDate', 'createdDate', 'lastSeenDate', 'daysOnMarket']]
db_manager.table_from_dataframe(sl, 'sale_listings')

  df.to_sql(table_name, self.conn, if_exists='replace', index=False)


### Current market stats

In [48]:
cms = data.markets_current
db_manager.table_from_dataframe(cms, 'current_market_stats')

  df.to_sql(table_name, self.conn, if_exists='replace', index=False)


### Historic market stats

In [51]:
hms = data.markets_history
db_manager.table_from_dataframe(hms, 'historic_market_stats')

  df.to_sql(table_name, self.conn, if_exists='replace', index=False)


### Close DB connection

In [17]:
db_manager.close()

2024-02-25 00:04:54,933 - rentradar.db.duckdb - INFO - Database connection closed.
