In [1]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests

In [33]:
# Lookup table of cities, read from a CSV in the working directory.
# loadCityData filters this frame on its 'city' column to find the values it
# needs to build the download URL for a given city.
city_codes_df = pd.read_csv('city_codes.csv')
city_codes_df.head()

Unnamed: 0,city,code,page_type
0,Alamo,4800,page
1,Blackhawk,4600,page
2,Clayton,5900,page
3,Concord,5701,page
4,Clyde,5702,page


In [183]:
# Column layout of the accumulator frame that the loader functions append to.
columns = [
    'City',
    'Townhouse-Condo Attached',
    'Single-Family Detached',
    'New Listings',
    'Pending Sales',
    'Closed Sales',
    'Days on Market Until Sale',
    'Median Sales Price',
    'Average Sales Price',
    'Percent of Original List Price Received',
    'Inventory of Homes for Sale',
    'Year',
    'Month',
    'Month_num',
]

# Start with an empty frame carrying only the column labels.
df = pd.DataFrame(columns=columns)
df

Unnamed: 0,City,Townhouse-Condo Attached,Single-Family Detached,New Listings,Pending Sales,Closed Sales,Days on Market Until Sale,Median Sales Price,Average Sales Price,Percent of Original List Price Received,Inventory of Homes for Sale,Year,Month,Month_num


In [83]:
# Zero-padded month number ('01'-'12') -> English month name.
months = {
    f'{number:02d}': name
    for number, name in enumerate(
        [
            "January", "February", "March", "April", "May", "June",
            "July", "August", "September", "October", "November", "December",
        ],
        start=1,
    )
}

In [123]:
# Integer house-type code -> label (0 and 1, in that order).
house_types = dict(enumerate(('Single-Family Detached',
                              'Townhouse-Condo Attached')))

In [182]:
def _reshape_stat_table(stat_table, is_single_family):
    '''
    Turn one scraped statistics table into a metric-per-column frame.

    INPUT:
    stat_table - pandas DataFrame, one table as returned by pd.read_html
    is_single_family - bool, True flags the data as Single-Family Detached,
                       False as Townhouse-Condo Attached

    OUTPUT:
    pandas DataFrame built from the table's third column, with the two
    house-type indicator columns appended
    '''
    # Skip the table's first row, pivot, and keep only what were the first
    # three columns of the page table (now rows).
    wide = stat_table[1:].transpose()[:3]
    # The first pivoted row supplies the column labels and is then removed.
    wide = wide.rename(columns=wide.iloc[0]).drop(wide.index[0])
    wide['Single-Family Detached'] = int(is_single_family)
    wide['Townhouse-Condo Attached'] = int(not is_single_family)
    # Of the two rows left, only the second one is kept.
    # NOTE(review): presumably the first is a comparison period - confirm
    # against the page layout.
    return wide[1:]


def _price_to_number(prices):
    '''
    Convert price strings such as "$740,000" to numbers.

    Returns int32 when every value is a whole number (the normal case);
    otherwise a float Series with NaN for anything that cannot be parsed,
    in line with the errors='coerce' handling of the other numeric columns.
    '''
    cleaned = (prices.astype(str)
                     .str.replace('$', '', regex=False)
                     .str.replace(',', '', regex=False))
    numeric = pd.to_numeric(cleaned, errors='coerce')
    if numeric.notna().all() and (numeric % 1 == 0).all():
        numeric = numeric.astype('int32')
    return numeric


def loadCityData(city, city_codes_df, month, year):
    '''
    Download one month of market statistics for a city from
    main.ccartoday.com (see https://ccartoday.com/market-statistics/).

    INPUT:
    city - string, must appear in the 'city' column of city_codes_df
    city_codes_df - pandas DataFrame with 'city', 'code' and 'page_type' columns
    month - month number as a zero-padded string ('01'-'12') or an int (1-12)
    year - year, int or string; inserted into the URL as given

    OUTPUT:
    single_df - pandas DataFrame with one row built from the first table on
                the page (flagged Single-Family Detached) and one from the
                second (flagged Townhouse-Condo Attached), plus City, Month,
                Month_num, Year and the numeric Median / Average columns

    RAISES:
    KeyError - month does not correspond to a key of `months`
    IndexError - city is not listed in city_codes_df
    '''
    # Normalise first so 2 and '02' behave the same, and so a bad month
    # fails before any network traffic.
    month_key = str(month).zfill(2)
    month_name = months[month_key]

    city_rows = city_codes_df.loc[city_codes_df['city'] == city]
    if city_rows.empty:
        # Same exception type the positional lookup used to raise, but with
        # a message that names the actual problem.
        raise IndexError(f"city {city!r} not found in city_codes_df")
    city_code = city_rows['code'].iloc[0]
    page_type = city_rows['page_type'].iloc[0]

    central_stat_url = f"http://main.ccartoday.com/index.php/{page_type}/area/{month_key}/{year}/{city_code}"

    table_df = pd.read_html(central_stat_url)
    single_df = pd.concat(
        [_reshape_stat_table(table_df[0], True),
         _reshape_stat_table(table_df[1], False)],
        ignore_index=True,
    )

    single_df['City'] = city
    # Stored exactly as passed in, so callers can match on the value they used.
    single_df['Month_num'] = month
    single_df['Month'] = month_name
    single_df['Year'] = year
    single_df['Median'] = _price_to_number(single_df['Median Sales Price'])
    single_df['Average'] = _price_to_number(single_df['Average Sales Price'])

    cols = ['New Listings', 'Closed Sales', 'Days on Market Until Sale',
            'Inventory of Homes for Sale',
            'Percent of Original List Price Received']
    single_df[cols] = single_df[cols].apply(pd.to_numeric, errors='coerce')
    return single_df

In [180]:
def loadYearAndMonthData(city, city_codes_df, month, year, df):
    '''
    Add a city's data for one month to df unless it is already there.

    If df already holds a row for this city, month and year it is returned
    unchanged. Otherwise the data is fetched with loadCityData and appended.

    INPUT:
    city - string
    city_codes_df - pandas DataFrame, passed through to loadCityData
    month - month number, zero-padded string ('01'-'12') or int (1-12)
    year - year, int or string
    df - pandas DataFrame with 'City', 'Month_num' and 'Year' columns

    OUTPUT:
    df - pandas DataFrame (the same object when nothing was added)

    '''
    # Compare as text on both sides so the check works whether the values
    # were stored / passed as ints or strings; months are zero-padded so
    # 2, '2' and '02' all denote the same month.
    already_loaded = (
        (df['City'].astype(str) == str(city))
        & (df['Month_num'].astype(str).str.zfill(2) == str(month).zfill(2))
        & (df['Year'].astype(str) == str(year))
    )
    if not already_loaded.any():
        new_df = loadCityData(city, city_codes_df, month, year)
        df = pd.concat([df, new_df], ignore_index=True)
    return df

In [186]:
# Fetch Concord's figures for February 2021 and append them to df, unless df
# already holds rows for that city / month / year.
# The month is passed as '02' because loadCityData uses it as a key into
# `months`, whose keys are zero-padded strings.
df = loadYearAndMonthData('Concord', city_codes_df, '02', '2021', df)

df

Unnamed: 0,City,Townhouse-Condo Attached,Single-Family Detached,New Listings,Pending Sales,Closed Sales,Days on Market Until Sale,Median Sales Price,Average Sales Price,Percent of Original List Price Received,Inventory of Homes for Sale,Year,Month,Month_num,Median,Average
0,Concord,0,1,93,,63,13,"$740,000","$755,697",105.29,39,2021,January,1,740000.0,755697.0
1,Concord,1,0,32,,24,18,"$352,500","$363,025",101.53,20,2021,January,1,352500.0,363025.0
2,Concord,0,1,86,,77,19,"$775,000","$778,033",106.52,53,2021,February,2,775000.0,778033.0
3,Concord,1,0,29,,22,24,"$358,375","$371,830",101.97,20,2021,February,2,358375.0,371830.0


In [158]:
# Manual re-run of the duplicate test used inside loadYearAndMonthData:
# select the rows of df whose City, Month_num and Year all equal the values
# in `a`. Prints False when at least one such row exists.
a = np.array(['Concord', '01', '2021'])
matches = df[(df[['City', 'Month_num', 'Year']] == a).all(axis=1)]
print(matches.empty)

False


In [122]:
import threading

year = 2021

# NOTE(review): createCentralStatdf and central_city_codes_dict are not
# defined in the cells visible here; this cell only runs if they are defined
# elsewhere. Presumably each call stores its DataFrame in `result` - confirm.
# NOTE(review): range(1, 12) covers months 1-11 only; use range(1, 13) if
# December should be included.
# TODO: west-county cities (west_city_codes_dict, page type 'wccar') are not
# fetched yet.

# Start one download thread per (month, city) pair.
threads = []
result = {}
for month in range(1, 12):
    for city, city_code in central_city_codes_dict.items():
        worker = threading.Thread(
            target=createCentralStatdf,
            args=('page', month, year, city, city_code, result),
        )
        threads.append(worker)
        worker.start()

# Wait for every download to finish before reading `result`.
for worker in threads:
    worker.join()

# Append all results in a single concat rather than re-copying df once per
# result; the values are taken in the dict's insertion order, as before.
if result:
    df = pd.concat([df, *result.values()], ignore_index=True)

Unnamed: 0,City,Townhouse-Condo Attached,Single-Family Detached,New Listings,Pending Sales,Closed Sales,Days on Market Until Sale,Median Sales Price,Average Sales Price,Percent of Original List Price Received,Inventory of Homes for Sale,Year,Month


In [None]:
# 