In [1]:
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings('ignore')

plt.rcParams.update({'font.size': 22})
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None

In [3]:
scraped_df = pd.read_csv('raw_scraped_data.csv')
scraped_df_f = scraped_df[scraped_df['raw_history'].notna()]
len(scraped_df_f) / len(scraped_df)

0.9993744267177359

In [4]:
len(scraped_df_f)

260398

In [5]:
scraped_df.head()

Unnamed: 0,external_id,beds,baths,sqft,property_type,raw_history
0,13959615,7,3.0,-,Multi-Family (2-4 Unit),"[{'event': 'Sold (MLS)', 'date': datetime.date..."
1,13398829,9,3.0,-,Multi-Family (2-4 Unit),"[{'event': 'Price Changed', 'date': datetime.d..."
2,13236623,6,3.0,-,Multi-Family (2-4 Unit),"[{'event': 'Contingent', 'date': datetime.date..."
3,17619341,5,2.5,2880,Single Family Residential,"[{'event': 'Sold (Public Records)', 'date': da..."
4,13809758,4,2.5,1638,Single Family Residential,"[{'event': 'Sold (Public Records)', 'date': da..."


In [6]:
scraped_df_f['raw_history'] = scraped_df_f['raw_history'].apply(lambda x: eval(x))

In [7]:
listing_info_df = pd.read_csv('final_redfin_listings_info.csv')
listing_info_df['created_date'] = listing_info_df['created_date'].apply(
        lambda date: datetime.datetime.strptime(date, '%Y-%m-%d').date())

In [8]:
combined_df = listing_info_df.merge(scraped_df_f, on='external_id', how='inner')

In [9]:
len(combined_df)

256742

In [10]:
len(listing_info_df)

264139

In [11]:
len(combined_df) / len(listing_info_df)

0.9719958052389083

In [12]:
combined_df.head()

Unnamed: 0,external_id,created_date,address,city,state,zipcode,market_area,null_redfin_price_estimate,buyside_commission_range,created_month_index,price_range,year_built_range,price_difference_pct_range,avg_views_5,avg_views_10,avg_views_30,avg_views_100,avg_views_200,redfin_price_estimate,opendoor,zillow,offerpad,hoa,beds_range,baths_range,sqft_range,property_category_index,brokerage_index,brokerage_listings_pct_range,agent_name,brokerage,beds,baths,sqft,property_type,raw_history
0,13663942,2022-01-14,10 Terrace Ln,Des Plaines,IL,60019,Chicago,1,0,7,6,4,1,1888.153755,1684.743534,1518.060252,1231.980749,1049.688773,,0,0,0,0,3,2,1,0,0,8,Kim Sibley,baird & warner,4,2.0,1273,Single Family Residential,"[{'event': 'Sold (Public Records)', 'date': 20..."
1,23331416,2021-07-27,208 Meadowbrook Dr,Bolingbrook,IL,60440,Chicago,1,0,1,3,4,1,1243.320956,1097.936268,1113.5089,1262.284556,1204.498678,,0,0,0,0,2,2,3,0,0,8,Kathy McVeigh,baird & warner,3,2.0,1811,Single Family Residential,"[{'event': 'Sold (Public Records)', 'date': 20..."
2,13847488,2021-08-03,303 E Ivy Ln,Arlington Heights,IL,60004,Chicago,1,0,2,6,4,1,860.582481,943.937557,938.306499,1243.650506,1115.04655,,0,0,0,0,2,2,1,0,0,8,Shaunna Burhop,baird & warner,3,2.0,1540,Single Family Residential,"[{'event': 'Sold (Public Records)', 'date': 20..."
3,21765334,2021-11-10,202 Hemlock Ave,Romeoville,IL,60446,Chicago,1,0,5,2,4,1,896.332022,1171.125672,986.257433,896.232247,992.97973,,0,0,0,0,3,2,0,0,0,8,Lynn Hayes,baird & warner,4,2.0,1170,Single Family Residential,"[{'event': 'Sold (Public Records)', 'date': 20..."
4,13894291,2021-07-20,458 E Carpenter Dr,Palatine,IL,60074,Chicago,1,0,1,7,4,1,923.631107,1012.231276,915.941123,927.873459,803.631951,,0,0,0,0,3,3,4,0,0,8,Cindy Eich,baird & warner,4,3.5,2468,Single Family Residential,"[{'event': 'Sold (MLS)', 'date': 2021-08-26, '..."


In [13]:
def get_event_type(text):
    if any(phrase in text for phrase in {'Pending', 'Under Contract', 'Contingent'}):
        return 'pending'
    if 'Sold' in text:
        return 'sold'
    if any(phrase in text for phrase in {'Listed', 'Relisted'}):
        return 'listed'
    if 'Listing Removed' in text:
        return 'listing_removed'
    if 'Coming Soon' in text:
        return 'coming_soon'
    return None

In [14]:
def get_listing_history(row):
    redfin_events = row['raw_history']
    scraped_date = row['created_date']
    
    event_type_to_date = {}
    event_lst = []
    subtract_from_dom = 0
    sale_price = None
    for i, redfin_event in enumerate(redfin_events):
        event_type = get_event_type(redfin_event['event'])
        event_date = redfin_event['date']

        # If the listing was scraped after the sale date, it must be that this sale date 
        # is referring to a previous transaction of the given property
        if event_type == 'sold' and scraped_date > event_date:
            return ['no_sale'] * 6
        
        # similarly, if the listing was scraped more than 9 days after the listing went under contract,
        # this pending date is almost certaintly referring to a previous transaction of the given property 
        if event_type == 'pending' and scraped_date > event_date + datetime.timedelta(days=9):
            return ['no_sale'] * 6

        if event_type == 'sold':
            event_type_to_date = {}
            event_lst = []
            sale_price = redfin_event.get('price')

        if event_type in {'sold', 'pending'}:
            event_type_to_date[event_type] = event_date

        coming_soon_scenario = False
        
        # check if listing was scraped while the listing was in the coming soon phase
        try:
            for j in [1, 2]:
                next_event = redfin_events[i + j]
                next_event_type = get_event_type(next_event['event'])
                if next_event_type == 'coming_soon':
                    coming_soon_date = next_event['date']
                    if scraped_date >= coming_soon_date:
                        coming_soon_scenario = True
                        break
                        
        except IndexError:
            pass
        
        # check that the listing was scraped while the listing was in the coming soon phase 
        if event_type == 'listed' and (scraped_date >= event_date or coming_soon_scenario):
            # check that the given listing went under contract and ended up selling within 120 days of going
            # under contract
            if (all(event_type in event_type_to_date for event_type in {'sold', 'pending'}) and
                (event_type_to_date['sold'] - event_type_to_date['pending']).days < 120):
                list_date = event_date
                list_to_pending_days = (event_type_to_date['pending'] - list_date).days + 1

                removed_df = pd.DataFrame(event_lst, columns=['event', 'date'])
                
                # check if property was delisted and, if so, calculate the number of days it was delisted
                unique_events = set(removed_df['event'])
                if all(event in unique_events for event in {'listed', 'listing_removed'}):
                    removed_df = removed_df.sort_values('date').reset_index(drop=True)
                    order_of_events = ['listing_removed', 'listed']
                    
                    # only consider times when property was delisted between list date and pending date
                    removed_df = removed_df[(list_date < removed_df['date']) & 
                                            (removed_df['date'] < event_type_to_date['pending'])]
                    
                    # iterate through each time the property was delisted
                    nrow = len(removed_df)
                    if nrow % 2 == 0:
                        for i in range(int(nrow / 2)):
                            df_f = removed_df.iloc[2 * i: 2 * i + 2]
                            events = list(df_f['event'])
                            if events == order_of_events:
                                dates = list(df_f['date'])
                                subtract_from_dom += (dates[1] - dates[0]).days

                dom = list_to_pending_days - subtract_from_dom
                
                return [list_date, event_type_to_date['pending'], event_type_to_date['sold'], 
                        list_to_pending_days, dom, sale_price]
            
        elif event_type in {'listed', 'listing_removed'}:
            event_lst.append([event_type, event_date])
    
    return ['no_sale'] * 6 

In [15]:
new_cols = ['list_date', 'pending_date', 'sale_date', 'list_to_pending_days', 'dom', 'sale_price']

combined_df[new_cols] = combined_df.apply(get_listing_history, axis=1, result_type='expand')

In [16]:
combined_df.head()

Unnamed: 0,external_id,created_date,address,city,state,zipcode,market_area,null_redfin_price_estimate,buyside_commission_range,created_month_index,price_range,year_built_range,price_difference_pct_range,avg_views_5,avg_views_10,avg_views_30,avg_views_100,avg_views_200,redfin_price_estimate,opendoor,zillow,offerpad,hoa,beds_range,baths_range,sqft_range,property_category_index,brokerage_index,brokerage_listings_pct_range,agent_name,brokerage,beds,baths,sqft,property_type,raw_history,list_date,pending_date,sale_date,list_to_pending_days,dom,sale_price
0,13663942,2022-01-14,10 Terrace Ln,Des Plaines,IL,60019,Chicago,1,0,7,6,4,1,1888.153755,1684.743534,1518.060252,1231.980749,1049.688773,,0,0,0,0,3,2,1,0,0,8,Kim Sibley,baird & warner,4,2.0,1273,Single Family Residential,"[{'event': 'Sold (Public Records)', 'date': 20...",2022-01-13,2022-01-16,2022-02-28,4,4,385000
1,23331416,2021-07-27,208 Meadowbrook Dr,Bolingbrook,IL,60440,Chicago,1,0,1,3,4,1,1243.320956,1097.936268,1113.5089,1262.284556,1204.498678,,0,0,0,0,2,2,3,0,0,8,Kathy McVeigh,baird & warner,3,2.0,1811,Single Family Residential,"[{'event': 'Sold (Public Records)', 'date': 20...",2021-07-15,2021-07-18,2021-09-09,4,4,275000
2,13847488,2021-08-03,303 E Ivy Ln,Arlington Heights,IL,60004,Chicago,1,0,2,6,4,1,860.582481,943.937557,938.306499,1243.650506,1115.04655,,0,0,0,0,2,2,1,0,0,8,Shaunna Burhop,baird & warner,3,2.0,1540,Single Family Residential,"[{'event': 'Sold (Public Records)', 'date': 20...",2021-07-23,2021-10-22,2021-12-03,92,92,365000
3,21765334,2021-11-10,202 Hemlock Ave,Romeoville,IL,60446,Chicago,1,0,5,2,4,1,896.332022,1171.125672,986.257433,896.232247,992.97973,,0,0,0,0,3,2,0,0,0,8,Lynn Hayes,baird & warner,4,2.0,1170,Single Family Residential,"[{'event': 'Sold (Public Records)', 'date': 20...",2021-11-03,2021-11-09,2021-12-06,7,7,225000
4,13894291,2021-07-20,458 E Carpenter Dr,Palatine,IL,60074,Chicago,1,0,1,7,4,1,923.631107,1012.231276,915.941123,927.873459,803.631951,,0,0,0,0,3,3,4,0,0,8,Cindy Eich,baird & warner,4,3.5,2468,Single Family Residential,"[{'event': 'Sold (MLS)', 'date': 2021-08-26, '...",2021-07-09,2021-07-12,2021-08-26,4,4,470000


In [17]:
def get_days_until_sale(row):
    
    scraped_date = row['created_date']
    redfin_events = row['raw_history']
    
    was_listed = False
    
    for redfin_event in redfin_events:
        event_type = get_event_type(redfin_event['event'])
        event_date = redfin_event['date']
        
        # check that the listing was scraped within 30 days of the property being listed
        if event_type == 'listed' and abs((event_date - scraped_date).days) < 30:
            was_listed = True
            break
    
    # if the property was never listed, we exclude it from the analysis
    if not was_listed:
        return None
    
    sold_date = None
    
    for redfin_event in redfin_events:
        event_type = get_event_type(redfin_event['event'])
        event_date = redfin_event['date']

        if event_type == 'sold':
            sold_date = event_date
            
            # If the listing was scarped after the sale date, it must be that this sale date 
            # is referring to a previous transaction of the given property
            if scraped_date > sold_date:
                return 'no_sale'
            
        if event_type == 'listed' and scraped_date >= event_date:
            if sold_date is None:
                return 'no_sale'
            return (sold_date - event_date).days
    
    return 'no_sale'

In [18]:
combined_df['days_until_sale'] = combined_df.apply(get_days_until_sale, axis=1)

In [19]:
days_to_sell_threshold = 120

def did_home_sell(days_until_sale):
    if pd.isnull(days_until_sale):
        return None
    if days_until_sale == 'no_sale':
        return 0
    return int(days_until_sale < days_to_sell_threshold)

In [20]:
combined_df['home_sold'] = combined_df['days_until_sale'].apply(did_home_sell)

In [21]:
combined_df['home_sold'].value_counts(dropna=False) / len(combined_df)

1.0    0.701338
0.0    0.229308
NaN    0.069354
Name: home_sold, dtype: float64

In [22]:
final_df = combined_df.drop(columns=['raw_history'])
final_df.shape

(256742, 43)

In [22]:
final_df.to_csv('sale_outcome_info.csv', index=False)

## Price changes among iBuyers

In [24]:
combined_df.head()

Unnamed: 0,external_id,created_date,address,city,state,zipcode,market_area,null_redfin_price_estimate,buyside_commission_range,created_month_index,price_range,year_built_range,price_difference_pct_range,avg_views_5,avg_views_10,avg_views_30,avg_views_100,avg_views_200,redfin_price_estimate,opendoor,zillow,offerpad,hoa,beds_range,baths_range,sqft_range,property_category_index,brokerage_index,brokerage_listings_pct_range,agent_name,brokerage,beds,baths,sqft,property_type,raw_history,list_date,pending_date,sale_date,list_to_pending_days,dom,sale_price,days_until_sale,home_sold
0,13663942,2022-01-14,10 Terrace Ln,Des Plaines,IL,60019,Chicago,1,0,7,6,4,1,1888.153755,1684.743534,1518.060252,1231.980749,1049.688773,,0,0,0,0,3,2,1,0,0,8,Kim Sibley,baird & warner,4,2.0,1273,Single Family Residential,"[{'event': 'Sold (Public Records)', 'date': 20...",2022-01-13,2022-01-16,2022-02-28,4,4,385000,46,1.0
1,23331416,2021-07-27,208 Meadowbrook Dr,Bolingbrook,IL,60440,Chicago,1,0,1,3,4,1,1243.320956,1097.936268,1113.5089,1262.284556,1204.498678,,0,0,0,0,2,2,3,0,0,8,Kathy McVeigh,baird & warner,3,2.0,1811,Single Family Residential,"[{'event': 'Sold (Public Records)', 'date': 20...",2021-07-15,2021-07-18,2021-09-09,4,4,275000,56,1.0
2,13847488,2021-08-03,303 E Ivy Ln,Arlington Heights,IL,60004,Chicago,1,0,2,6,4,1,860.582481,943.937557,938.306499,1243.650506,1115.04655,,0,0,0,0,2,2,1,0,0,8,Shaunna Burhop,baird & warner,3,2.0,1540,Single Family Residential,"[{'event': 'Sold (Public Records)', 'date': 20...",2021-07-23,2021-10-22,2021-12-03,92,92,365000,133,0.0
3,21765334,2021-11-10,202 Hemlock Ave,Romeoville,IL,60446,Chicago,1,0,5,2,4,1,896.332022,1171.125672,986.257433,896.232247,992.97973,,0,0,0,0,3,2,0,0,0,8,Lynn Hayes,baird & warner,4,2.0,1170,Single Family Residential,"[{'event': 'Sold (Public Records)', 'date': 20...",2021-11-03,2021-11-09,2021-12-06,7,7,225000,33,1.0
4,13894291,2021-07-20,458 E Carpenter Dr,Palatine,IL,60074,Chicago,1,0,1,7,4,1,923.631107,1012.231276,915.941123,927.873459,803.631951,,0,0,0,0,3,3,4,0,0,8,Cindy Eich,baird & warner,4,3.5,2468,Single Family Residential,"[{'event': 'Sold (MLS)', 'date': 2021-08-26, '...",2021-07-09,2021-07-12,2021-08-26,4,4,470000,48,1.0


In [25]:
def argsort(seq):
    return sorted(range(len(seq)), key=seq.__getitem__)

def sort_raw_history(raw_history):
    dates = [event['date'] for event in raw_history]
    argsorted = argsort(dates)
    sorted_history = [raw_history[i] for i in argsorted]
    return sorted_history

In [26]:
combined_df['sorted_raw_history'] = combined_df['raw_history'].apply(sort_raw_history)

In [27]:
combined_df[combined_df['address'] == '1411 S Estate Ln']

Unnamed: 0,external_id,created_date,address,city,state,zipcode,market_area,null_redfin_price_estimate,buyside_commission_range,created_month_index,price_range,year_built_range,price_difference_pct_range,avg_views_5,avg_views_10,avg_views_30,avg_views_100,avg_views_200,redfin_price_estimate,opendoor,zillow,offerpad,hoa,beds_range,baths_range,sqft_range,property_category_index,brokerage_index,brokerage_listings_pct_range,agent_name,brokerage,beds,baths,sqft,property_type,raw_history,list_date,pending_date,sale_date,list_to_pending_days,dom,sale_price,days_until_sale,home_sold,sorted_raw_history
211,17606761,2021-07-29,1411 S Estate Ln,Lake Forest,IL,60045,Chicago,1,0,1,8,2,1,1026.25277,1024.553551,1221.900229,1391.006129,1356.197275,,0,0,0,0,2,3,4,0,0,8,Bob Krombach,baird & warner,3,3,2579,Single Family Residential,"[{'event': 'Sold (Public Records)', 'date': 20...",no_sale,no_sale,no_sale,no_sale,no_sale,no_sale,197,0.0,"[{'event': 'Listed', 'date': 2021-05-07, 'pric..."


In [28]:
lst = []
for i, row in combined_df.iterrows():
    scraped_date = row['created_date']
    pending_date = row['pending_date']
    redfin_events = row['sorted_raw_history']
    
    list_date = None
    
    for redfin_event in redfin_events:
        event_type = get_event_type(redfin_event['event'])
        event_date = redfin_event['date']

        # check that the listing was scraped within 30 days of the property being listed
        if event_type == 'listed' and abs((event_date - scraped_date).days) < 30:
            list_date = event_date
            current_price = redfin_event['price']
            break

    if list_date is None:
        continue
    
    end_date = pending_date if pending_date != 'no_sale' else list_date + datetime.timedelta(days=120)
    
    previous_redfin_event = ''
    for redfin_event in redfin_events:
        if str(redfin_event) == str(previous_redfin_event):
            continue
        if redfin_event['event'] == 'Price Changed' and list_date <= redfin_event['date'] <= end_date:
            new_price = redfin_event['price']
            if current_price and new_price: 
                if new_price < current_price:
                    lst.append('price reduction')
                elif new_price > current_price:
                    lst.append('price increase')
                else:
                    lst.append('same price')
            current_price = new_price
        previous_redfin_event = redfin_event

In [29]:
pd.Series(lst).value_counts() / len(lst)

price reduction    0.886126
price increase     0.092834
same price         0.021040
dtype: float64

In [30]:
def get_dom_of_price_drops(row):
    scraped_date = row['created_date']
    pending_date = row['pending_date']
    redfin_events = row['sorted_raw_history']
    
    list_date = None
    
    for redfin_event in redfin_events:
        event_type = get_event_type(redfin_event['event'])
        event_date = redfin_event['date']

        # check that the listing was scraped within 30 days of the property being listed
        if event_type == 'listed' and abs((event_date - scraped_date).days) < 30:
            # print(redfin_event)
            list_date = event_date
            current_price = redfin_event['price']
            break
    
    if list_date is None:
        return None
    
    end_date = pending_date if pending_date != 'no_sale' else list_date + datetime.timedelta(days=120)
    
    price_drop_doms = []
    previous_redfin_event = ''
    for redfin_event in redfin_events:
        if str(redfin_event) == str(previous_redfin_event):
            continue
        previous_redfin_event = redfin_event
        
        if redfin_event['event'] == 'Price Changed' and list_date <= redfin_event['date'] <= end_date:
            new_price = redfin_event['price']
            if current_price and new_price and new_price >= current_price:
                    continue
            price_drop_date = redfin_event['date']
            dom = (price_drop_date - list_date).days
            price_drop_doms.append(dom)
            current_price = new_price
    
    return price_drop_doms

In [31]:
combined_df['price_drops'] = combined_df.apply(get_dom_of_price_drops, axis=1)

In [32]:
combined_df['n_price_drops'] = combined_df['price_drops'].apply(lambda lst: None if lst is None else len(lst))

In [33]:
listed_df = combined_df[combined_df['price_drops'].notna()]

In [34]:
listed_df['n_price_drops'].value_counts(dropna=False) / len(listed_df)

0.0     0.728982
1.0     0.173486
2.0     0.059798
3.0     0.022052
4.0     0.008663
5.0     0.003595
6.0     0.001532
7.0     0.000753
8.0     0.000452
9.0     0.000230
10.0    0.000155
11.0    0.000100
14.0    0.000046
13.0    0.000042
12.0    0.000033
15.0    0.000025
20.0    0.000013
30.0    0.000008
16.0    0.000008
29.0    0.000004
54.0    0.000004
28.0    0.000004
18.0    0.000004
21.0    0.000004
22.0    0.000004
Name: n_price_drops, dtype: float64

In [35]:
listed_df[(listed_df['opendoor'] == 1) & (listed_df['n_price_drops'] == 5)].sample(5, random_state=1)

Unnamed: 0,external_id,created_date,address,city,state,zipcode,market_area,null_redfin_price_estimate,buyside_commission_range,created_month_index,price_range,year_built_range,price_difference_pct_range,avg_views_5,avg_views_10,avg_views_30,avg_views_100,avg_views_200,redfin_price_estimate,opendoor,zillow,offerpad,hoa,beds_range,baths_range,sqft_range,property_category_index,brokerage_index,brokerage_listings_pct_range,agent_name,brokerage,beds,baths,sqft,property_type,raw_history,list_date,pending_date,sale_date,list_to_pending_days,dom,sale_price,days_until_sale,home_sold,sorted_raw_history,price_drops,n_price_drops
74545,30283284,2022-02-01,16218 Hickory Knoll Dr,Houston,TX,77059,Houston,0,1,8,3,2,5,280.337386,255.270654,261.396764,273.893226,259.596343,286975.0,1,0,0,0,2,1,0,0,0,9,Feras Rachid,"opendoor brokerage, llc",3.0,2.0,1504,Single-Family (Co-op),"[{'event': 'Sold (Public Records)', 'date': 20...",no_sale,no_sale,no_sale,no_sale,no_sale,no_sale,480,0.0,"[{'event': 'Sold (Public Records)', 'date': 19...","[45, 59, 73, 87, 101]",5.0
74679,30171294,2021-12-21,5515 Braesvalley Dr,Houston,TX,77096,Houston,0,1,6,6,1,4,139.40833,128.342857,206.53515,171.685758,158.295864,401791.0,1,0,0,0,3,2,2,0,0,9,Feras Rachid,"opendoor brokerage, llc",4.0,2.5,2136,Single-Family (Co-op),"[{'event': 'Sold (MLS)', 'date': 2022-06-27, '...",2021-12-13,2022-05-15,2022-06-27,154,154,,196,0.0,"[{'event': 'Listed', 'date': 2018-11-29, 'pric...","[67, 94, 108, 122, 143]",5.0
23316,3709976,2021-07-29,21018 Lull St,Canoga Park,CA,91305,Los Angeles,0,2,1,5,3,2,1433.0843,1107.499102,1266.937495,1089.458567,1239.364318,871901.0,1,0,0,0,2,2,2,0,0,6,Ben Braksick,opendoor brokerage inc,3.0,2.0,1500,"Single Family, Residential Single-Family","[{'event': 'Sold (MLS)', 'date': 2022-03-03, '...",2021-07-20,2022-01-11,2022-03-03,176,176,773000,226,0.0,"[{'event': 'Sold (Public Records)', 'date': 19...","[16, 30, 59, 86, 107]",5.0
122007,34916830,2021-07-07,705 W Nassau Way,Englewood,CO,80110,Denver,0,1,1,6,2,7,456.711004,474.989685,493.449375,453.270906,590.17904,688772.0,1,0,0,0,3,3,3,0,0,7,Feras Rachid,opendoor brokerage llc,4.0,3.0,2918,Single Family Residential,"[{'event': 'Sold (Public Records)', 'date': 20...",no_sale,no_sale,no_sale,no_sale,no_sale,no_sale,666,0.0,"[{'event': 'Sold (Public Records)', 'date': 19...","[18, 32, 47, 74, 88]",5.0
124921,27969562,2021-07-13,812 W ORANGEWOOD Ave,Phoenix,AZ,85021,Phoenix,1,2,1,6,0,1,608.235991,579.506663,502.117773,453.266518,456.648907,,1,0,0,0,2,2,1,0,0,8,Jacqueline Moore,"opendoor brokerage, llc",3.0,2.0,1500,Single Family Residential,"[{'event': 'Sold (MLS)', 'date': 2021-11-17, '...",2021-07-03,2021-10-24,2021-11-17,114,114,562000,137,0.0,"[{'event': 'Sold (Public Records)', 'date': 19...","[19, 27, 51, 68, 82]",5.0


In [36]:
for ibuyer in ['opendoor', 'zillow', 'offerpad']:
    print(ibuyer)
    ibuyer_df = listed_df[listed_df[ibuyer] == 1]
    display(ibuyer_df['n_price_drops'].value_counts(dropna=False).iloc[:8] / len(ibuyer_df))
    at_least_1_df = ibuyer_df[ibuyer_df['n_price_drops'] > 0]
    print('expected number of price drops:', ibuyer_df['n_price_drops'].mean())
    print('average number of price drops for listings with at least one drop:', 
          at_least_1_df['n_price_drops'].mean())
    print('-------')

opendoor


0.0    0.592396
1.0    0.189265
2.0    0.104557
3.0    0.055633
4.0    0.027677
5.0    0.016774
6.0    0.006150
7.0    0.003355
Name: n_price_drops, dtype: float64

expected number of price drops: 0.8590998043052838
average number of price drops for listings with at least one drop: 2.107681755829904
-------
zillow


0.0    0.563459
1.0    0.191074
2.0    0.075314
4.0    0.059972
3.0    0.055788
5.0    0.023710
6.0    0.013947
8.0    0.009763
Name: n_price_drops, dtype: float64

expected number of price drops: 1.085076708507671
average number of price drops for listings with at least one drop: 2.4856230031948883
-------
offerpad


0.0    0.695721
1.0    0.128368
2.0    0.080824
3.0    0.049128
4.0    0.017433
5.0    0.015848
9.0    0.003170
7.0    0.003170
Name: n_price_drops, dtype: float64

expected number of price drops: 0.6893819334389857
average number of price drops for listings with at least one drop: 2.265625
-------


In [37]:
non_ibuyer_df = listed_df[(listed_df['opendoor'] == 0) & 
                          (listed_df['zillow'] == 0) &
                          (listed_df['offerpad'] == 0)]

In [38]:
non_ibuyer_df['n_price_drops'].value_counts(dropna=False).iloc[:5] / len(non_ibuyer_df)

0.0    0.731666
1.0    0.173312
2.0    0.059010
3.0    0.021362
4.0    0.008192
Name: n_price_drops, dtype: float64

In [39]:
at_least_1_df = non_ibuyer_df[non_ibuyer_df['n_price_drops'] > 0]
print(at_least_1_df['n_price_drops'].mean())

1.5983788001847339


In [40]:
print('expected number of price drops:', non_ibuyer_df['n_price_drops'].mean())

expected number of price drops: 0.42889864151685175


## Price changes by commission rate category

In [41]:
for commission_range in range(4):
    print(f'commission rate category:commission_range)
    comm_category_df = listed_df[listed_df['buyside_commission_range'] == commission_range]
    display(comm_category_df['n_price_drops'].value_counts(dropna=False).iloc[:8] / len(comm_category_df))
    at_least_1_df = comm_category_df[comm_category_df['n_price_drops'] > 0]
    print('expected number of price drops:', comm_category_df['n_price_drops'].mean())
    print('average number of price drops for listings with at least one drop:', 
          at_least_1_df['n_price_drops'].mean())
    print('-------')

0


0.0    0.733405
1.0    0.173368
2.0    0.058610
3.0    0.020829
4.0    0.007675
5.0    0.003129
6.0    0.001347
7.0    0.000649
Name: n_price_drops, dtype: float64

expected number of price drops: 0.42223819076419455
average number of price drops for listings with at least one drop: 1.5838200059898173
-------
1


0.0    0.689714
1.0    0.176137
2.0    0.069709
3.0    0.032180
4.0    0.017508
5.0    0.008349
6.0    0.002999
7.0    0.001216
Name: n_price_drops, dtype: float64

expected number of price drops: 0.5726675853124746
average number of price drops for listings with at least one drop: 1.8456112852664577
-------
2


0.0    0.711144
1.0    0.174033
2.0    0.064919
3.0    0.026771
4.0    0.012476
5.0    0.005033
6.0    0.002368
7.0    0.001396
Name: n_price_drops, dtype: float64

expected number of price drops: 0.5019242968915204
average number of price drops for listings with at least one drop: 1.7376281112737921
-------
3


0.0    0.736719
1.0    0.164844
2.0    0.057813
3.0    0.025391
4.0    0.008203
5.0    0.003906
6.0    0.001172
7.0    0.000781
Name: n_price_drops, dtype: float64

expected number of price drops: 0.43203125
average number of price drops for listings with at least one drop: 1.6409495548961424
-------


## Time from contract to close

In [56]:
sold_df = final_df[final_df['sale_date'] != 'no_sale']

In [60]:
sold_df['days_in_escrow'] = (sold_df['sale_date'] - sold_df['pending_date']).dt.days

In [64]:
days_in_escrow_metrics_df = sold_df.groupby('buyside_commission_range').agg(
    {'days_in_escrow': ['mean', 'median']}).reset_index()

In [66]:
days_in_escrow_metrics_df

Unnamed: 0_level_0,buyside_commission_range,days_in_escrow,days_in_escrow
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median
0,0,39.946415,35.0
1,1,37.92411,33.0
2,2,40.70093,35.0
3,3,39.67583,34.0
