https://github.com/ual/rental-listings

This notebook analyzes the rental listings data set, computes several descriptive statistics, and visualizes market characteristics

## Initial setup

In [None]:
# import necessary modules and display matplotlib plots inline within the ipython notebook webpage
import pandas as pd, numpy as np, statsmodels.api as sm
import matplotlib.pyplot as plt, matplotlib.cm as cm, matplotlib.font_manager as fm
from scipy.stats import pearsonr, ttest_rel
%matplotlib inline

In [None]:
def get_colors(cmap, n, start=0., stop=1., alpha=1., reverse=False):
    '''Return an n-length list of (r, g, b, alpha) tuples sampled from a named colormap.

    cmap: name of a matplotlib colormap (anything plt.get_cmap accepts)
    n: number of colors to return
    start, stop: inclusive endpoints of the colormap interval, sampled at n evenly spaced points
    alpha: opacity written into every returned color, replacing the colormap's own alpha
    reverse: if True, return the colors in reverse order
    '''
    # look the colormap up once instead of once per sample; plt.get_cmap is used because
    # matplotlib.cm.get_cmap was deprecated in matplotlib 3.7 and removed in 3.9
    colormap = plt.get_cmap(cmap)
    samples = (colormap(x) for x in np.linspace(start, stop, n))
    colors = [(r, g, b, alpha) for r, g, b, _ in samples]
    return list(reversed(colors)) if reverse else colors

In [None]:
# define the font styles: every text element shares one typeface and style,
# only the point size differs between titles, axis labels and tick labels
family = 'Arial'
_font_style = dict(family=family, style='normal', weight='normal', stretch='normal')
title_font = fm.FontProperties(size=18, **_font_style)
label_font = fm.FontProperties(size=16, **_font_style)
ticks_font = fm.FontProperties(size=14, **_font_style)

In [None]:
# function to save images consistently
save_dpi = [96, 300]
def save_fig(fig, title, tight=True):
    '''Save a figure once per resolution listed in save_dpi, under images/dpi_<dpi>/.

    fig: the figure to save (its tight_layout and savefig methods are called)
    title: file name, including extension, to write inside each dpi folder
    tight: if True, call fig.tight_layout() before saving
    '''
    import os  # local import keeps this cell usable without touching the imports cell

    if tight:
        fig.tight_layout()
    for dpi in save_dpi:
        save_folder = 'images/dpi_{}/'.format(dpi)
        # savefig does not create missing directories, so make the folder on first use
        if not os.path.isdir(save_folder):
            os.makedirs(save_folder)
        fig.savefig(save_folder + title, dpi=dpi)

In [None]:
# load the 2014 census data set of MSAs and index it by region name
census = pd.read_csv('processed-data/census_pop_income.csv')

# both numeric columns are read as text containing commas; strip the commas and cast to int
for numeric_col in ['2014_median_income', '2014_pop_est']:
    census[numeric_col] = census[numeric_col].str.replace(',','').astype(int)

census = census.drop(labels='notes', axis=1).set_index('region')
census.head()

These are regions that either are 1) one of the 50 most populous MSAs or 2) among the top 50 in total listings posted. We used the San Jose-San Francisco-Oakland, CA CSA to accurately represent the region covered by the San Francisco Bay Area and we combined the separate regions for Los Angeles and Orange County into one to accurately represent the area covered by the census bureau’s Los Angeles-Long Beach-Anaheim, CA MSA. Lastly, inlandempire corresponds to Riverside-San Bernardino MSA.

In [None]:
# the 15 most populous metros, ranked by the census bureau's 2014 population estimates
pop_est_descending = census['2014_pop_est'].sort_values(ascending=False)
most_populous_regions = pop_est_descending.head(15)

In [None]:
# display names for the region keys that are used as chart labels in this notebook
regions_full_names = {
    'newyork': 'New York', 'northdakota': 'North Dakota', 'sfbay': 'SF Bay Area',
    'boston': 'Boston', 'santabarbara': 'Santa Barbara', 'honolulu': 'Honolulu',
    'newjersey': 'New Jersey', 'losangeles': 'Los Angeles', 'orangecounty': 'Orange County',
    'washingtondc': 'Washington DC', 'ventura': 'Ventura', 'longisland': 'Long Island',
    'floridakeys': 'Florida Keys', 'sandiego': 'San Diego', 'juneau': 'Juneau',
    'philadelphia': 'Philadelphia', 'chicago': 'Chicago', 'seattle': 'Seattle',
    'miami': 'Miami', 'inlandempire': 'Inland Empire', 'dallas': 'Dallas',
    'houston': 'Houston', 'phoenix': 'Phoenix', 'detroit': 'Detroit',
    'atlanta': 'Atlanta',
}

## Load the full combined data set of rental listings

In [None]:
# function to convert string to float and handle empty string as NaN
def to_float(string_value):
    '''Convert a text cell to a float.

    string_value: the raw string; surrounding whitespace is ignored
    returns: the parsed float, or NaN when the string is empty or only whitespace
    raises: ValueError if the non-empty text is not a valid number
    '''
    string_value = string_value.strip()
    # builtin float replaces the np.float alias, which was removed in NumPy 1.24
    return float(string_value) if string_value else np.nan

In [None]:
# read the raw combined listings and drop the row labelled 4153401 in a single chain
remove_bad_row = pd.read_csv('processed-data/usa.csv').drop(labels=4153401)


In [None]:
# write the cleaned rows back out; index=False stops the old row labels from being saved
# as an extra unnamed column that would reappear when the file is read back in
remove_bad_row.to_csv('processed-data/usa-fixed.csv', index=False)

In [None]:
# load the full, combined data set: text columns are kept as strings, numeric columns
# are parsed with to_float so that empty cells become NaN
text_columns = ['neighborhood', 'title', 'pid', 'date', 'link', 'sourcepage']
numeric_columns = ['price', 'bedrooms', 'sqft', 'longitude', 'latitude']
converters = dict.fromkeys(text_columns, str)
converters.update(dict.fromkeys(numeric_columns, to_float))

all_listings = pd.read_csv('processed-data/usa-fixed.csv', converters=converters)

# if not using the fixed csv file, you must drop this row that has a url in its date column and messes up the processing
#all_listings = all_listings.drop(labels=4153401)

In [None]:
# the 'price' column holds the listing's asking rent; call it 'rent' from here on
all_listings = all_listings.rename(columns={'price':'rent'})

In [None]:
# number of rows in the full data set (includes dupes/re-posts)
# note: count() tallies non-null pid values only
all_listings['pid'].count()

There are nearly 11 million listings in the full data set

In [None]:
# calculate rent/sqft for every row, then preview the three columns side by side
# (rows missing either rent or sqft get NaN)
all_listings['rent_sqft'] = all_listings['rent'] / all_listings['sqft']
all_listings[['rent','sqft','rent_sqft']].head()

## Next, parse dates from the date column and visualize the full (may contain re-posts) data set by date

In [None]:
# convert the date column from yyyy-mm-dd strings to pandas datetimes
# (with an explicit format, a value that does not match raises an error)
all_listings['date'] = pd.to_datetime(all_listings['date'], format='%Y-%m-%d')

In [None]:
# create ticks and tick labels for the time series
# count listings per posting date, in chronological order
counts_by_date = all_listings['date'].value_counts().sort_index()

# store as a two-column frame with a 0..n-1 row index: 'index' holds the posting date and
# 'date' holds that day's count. The columns are named explicitly instead of relying on
# reset_index(), because pandas 2.0 changed the names value_counts gives its result
listings_per_date = pd.DataFrame({'index': counts_by_date.index,
                                  'date': counts_by_date.values},
                                 columns=['index', 'date'])

# one tick on every 7th row, labelled with the yyyy-mm-dd part of that row's timestamp
xticks = listings_per_date.index[::7]
xtick_labels = [str(x).split()[0] for x in listings_per_date.loc[xticks, 'index']]

In [None]:
# plot the total number of listings (includes dupes/re-posts) posted on each day in the data set
# y='date' selects the per-day count column explicitly, so the timestamp column ('index')
# can never be drawn as a second line; the x axis is the frame's row number
ax = listings_per_date.plot(kind='line', y='date', figsize=[10, 6], ylim=[0,300000], linewidth=3,
                            marker='o', markeredgewidth=0, alpha=0.7, color='#003399')
ax.grid(True)
ax.set_title('Total rental listings posted per day', fontproperties=title_font)
ax.set_ylabel('Number of listings posted', fontproperties=label_font)
ax.legend_.remove()

# label every 7th row with its date
ax.set_xticks(xticks)
ax.set_xticklabels(xtick_labels, rotation=40, rotation_mode='anchor', ha='right', fontproperties=ticks_font)
for label in ax.get_yticklabels():
    label.set_fontproperties(ticks_font)

save_fig(ax.get_figure(), 'date_count_listings_posted.png')
plt.show()

This data set covers mid May through mid July, 2014. The x-axis labels represent Sundays.

In [None]:
# calculate day of the week each listing was posted (Monday=0 ... Sunday=6)
# the vectorized .dt accessor avoids one Python-level call per row
all_listings['day_of_week'] = all_listings['date'].dt.weekday

## Next, de-duplicate the data set based on the 'pid' column and examine the unique vs duplicate listings

In [None]:
# first extract the subdomain/region from the link column
# (raw string with escaped dots, so '.' only matches the literal dots of the host name)
all_listings['region'] = all_listings['link'].str.extract(r'http://(.*)\.craigslist\.org', expand=False)

In [None]:
# de-dupe on pid (drop_duplicates keeps the first row of each pid by default)
# and hold the result in a new dataframe of unique listings
first_postings = all_listings.drop_duplicates(subset='pid')
unique_listings = pd.DataFrame(first_postings)
len(unique_listings)

In [None]:
# the duplicate listings are every row of all_listings that was not kept in unique_listings
kept_as_unique = all_listings.index.isin(unique_listings.index)
duplicate_listings = all_listings[~kept_as_unique]
len(duplicate_listings)

In [None]:
# show the top 5 PIDs with the most duplicates
# (value_counts sorts descending, so the most re-posted pid comes first)
most_dupe_pids = duplicate_listings['pid'].value_counts()
most_dupe_pids.head()

In [None]:
# examine the listings for the PID with the most duplicates - it is in brooklyn
cols = ['region', 'pid', 'neighborhood', 'rent', 'bedrooms', 'date']
has_top_pid = all_listings['pid'] == most_dupe_pids.index[0]
all_listings.loc[has_top_pid, cols]

It looks like the listing's creator is re-publishing it every couple of days. This seems to maintain the same pid.

In [None]:
# calculate the ratios of unique to duplicate listings for each region
listings_ratios = pd.DataFrame()

# number of total listings for each region
listings_ratios['all_listings'] = all_listings['region'].value_counts()

# number of re-posts for each region (ie, rows that were not kept in unique_listings)
# a region without any re-posts is absent from these counts, so record it as 0 rather than NaN
listings_ratios['duplicate_listings'] = duplicate_listings['region'].value_counts()
listings_ratios['duplicate_listings'] = listings_ratios['duplicate_listings'].fillna(0).astype(int)

# number of unique listings for each region (ie, one row per distinct pid)
listings_ratios['unique_listings'] = unique_listings['region'].value_counts()

# proportion of this region's listings that are duplicates
listings_ratios['duplicate_ratio'] = listings_ratios['duplicate_listings'] / listings_ratios['all_listings']

# proportion of this region's listings that are unique (ie, not duplicates)
listings_ratios['unique_ratio'] = listings_ratios['unique_listings'] / listings_ratios['all_listings']

listings_ratios.head()

In [None]:
# plot the counts of unique and duplicate listings for the 20 regions with the most listings
ranked_by_total = listings_ratios.sort_values(by='all_listings', ascending=False)
countdata = ranked_by_total[['unique_listings', 'duplicate_listings']].head(20)
countdata.columns = ['Unique Listings', 'Duplicate Listings']

bar_style = dict(stacked=True, figsize=[9, 6], width=0.6, alpha=0.5,
                 color=['b','m'], edgecolor='k', grid=False)
ax = countdata.plot(kind='bar', **bar_style)

# horizontal grid lines only, one tick per region, shared tick font on both axes
ax.yaxis.grid(True)
ax.set_xticks(range(len(countdata)))
ax.set_xticklabels(countdata.index, rotation=40, rotation_mode='anchor', ha='right', fontproperties=ticks_font)
for tick_label in ax.get_yticklabels():
    tick_label.set_fontproperties(ticks_font)

ax.set_title('Unique and duplicate rental listings, by region', fontproperties=title_font)
ax.set_xlabel('', fontproperties=label_font)
ax.set_ylabel('Total number of listings', fontproperties=label_font)

save_fig(plt.gcf(), 'count_unique_duplicate_listings.png')
plt.show()

In [None]:
# which regions have the largest ratio of duplicate listings
# (sorting ascending by unique_ratio puts the lowest share of unique listings first)
listings_ratios = listings_ratios.sort_values(by='unique_ratio')
listings_ratios.head(10)

## Next, filter the data set by retaining only those rows that contain rent and sqft data

In [None]:
# thorough listings are unique listings with rent and sqft data
# (a missing value fails the > 0 comparison, so rows lacking either field drop out)
has_rent = unique_listings['rent'] > 0
has_sqft = unique_listings['sqft'] > 0
thorough_listings = pd.DataFrame(unique_listings)[has_rent & has_sqft]

In [None]:
# for comparison, what are the counts of the different sets?
set_sizes = [('Count of all listings:', all_listings),
             ('Count of unique listings:', unique_listings),
             ('Count of thorough listings:', thorough_listings)]
for caption, listings in set_sizes:
    print(caption, len(listings))

All listings are everything we collected. 

Unique listings are those with a unique pid (ie, re-posts are not counted). 

Thorough listings are unique listings with rent and sq foot data.

In [None]:
# what is the median rent per sqft across the set of thorough listings
# (the median is used because it is not pulled around by extreme values)
thorough_listings['rent_sqft'].median()

In [None]:
# for comparison, the median rent/sqft for the entire, original data set is very similar
# (median() skips rows whose rent_sqft is NaN)
all_listings['rent_sqft'].median()

In [None]:
# describe the rent-per-sqft vector: count, mean, std, min, quartiles and max
thorough_listings['rent_sqft'].describe()

You can see the mean is pulled up by ridiculously high outliers (like $214 million per sq ft). There are also some ridiculously low outliers. So, let's filter out outliers that fall outside of a reasonable range.

## Filter data set, retaining listings that have reasonable values for rent, sqft, and rent/sqft
Define reasonable by the 0.2 and 99.8 percentiles for rent, sqft, and rent/sqft.

In [None]:
# in this cell, define the values by which we will filter the 3 columns
upper_percentile = 0.998
lower_percentile = 0.002

# positions, within a column sorted ascending, of the values sitting at the two percentiles
n_thorough = len(thorough_listings)
upper = int(n_thorough * upper_percentile)
lower = int(n_thorough * lower_percentile)

# sort each of the three columns ascending
rent_sqft_sorted = thorough_listings['rent_sqft'].sort_values(ascending=True)
rent_sorted = thorough_listings['rent'].sort_values(ascending=True)
sqft_sorted = thorough_listings['sqft'].sort_values(ascending=True)

# read the cut-off values off each sorted column at those two positions
lower_rent_sqft, upper_rent_sqft = rent_sqft_sorted.iloc[lower], rent_sqft_sorted.iloc[upper]
lower_rent, upper_rent = rent_sorted.iloc[lower], rent_sorted.iloc[upper]
lower_sqft, upper_sqft = sqft_sorted.iloc[lower], sqft_sorted.iloc[upper]

valid_ranges = [('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft]),
                ('valid rent range:', [lower_rent, upper_rent]),
                ('valid sqft range:', [lower_sqft, upper_sqft])]
for caption, bounds in valid_ranges:
    print(caption, bounds)

In [None]:
def _strictly_between(values, low, high):
    '''Boolean mask of the entries lying strictly inside the open interval (low, high).'''
    return (values > low) & (values < high)

# one mask per column: rows whose rent_sqft, rent or sqft sits exactly on or outside a
# cut-off value are treated as unreasonable
rent_sqft_mask = _strictly_between(thorough_listings['rent_sqft'], lower_rent_sqft, upper_rent_sqft)
rent_mask = _strictly_between(thorough_listings['rent'], lower_rent, upper_rent)
sqft_mask = _strictly_between(thorough_listings['sqft'], lower_sqft, upper_sqft)

# a listing is kept only when it passes all three masks
passes_all = rent_sqft_mask & rent_mask & sqft_mask
filtered_listings = pd.DataFrame(thorough_listings[passes_all])
len(filtered_listings)

In [None]:
# how many 'unreasonable' listings did we filter out, as a count and as a share of the thorough set?
n_before = len(thorough_listings)
count_removed = n_before - len(filtered_listings)
share_removed = count_removed / float(n_before)
print(count_removed)
print(share_removed)

In [None]:
# save three regional subsets for tract-level analysis, one csv per region
for subset_region in ['sfbay', 'seattle', 'newyork']:
    in_region = filtered_listings['region']==subset_region
    csv_name = '{}-filtered-listings.csv'.format(subset_region)
    filtered_listings[in_region].to_csv(csv_name, index=False, encoding='utf-8')

So, we filtered out 23,601 or 0.79% of the thorough listings. Although we discarded only values below the 0.2nd percentile or above the 99.8th percentile, we did so on three separate variables (rent, sqft, rent/sqft), so the total share of rows discarded was higher than 0.4%.

In [None]:
# plot the number of filtered rental listings for the 30 regions that have the most of them
countdata = filtered_listings['region'].value_counts().head(30)
bar_style = dict(figsize=[10, 6], width=0.6, alpha=0.5, color='g', edgecolor='k', grid=False)
ax = countdata.plot(kind='bar', **bar_style)

# horizontal grid lines only, one tick per region, shared tick font on both axes
ax.yaxis.grid(True)
ax.set_xticks(range(len(countdata)))
ax.set_xticklabels(countdata.index, rotation=40, rotation_mode='anchor', ha='right', fontproperties=ticks_font)
for tick_label in ax.get_yticklabels():
    tick_label.set_fontproperties(ticks_font)

ax.set_title('The most rental listings (filtered), by region', fontproperties=title_font)
ax.set_xlabel('', fontproperties=label_font)
ax.set_ylabel('Number of listings per region', fontproperties=label_font)

save_fig(plt.gcf(), 'count_most_listings_filtered.png')
plt.show()

## Let's look at a couple of measures of statistical dispersion to see how the thorough listings differ from this filtered set of reasonable listings

In [None]:
# calculate the interquartile range of rent/sqft, first for the thorough listings
# and then for the filtered listings
for listings in [thorough_listings, filtered_listings]:
    summary = listings['rent_sqft'].describe()
    print(summary['75%'] - summary['25%'])

The interquartile ranges are very similar for each -- filtering didn't change this measure of statistical dispersion much

In [None]:
# calculate the std deviation of rent/sqft, first for the thorough listings
# and then for the filtered listings
for listings in [thorough_listings, filtered_listings]:
    print(listings['rent_sqft'].std())

The standard deviation dropped drastically from 125,085.11 to 0.86 after filtering out just 0.79% of the thorough listings that were the greatest outliers.

In [None]:
# look at how descriptive stats changed after filtering by reasonable values:
# suffix 1 = thorough (before filtering), suffix 2 = filtered (after)
cols = ['rent','sqft','rent_sqft']
c = thorough_listings[cols].describe().add_suffix('1')
f = filtered_listings[cols].describe().add_suffix('2')
cf = pd.concat(objs=[c,f], axis=1)

# interleave the columns so each field's before/after pair sits side by side
paired_columns = [col + suffix for col in cols for suffix in ['1', '2']]
cf = cf.reindex(columns=paired_columns)
cf

Above you can see how the descriptive stats changed in each field from before (the 1s) to after (the 2s) we filtered by reasonable values.

In [None]:
# plot histograms of rent, sqft, and rent/sqft values in the filtered data set
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=[16, 5])
bins = 50
color = 'k'
edgecolor = 'None'
alpha = 0.5
ylim = [0, 660000]

# one panel per column, all drawn with the same bar style and the same y range
panels = [('rent', 'Rent'), ('sqft', 'Square footage'), ('rent_sqft', 'Rent/square foot')]
for panel_ax, (column, xlabel) in zip(axes, panels):
    filtered_listings[column].hist(ax=panel_ax, bins=bins, color=color, edgecolor=edgecolor, alpha=alpha)
    panel_ax.set_ylim(ylim)
    panel_ax.set_xlabel(xlabel, fontproperties=label_font)
ax0, ax1, ax2 = axes

# only the left-most panel carries the y-axis label
ax0.set_ylabel('Listings Count', fontproperties=label_font)

fig.suptitle('Histograms of rent, square footage, and rent/square foot values in the filtered data set',
             fontproperties=title_font)

save_fig(plt.gcf(), 'hist_rent_sqft_rentpersqft.png', tight=False)
plt.show()

These histograms show the distribution of these values across the filtered data set. Now plot continuous distributions of rent/sqft using kernel density estimation (KDE).

In [None]:
# median rent/sqft of every region, then the 15 most populous metros ranked by it (highest first)
median_rent_sqft_by_region = filtered_listings.groupby('region')['rent_sqft'].median()
populous_medians = median_rent_sqft_by_region[most_populous_regions.index]
median_rent_sqft_populous_regions = populous_medians.sort_values(ascending=False)
metro_names = median_rent_sqft_populous_regions.index.tolist()

In [None]:
# get a different color for each of the lines to plot
# ('nipy_spectral' is the current name of the colormap formerly called 'spectral',
# an alias that recent matplotlib releases no longer accept)
color_list = get_colors('nipy_spectral', len(metro_names), start=0, stop=0.9, reverse=True)
#color_list = get_colors('plasma', len(metro_names), start=0.15, stop=0.85, reverse=False)
#color_list = get_colors('viridis', len(metro_names), start=0, stop=0.9, reverse=False)

# draw every metro's density curve on one explicitly created set of axes
fig, ax = plt.subplots(figsize=[10, 7])
for name, color in zip(metro_names, color_list):
    values = filtered_listings[filtered_listings['region']==name]['rent_sqft']
    values.plot(kind='kde', ax=ax, color=color, linewidth=2, alpha=0.6)

ax.grid(False)
ax.set_xlim([0,6])
ax.set_ylim([0,2.25])
ax.set_xlabel('Rent per square foot (USD)', fontproperties=label_font)
ax.set_ylabel('Density', fontproperties=label_font)
ax.set_title('Probability density of rent/sqft values for the 15 most populous metros', fontproperties=title_font)
# legend entries follow the order in which the curves were drawn
ax.legend([regions_full_names[x] for x in metro_names], prop=ticks_font)

save_fig(fig, 'kde_most_populous_metros.png')
plt.show()

Probability densities can exceed 1 because the function is defined over a continuous interval. Probabilities are measured over intervals rather than at single points, so the area beneath the curve between any two points represents the probability for that interval. The integral of the function (the total area under the curve) must equal 1, similar to how the sum of all probabilities in a discrete distribution must equal 1.

There are 415 regions in the entire data set. Let's see the KDEs for all 415 of them.

In [None]:
# first, sort all regions ascending by median rent/sqft to print high values last (ie, on top of the other lines)
region_names = list(median_rent_sqft_by_region.sort_values(ascending=True).index)

# get one color per line/region from a sub-range of the colormap
# ('nipy_spectral' is the current name of the colormap formerly called 'spectral',
# an alias that recent matplotlib releases no longer accept)
color_list = get_colors('nipy_spectral', len(region_names), start=0.13, stop=0.93)
#color_list = get_colors('viridis', len(region_names), start=0, stop=0.9, reverse=True)

# draw every region's density curve on one explicitly created set of axes
fig, ax = plt.subplots(figsize=[10, 7])
for name, color in zip(region_names, color_list):
    values = filtered_listings[filtered_listings['region']==name]['rent_sqft']
    values.plot(kind='kde', ax=ax, color=color, linewidth=.75, alpha=0.5)

ax.grid(False)
ax.set_xlim([0, 4])
ax.set_ylim([0, 6.3])
ax.set_xlabel('Rent per square foot (USD)', fontproperties=label_font)
ax.set_ylabel('Density', fontproperties=label_font)
ax.set_title('Probability density of rent/sqft values for each region', fontproperties=title_font)

save_fig(fig, 'kde_all_regions.png')
plt.show()

This is the KDE of rent/sqft for every region in the filtered data set. Each region has its own line, colored by median rent/sqft for that region (high=red, low=blue/violet)

## Analyze the most and least expensive regions in the filtered data set, by price/sqft

In [None]:
# create a categorical variable by splitting rent/sqft into num_bins equal-sized quantile
# bins, labelled '1' (lowest) through str(num_bins) (highest)
num_bins = 5
bin_labels = [str(bin_number) for bin_number in range(1, num_bins + 1)]
quantiles = pd.qcut(x=filtered_listings['rent_sqft'], q=num_bins, labels=bin_labels)
filtered_listings['rent_sqft_cat'] = quantiles

In [None]:
# rank every region by median rent per sq ft, once from the top and once from the bottom
most_expensive = median_rent_sqft_by_region.sort_values(ascending=False)
least_expensive = median_rent_sqft_by_region.sort_values(ascending=True)

In [None]:
# median rent/sqft across the whole filtered data set (drawn as a reference line in later charts)
filtered_median_rent_sqft = filtered_listings['rent_sqft'].median()

In [None]:
# plot the most expensive regions
countdata = most_expensive.head(15)
# these two region keys are abbreviated; map them to the keys used in regions_full_names
countdata = countdata.rename({'nd':'northdakota', 'keys':'floridakeys'})
xlabels = [regions_full_names[x] for x in countdata.index]
ax = countdata.plot(kind='bar',
                    figsize=[9, 6],
                    width=0.8,
                    alpha=0.7,
                    color='#003399',
                    edgecolor='w',
                    grid=False)

ax.yaxis.grid(True)
ax.set_xticks(range(0, len(countdata)))
ax.set_xticklabels(xlabels, rotation=40, rotation_mode='anchor', ha='right', fontproperties=ticks_font)
for label in ax.get_yticklabels():
    label.set_fontproperties(ticks_font)
ax.set_title('Most expensive regions, by median rent per square foot', fontproperties=title_font)
ax.set_xlabel('', fontproperties=label_font)
ax.set_ylabel('Median rent per square foot (USD)', fontproperties=label_font)

# draw a line showing the median rent/sqft in the filtered data set; axhline spans the full
# width of the axes and avoids giving the color twice (format string plus keyword)
ax.axhline(y=filtered_median_rent_sqft, color='k', linestyle='-', alpha=1, linewidth=1)

save_fig(plt.gcf(), 'median_rent_sqft_most_expensive_regions.png')
plt.show()

The horizontal line depicts the median rent/sqft across the entire filtered data set.

In [None]:
# the least expensive regions are very small
# (least_expensive is sorted ascending, so head(10) shows the 10 lowest median rents per sqft)
least_expensive.head(10)

In [None]:
# plot the rent per sq ft for the 15 most populous metros in the U.S.
countdata = median_rent_sqft_populous_regions
xlabels = [regions_full_names[x] for x in countdata.index]
ax = countdata.plot(kind='bar',
                    figsize=[9, 6],
                    width=0.8,
                    alpha=0.7,
                    color='#003399',
                    edgecolor='w',
                    grid=False)

ax.yaxis.grid(True)
ax.set_xticks(range(0, len(countdata)))
ax.set_xticklabels(xlabels, rotation=40, rotation_mode='anchor', ha='right', fontproperties=ticks_font)
for label in ax.get_yticklabels():
    label.set_fontproperties(ticks_font)
ax.set_title('Most populous metro areas, by median rent per square foot', fontproperties=title_font)
ax.set_xlabel('', fontproperties=label_font)
ax.set_ylabel('Median rent per square foot (USD)', fontproperties=label_font)

# draw a line showing the median rent/sqft in the filtered data set; axhline spans the full
# width of the axes and avoids giving the color twice (format string plus keyword)
ax.axhline(y=filtered_median_rent_sqft, color='k', linestyle='-', alpha=1, linewidth=1)

save_fig(plt.gcf(), 'median_rent_sqft_populous_metros.png')
plt.show()

## Get summary data for regions and analyze affordability

Compare to 2014 American Community Survey 1-Year Estimates of Median household income in the past 12 months (in 2014 inflation-adjusted dollars) (B19013) to assess the rent burden, and to Annual Estimates of the Resident Population 2014 population estimates (as of July 1).

In [None]:
# get the regions that appear in the census data file we loaded at the beginning
# rename orangecounty to losangeles to combine into one to match census metro area
# the column is added to filtered_listings itself, because the filter on the next line
# reads it from there
filtered_listings['region_census'] = filtered_listings['region'].replace({'orangecounty': 'losangeles'})
fl_regions = filtered_listings[filtered_listings['region_census'].isin(census.index)]

# number of census regions covered, number of listings in them, and their share of the filtered set
print(len(fl_regions['region_census'].value_counts()))
print(len(fl_regions))
print(len(fl_regions) / float(len(filtered_listings)))

There are 58 regions in the census data file (out of 415 total in the filtered data set, or 14%), comprising a sample size of 2,297,566 rental listings, or 77.9% of the filtered data set. Also, the fl_regions dataframe combines the losangeles and orangecounty regions into one.

In [None]:
# what is the nationwide median rent and sq ft?
# (print is called as a function, matching the other cells and working on Python 2 and 3)
nationwide_median_rent = filtered_listings['rent'].median()
print(nationwide_median_rent)

nationwide_median_sqft = filtered_listings['sqft'].median()
print(nationwide_median_sqft)

In [None]:
# calculate some stats on affordability and region summaries
# note: regions is another name for the census dataframe, so these columns are added to census too
regions = census
by_census_region = fl_regions.groupby('region_census')

# per-region listing count and medians; assignment aligns them to the census index
regions['count_listings'] = by_census_region.size().astype(int)
regions['median_rent'] = by_census_region['rent'].median().astype(int)
regions['median_sqft'] = by_census_region['sqft'].median().astype(int)
regions['median_rent_sqft'] = by_census_region['rent_sqft'].median().round(2)

# median rent as a share of one month of the annual median income
monthly_median_income = regions['2014_median_income'] / 12
regions['rent_proportion'] = (regions['median_rent'] / monthly_median_income).round(2)

# square feet the nationwide median rent pays for at this region's median rent/sqft
sqft_for_median_rent = nationwide_median_rent / regions['median_rent_sqft']
regions['rental_power'] = sqft_for_median_rent.astype(int)

In [None]:
# order the regions by population (largest first), save to csv, and display the head
regions = regions.sort_values(by='2014_pop_est', ascending=False)
regions.to_csv('processed-data/regions_census_summary.csv', index=True)

summary_columns = ['2014_median_income', '2014_pop_est', 'count_listings', 'median_rent',
                   'median_sqft', 'median_rent_sqft', 'rent_proportion', 'rental_power']
regions.head()[summary_columns]

2014 median income is from the 2014 American Community Survey’s 1-year estimates of median household income (in 2014 inflation-adjusted dollars). 2014 population estimates are from the American Community Survey’s 2014 annual estimates of resident population (as of July 1). Median rent, sqft, and rent/sqft are calculated from the filtered data set (note: median rent/sqft is not equivalent to median rent/median sqft). The rent proportion is the ratio of median rent to median monthly household income. Sqft rental power is an estimate of how many square feet can be rented in each region for the nationwide median rent, and is calculated by dividing nationwide median rent by regional median rent per sqft.

In [None]:
# look at some simple correlations between these variables: for each (x, y) pair print
# Pearson's r with its p-value and show a scatter plot
# (print is called as a function, matching the other cells and working on Python 2 and 3)
xs = ['2014_pop_est', '2014_median_income', 'count_listings', '2014_median_income']
ys = ['count_listings', 'count_listings', 'median_rent_sqft', 'median_rent_sqft']
for x, y in zip(xs, ys):
    print('{} vs {}'.format(y, x))
    r, p = pearsonr(regions[x], regions[y])
    print('r={:0.3f}, p={:0.3f}'.format(r, p))
    regions.plot(kind='scatter', x=x, y=y)
    plt.show()

In [None]:
# plot the rent burden for the 15 most populous metros in the U.S.
countdata = regions['rent_proportion'][most_populous_regions.index].sort_values(ascending=False)
xlabels = [regions_full_names[x] for x in countdata.index]
ax = countdata.plot(kind='bar',
                    figsize=[9, 6],
                    width=0.8,
                    alpha=0.7,
                    color='#003399',
                    edgecolor='w',
                    ylim=[0, 0.47],
                    grid=False)

ax.yaxis.grid(True)
ax.set_xticks(range(0, len(countdata)))
ax.set_xticklabels(xlabels, rotation=40, rotation_mode='anchor', ha='right', fontproperties=ticks_font)
for label in ax.get_yticklabels():
    label.set_fontproperties(ticks_font)
ax.set_title('Most populous metro areas, by proportion of income spent on rent', fontproperties=title_font)
ax.set_xlabel('')
ax.set_ylabel('Ratio of median rent to median household income', fontproperties=label_font)

# draw a line showing the rent burden threshold (0.3); axhline spans the full width of the
# axes and avoids giving the color twice (format string plus keyword)
ax.axhline(y=0.3, color='k', linestyle='-', alpha=1, linewidth=1)

save_fig(plt.gcf(), 'rent_proportion.png')
plt.show()

0.3 is the common threshold for rent burden. Here we can see 5 of the 15 most populous regions' median rents exceed 30% of the metro areas' median monthly household income.

Based on each region's median rent/sqft, how many square feet can you rent in each of the 15 most populous metro areas for the nationwide median rent of $1,145?

In [None]:
# plot the sqft you can rent for the nationwide median rent in the 15 most populous metros in the U.S.
countdata = regions['rental_power'][most_populous_regions.index].sort_values(ascending=False)
xlabels = [regions_full_names[x] for x in countdata.index]
ax = countdata.plot(kind='bar',
                    figsize=[9, 6],
                    width=0.8,
                    alpha=0.7,
                    color='#003399',
                    edgecolor='w',
                    grid=False)

ax.yaxis.grid(True)
ax.set_xticks(range(0, len(countdata)))
ax.set_xticklabels(xlabels, rotation=40, rotation_mode='anchor', ha='right', fontproperties=ticks_font)
for label in ax.get_yticklabels():
    label.set_fontproperties(ticks_font)
ax.set_title('Sq ft one can rent in most populous metros, for nationwide median rent', y=1.02, fontproperties=title_font)
ax.set_xlabel('', fontproperties=label_font)
ax.set_ylabel('Square feet', fontproperties=label_font)

# draw a line showing the nationwide median sqft; axhline spans the full width of the
# axes and avoids giving the color twice (format string plus keyword)
ax.axhline(y=nationwide_median_sqft, color='k', linestyle='-', alpha=1, linewidth=1)

save_fig(plt.gcf(), 'rental_power.png')
plt.show()

The horizontal line depicts the nationwide median square footage in the filtered data set.

## Compare rental listings by regions and # bedrooms to HUD fair market rents

Determine the ratio of listings below the fair market rent, as defined by HUD. FMRs generally correspond to 40th percentile rents and FMR areas generally correspond to metropolitan areas, but inconsistently so as HUD uses a more complicated formula to determine percentiles and area boundaries in different circumstances. Dallas is excluded here as the Dallas metro FMR area uses only disaggregate "Small Area FMRs" as defined by ZIP codes.

In [None]:
# load the HUD 2014 median rents and fair market rents data (per region and per # of bedrooms)
# rows are indexed by region; later cells read its columns FMR_1 through FMR_4
hud = pd.read_csv('processed-data/hud_frm_median_rent_metro_bedrooms.csv', index_col='region')

In [None]:
# get all the filtered listings for the regions in the HUD table that have 1-4 bedrooms
in_hud_region = filtered_listings['region'].isin(hud.index)
has_1_to_4_bedrooms = filtered_listings['bedrooms'].isin([1,2,3,4])
reg_rent = filtered_listings[in_hud_region & has_1_to_4_bedrooms][['region', 'bedrooms', 'rent']]

# index the rows by (region, bedrooms) while keeping both as ordinary columns too,
# then sort the index for faster label-based lookups
reg_rent.index = [reg_rent['region'], reg_rent['bedrooms']]
reg_rent = reg_rent.sort_index()

In [None]:
# assign the fair market rent value (determined by region and # of bedrooms) to each row in the dataframe

# build a {(region, bedrooms): FMR} lookup from the HUD table's FMR_1..FMR_4 columns
fmr_lookup = {}
for name in hud.index:
    for br in [1, 2, 3, 4]:
        fmr_lookup[(name, br)] = hud.loc[name, 'FMR_{0}'.format(br)]

# fill the column with one positional assignment instead of a .loc assignment per key:
# a label-based assignment can append a new, otherwise-empty row when a (region, bedrooms)
# pair has no listings, and starting from a None-filled column leaves it with object dtype.
# int() normalizes the bedroom count in case it is stored as a float
reg_rent['fmr'] = [fmr_lookup[(name, int(br))]
                   for name, br in zip(reg_rent['region'], reg_rent['bedrooms'])]

In [None]:
# flag every listing whose rent is at or below its FMR, then compute the flagged share
# of all listings in reg_rent
reg_rent['below_fmr'] = reg_rent['rent'] <= reg_rent['fmr']
reg_rent_vc = reg_rent['below_fmr'].value_counts()
n_at_or_below = reg_rent_vc[True]
n_listings = float(reg_rent_vc.sum())
fmr_ratio = n_at_or_below / n_listings
fmr_ratio

37% of the listings in the filtered data set are at/below fair market rent for that region and number of bedrooms.

In [None]:
# proportion of listings at/below FMR for each bedroom count, pooled across regions
group_keys = ['region', 'bedrooms']
reg_rent_below = reg_rent[reg_rent['below_fmr']]

# count rows per (region, bedrooms), pivot bedrooms into columns, then total each column over regions
below_FMR_br = reg_rent_below.groupby(group_keys)['below_fmr'].count().unstack().sum()
total_br = reg_rent.groupby(group_keys)['below_fmr'].count().unstack().sum()

fmr_ratio_by_br = below_FMR_br / total_br
fmr_ratio_by_br.index = list(map(int, fmr_ratio_by_br.index))

# add the overall ratio as a final entry and name the series
fmr_ratio_by_br.loc['all_1-4'] = fmr_ratio
fmr_ratio_by_br.name = 'total'
fmr_ratio_by_br

Broken out by number of bedrooms, 29% of the 1 bedroom listings are below FMR, 36% of the 2 bedrooms, 51% of the 3 bedrooms, and 45% of the 4 bedrooms.

In [None]:
# per-region proportion of listings at/below FMR (all bedroom counts pooled)
reg_rent_below = reg_rent[reg_rent['below_fmr']]
region_below_counts = reg_rent_below.groupby('region')['below_fmr'].count()
region_total_counts = reg_rent.groupby('region')['below_fmr'].count()

# regions with no rows in reg_rent_below are absent from the numerator and come out as NaN
ratio_below_fmr = region_below_counts / region_total_counts
ratio_below_fmr.name = 'all_1-4'

In [None]:
# proportion of listings at/below FMR for every (region, bedrooms) pair
pair_keys = ['region', 'bedrooms']
pair_below_counts = reg_rent_below.groupby(pair_keys)['below_fmr'].count()
pair_total_counts = reg_rent.groupby(pair_keys)['below_fmr'].count()

# pivot bedrooms into integer-labeled columns, leaving one row per region
ratio_below_fmr_br = (pair_below_counts / pair_total_counts).unstack()
ratio_below_fmr_br.columns = list(map(int, ratio_below_fmr_br.columns))

# attach the per-region all-bedrooms ratio as the last column
ratio_below_fmr_br = pd.concat(objs=[ratio_below_fmr_br, ratio_below_fmr], axis=1)
ratio_below_fmr_br.head(6)

The proportion of listings at/below the FMR varies considerably by region and by number of bedrooms.

In [None]:
# add the totals to the bottom of the dataframe, round it, and save to csv

# leave out any totals row added by a previous run, so re-running this cell does not stack duplicates
region_rows = ratio_below_fmr_br[ratio_below_fmr_br.index != fmr_ratio_by_br.name]

# turn the totals series into a one-row frame labeled by the series name, with the same columns
# and index name as the region rows; pd.concat stands in for DataFrame.append, which recent
# pandas releases no longer provide
total_row = fmr_ratio_by_br.to_frame().T.reindex(columns=region_rows.columns)
total_row.index.name = region_rows.index.name
ratio_below_fmr_br = pd.concat([region_rows, total_row], axis=0)

np.round(ratio_below_fmr_br, 2).to_csv('processed-data/regions_fmr_summary.csv')

In [None]:
# plot the proportion of listings at/below FMR for the most populous metros, minus Dallas
countdata = ratio_below_fmr_br['all_1-4'][most_populous_regions.index].sort_values(ascending=False)
countdata = countdata.drop(labels='dallas', axis=0)
xlabels = [regions_full_names[x] for x in countdata.index]
ax = countdata.plot(kind='bar',
                    figsize=[9, 6],
                    width=0.8,
                    alpha=0.7,
                    color='#003399',
                    edgecolor='w',
                    ylim=[0, 0.8],
                    grid=False)

# horizontal grid lines only, one tick per bar, labeled with the full region names
ax.yaxis.grid(True)
ax.set_xticks(range(0, len(countdata)))
ax.set_xticklabels(xlabels, rotation=40, rotation_mode='anchor', ha='right', fontproperties=ticks_font)
for label in ax.get_yticklabels():
    label.set_fontproperties(ticks_font)
ax.set_title('Most populous metros, by proportion of listings below fair market rent', y=1.01, fontproperties=title_font)
ax.set_xlabel('', fontproperties=label_font)
ax.set_ylabel('Proportion of listings', fontproperties=label_font)

# draw a reference line at a proportion of 0.4 (the 40% mark); axhline spans the full width
# of the axes, so no hard-coded x extent is needed
ax.axhline(y=0.4, color='k', alpha=1, linewidth=1)

save_fig(plt.gcf(), 'fmr_proportions.png')
plt.show()

The 14 most populous metro areas (i.e., the 15 most populous, sans Dallas, for which there is no metro-level FMR data), ranked by the proportion of listings in the filtered data set at or below the HUD fair market rent value. The horizontal line marks the 40th percentile: for reference, HUD bases its FMRs on the 40th percentile rent.

While regions like Phoenix, Atlanta, and Detroit have greater than 60% of their listings below the fair market rent, New York and Boston have single digit percentages of listings below the fair market rent.

## Validate the data set against HUD median rents by region

In [None]:
# median rent for every (region, bedrooms) pair, limited to our regions and to 1-4 bedrooms
in_regions = filtered_listings['region'].isin(regions.index)
br_1_to_4 = filtered_listings['bedrooms'].isin([1, 2, 3, 4])
mask = in_regions & br_1_to_4

# one row per region, one 'clist_<bedrooms>' column per bedroom count
rent_by_region_br = filtered_listings[mask].groupby(['region', 'bedrooms'])['rent']
region_br_rent = rent_by_region_br.median().unstack()
region_br_rent.columns = ['clist_{0}'.format(int(br)) for br in region_br_rent.columns]

region_br_rent.head()

In [None]:
# place the Craigslist median rents and the HUD table side by side, aligned on the region index
region_hud = pd.concat(objs=[region_br_rent, hud], axis=1)

To assess the relationship between the Craigslist median rents and the HUD median rents (by region), first scatter plot them. HUD median rents are calculated for "fair market rent areas" that with a few exceptions generally correspond to OMB definitions of metropolitan areas, as these generally correspond well to housing market areas. However, these are median rent values, not FMRs here.

In [None]:
# scatter plot Craigslist median rent against HUD median rent, one series per # of bedrooms
fig, ax = plt.subplots()
fig.set_size_inches(7, 7)

# both axes share the same upper limit so the reference diagonal runs corner to corner
axis_max = 4100
labels = ['1 br', '2 br', '3 br', '4 br']
plots = []

# HUD median rent on x, Craigslist median rent on y, a distinct color per bedroom count
for br, c in zip([1, 2, 3, 4], ['g', 'b', 'm', 'orange']):
    plots.append(ax.scatter(x=region_hud['HUD_median_{}'.format(br)],
                            y=region_hud['clist_{}'.format(br)],
                            c=c, edgecolor='k', alpha=.4, s=50))

ax.set_xlim([0, axis_max])
ax.set_ylim([0, axis_max])

ax.set_title('Craigslist median rent vs HUD median rent, by metro area', fontproperties=title_font)
ax.set_xlabel('HUD median rent by metro area (USD)', fontproperties=label_font)
ax.set_ylabel('Craigslist median rent by metro area (USD)', fontproperties=label_font)
ax.legend(plots, labels, loc=4, prop=ticks_font)

# draw a line indicating a perfect linear relationship; the color is given once, by keyword,
# rather than both in a format string and by keyword
ax.plot([0, axis_max], [0, axis_max], color='k', linestyle='-', alpha=0.2, linewidth=1.5)

save_fig(plt.gcf(), 'median_rent_hud_craigslist.png')
plt.show()

Points are above the line when Craigslist median rent is greater than HUD median rent, below the line when HUD median rent is greater than Craigslist median rent, and on the line when the two median rents are equal.

In [None]:
# plot same data, but with simple bivariate regression lines
fig, ax = plt.subplots()
fig.set_size_inches(7, 7)
bedrooms = [1, 2, 3, 4]
labels = ['1 br', '2 br', '3 br', '4 br']
color_list = get_colors('YlOrRd', n=len(labels), start=0.25, stop=0.95)
plots = []

# both axes share the same upper limit, which is also where each regression line ends
axis_max = 4100

for br, c in zip(bedrooms, color_list):

    # regress craigslist data on HUD data
    X = region_hud['HUD_median_{}'.format(br)]
    Y = region_hud['clist_{}'.format(br)]
    results = sm.OLS(Y, sm.add_constant(X)).fit()

    # the fitted params are labeled by name, so read them by position with .iloc:
    # first the constant term, then the slope on the HUD median rent
    intercept = results.params.iloc[0]
    slope = results.params.iloc[1]

    # a straight line needs only its two endpoints, spanning the full axis range
    X_line = np.array([0, axis_max])
    Y_est = X_line * slope + intercept

    # draw points and regression line
    plots.append(ax.scatter(X, Y, c=c, edgecolor='#333333', alpha=0.8, s=40, zorder=2))
    ax.plot(X_line, Y_est, c=c, alpha=0.5, linewidth=2, zorder=1)

ax.set_xlim([0, axis_max])
ax.set_ylim([0, axis_max])

ax.set_title('Craigslist median rent vs HUD median rent, by metro area', fontproperties=title_font)
ax.set_xlabel('HUD median rent by metro area (USD)', fontproperties=label_font)
ax.set_ylabel('Craigslist median rent by metro area (USD)', fontproperties=label_font)
ax.legend(plots, labels, loc=4, prop=ticks_font)

save_fig(plt.gcf(), 'median_rent_hud_craigslist_regression.png')
plt.show()

In [None]:
# now get the correlation coefficient and statistical significance for each number of bedrooms
N = len(region_hud)
for br in [1, 2, 3, 4]:
    r, p = pearsonr(region_hud['clist_{}'.format(br)], region_hud['HUD_median_{}'.format(br)])
    r_square = r ** 2

    # t-statistic computed from r, reported alongside N - 2 degrees of freedom
    t = r * (np.sqrt((N - 2)/(1 - r_square)))

    # format the whole line first and print it with a single parenthesized argument, which
    # emits the same text whether print is a statement (Python 2) or a function (Python 3)
    summary = '{} br: r={:0.2f}, r-square={:.2f}, t={:05.2f}, df={}, p={:0.23f}'
    print(summary.format(br, r, r_square, t, N - 2, p))

The correlations between HUD and Craigslist median rents are positive, strong, and statistically significant (p<.0001). The coefficients of determination (r<sup>2</sup>) reveal that 83%, 81%, 77%, and 63% (for 1, 2, 3, and 4 bedroom listings, respectively) of the variation in Craigslist median rents (per region) can be explained by HUD median rents.

Now perform a dependent samples t-test for each number of bedrooms to compare the Craigslist and HUD means (of median rents by region) to see if they are significantly different from each other.

In [None]:
# dependent samples t-test to see if means are significantly different
for br in [1, 2, 3, 4]:
    # paired comparison of the Craigslist and HUD median rents for this bedroom count
    t, p = ttest_rel(region_hud['clist_{}'.format(br)], region_hud['HUD_median_{}'.format(br)])

    # single parenthesized argument so the line prints identically on Python 2 and Python 3
    print('{} br: t={}, p={}'.format(br, round(t, 2), round(p, 3)))

The null hypothesis H<sub>0</sub> is that the means are the same. We can reject the null for 1 br (p<.01) and 3 br (p<.02), indicating that the means of Craigslist and HUD are statistically significantly different. We cannot reject the null for 2 br (p=.07) or 4 br (p=.69), indicating that the means of Craigslist and HUD are not statistically significantly different (i.e., we would expect a t-statistic at least this large in magnitude 7% and 69% of the time, respectively, when there is no real difference between the population means).

Two-sample t-tests require that a set of conditions be met. First, each sample must be a simple random sample - ours aren't exactly that. Second, the sampling distribution should be normal - i.e., symmetric, unskewed, and without outliers. None of these samples are normally distributed - that's to be expected with real world data. But most of these samples are considerably positively skewed by outliers, so the t-test may not really be appropriate here.

Instead, let's try to get at the degree and direction of bias of Craigslist median rents with regards to HUD median rents by examining ratios.

In [None]:
# ratio of the filtered data set's median rent to the HUD median rent, per region,
# computed separately for each # of bedrooms
ratio_cols = []
for br in [1, 2, 3, 4]:
    ratio_col = 'hud_ratio_{}'.format(br)
    region_hud[ratio_col] = region_hud['clist_{}'.format(br)] / region_hud['HUD_median_{}'.format(br)]
    ratio_cols.append(ratio_col)

# arithmetic mean of each ratio column across regions
region_hud_means = region_hud[ratio_cols].mean()
region_hud_means

On average (arithmetic mean) in these regions, median rents in the filtered data set are 7.5% higher for 1 bedroom, 3.2% higher for 2 bedrooms, 7.2% lower for 3 bedrooms, and 1.2% higher for 4 bedrooms than the HUD 2014 median rent.

In [None]:
# add the mean values to the bottom, then round all the ratios to two decimal places
region_hud_means.name = 'means'
cols = ['hud_ratio_1','hud_ratio_2','hud_ratio_3','hud_ratio_4']

# leave out any 'means' row added by a previous run, so re-running this cell does not stack duplicates
region_rows = region_hud[region_hud.index != region_hud_means.name]

# turn the means series into a one-row frame on the full column set (columns without a mean
# stay NaN) so the existing column order and index name carry over; pd.concat stands in for
# DataFrame.append, which recent pandas releases no longer provide
means_row = region_hud_means.to_frame().T.reindex(columns=region_rows.columns)
means_row.index.name = region_rows.index.name
region_hud = pd.concat([region_rows, means_row], axis=0)

region_hud[cols] = np.round(region_hud[cols], 2).values
region_hud[cols].tail()

In [None]:
# write the summary, 'means' row included, to csv; then strip that row back off the dataframe
region_hud.to_csv('processed-data/regions_hud_summary.csv')
region_hud = region_hud.drop('means')

## Next, analyze listings counts and median rent per sq ft, by day of the week

In [None]:
# weekday names, Monday first; used below to relabel per-weekday results
days_of_the_week = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]

In [None]:
# count the filtered listings posted on each day of the week
by_weekday = filtered_listings.groupby('day_of_week')
listings_per_day = by_weekday.size()

# relabel positionally with weekday names
# NOTE(review): assumes day_of_week sorts Monday-first and every weekday is present - confirm
listings_per_day.index = days_of_the_week
listings_per_day

In [None]:
# count how many distinct dates in the filtered listings fall on each day of the week
listings_per_date = filtered_listings['date'].value_counts().to_frame()
listings_per_date['day_of_week'] = listings_per_date.index.weekday

# tally the dates per weekday number, in weekday order, then relabel with weekday names
day_counts = listings_per_date['day_of_week'].value_counts().sort_index()
day_counts.index = days_of_the_week
day_counts

In [None]:
# average filtered listings per weekday: total listings on that weekday divided by
# the number of dates in the data set that fall on it
avg_listings_per_day = listings_per_day.div(day_counts)
avg_listings_per_day.name = 'avg_count_filtered'

In [None]:
# median rent per square foot of the filtered listings, for each day of the week
rent_sqft_by_weekday = filtered_listings.groupby('day_of_week')['rent_sqft']
median_rent_per_day = rent_sqft_by_weekday.median().sort_index()
median_rent_per_day.index = days_of_the_week
median_rent_per_day.name = 'median_rent_filtered'

In [None]:
# per-weekday summary of the filtered data set: average listing count next to median rent/sqft
day_summaries = pd.concat([avg_listings_per_day, median_rent_per_day], axis=1)
day_summaries

The average number of listings posted and the median rent per square foot, per day of the week (filtered data set)

In [None]:
# for comparison, build the same per-weekday summary from the all_listings data set

# total listings posted on each day of the week
all_listings_per_day = all_listings.groupby('day_of_week').size()
all_listings_per_day.index = days_of_the_week

# number of distinct dates that fall on each day of the week
all_listings_per_date = all_listings['date'].value_counts().to_frame()
all_listings_per_date['day_of_week'] = all_listings_per_date.index.weekday
all_day_counts = all_listings_per_date['day_of_week'].value_counts().sort_index()
all_day_counts.index = days_of_the_week

# average listings per occurrence of each weekday
all_avg_listings_per_day = all_listings_per_day.div(all_day_counts)
all_avg_listings_per_day.name = 'avg_count_original'

# median rent per square foot for each weekday
all_median_rent_per_day = all_listings.groupby('day_of_week')['rent_sqft'].median().sort_index()
all_median_rent_per_day.index = days_of_the_week
all_median_rent_per_day.name = 'median_rent_original'

all_day_summaries = pd.concat([all_avg_listings_per_day, all_median_rent_per_day], axis=1)

In [None]:
# put the filtered and original per-weekday summaries side by side
combined_summaries = pd.concat([day_summaries, all_day_summaries], axis=1)

# ratio of each filtered value to the matching original value
filtered_count = combined_summaries['avg_count_filtered']
filtered_rent = combined_summaries['median_rent_filtered']
combined_summaries['count_ratio'] = filtered_count / combined_summaries['avg_count_original']
combined_summaries['rent_ratio'] = filtered_rent / combined_summaries['median_rent_original']

# order the columns so each ratio sits right after the pair of values it compares
column_order = ['avg_count_filtered', 'avg_count_original', 'count_ratio',
                'median_rent_filtered', 'median_rent_original', 'rent_ratio']
combined_summaries = combined_summaries.reindex(columns=column_order)
combined_summaries

The average number of listings posted and the median rent per square foot, by day of the week, for the original thorough data set and the filtered data set. Ratios show the ratio of the filtered set's value to the original set's value.

Tuesdays have a noticeably higher ratio of unique, reasonable rental listings posted compared to Mondays. Explore that further, below.

In [None]:
# for more comparison, build the same per-weekday summary from the unique_listings data set

# total listings posted on each day of the week
unique_listings_per_day = unique_listings.groupby('day_of_week').size()
unique_listings_per_day.index = days_of_the_week

# number of distinct dates that fall on each day of the week
unique_listings_per_date = unique_listings['date'].value_counts().to_frame()
unique_listings_per_date['day_of_week'] = unique_listings_per_date.index.weekday
unique_day_counts = unique_listings_per_date['day_of_week'].value_counts().sort_index()
unique_day_counts.index = days_of_the_week

# average listings per occurrence of each weekday
unique_avg_listings_per_day = unique_listings_per_day.div(unique_day_counts)
unique_avg_listings_per_day.name = 'avg_count_unique'

# median rent per square foot for each weekday
unique_median_rent_per_day = unique_listings.groupby('day_of_week')['rent_sqft'].median().sort_index()
unique_median_rent_per_day.index = days_of_the_week
unique_median_rent_per_day.name = 'median_rent_unique'

unique_day_summaries = pd.concat([unique_avg_listings_per_day, unique_median_rent_per_day], axis=1)

In [None]:
# each weekday's share of the summed per-weekday average counts, for the original, unique,
# and filtered data sets
original_avg = combined_summaries['avg_count_original']
unique_avg = unique_day_summaries['avg_count_unique']
filtered_avg = combined_summaries['avg_count_filtered']

all_ratios = original_avg / original_avg.sum()
unique_ratios = unique_avg / unique_avg.sum()
filtered_ratios = filtered_avg / filtered_avg.sum()

# combine the three series into one table with short column names
short_names = {'avg_count_original':'original',
               'avg_count_unique':'unique',
               'avg_count_filtered':'filtered'}
avg_count_ratios = pd.concat([all_ratios, unique_ratios, filtered_ratios], axis=1)
avg_count_ratios = avg_count_ratios.rename(columns=short_names)
avg_count_ratios

In [None]:
# plot the ratios of rental listings (original, unique, and filtered) by day of week
countdata = avg_count_ratios
ax = countdata.plot(kind='bar',
                    figsize=[8, 6],
                    ylim=[0,.2],
                    width=0.6,
                    alpha=0.5,
                    color=['r','b','g'],
                    edgecolor='gray',
                    grid=False)

ax.yaxis.grid(True)
# one tick per row of countdata; a plain range replaces the identity map(), which produced
# a lazy iterator rather than a sequence on Python 3
ax.set_xticks(range(len(countdata)))
ax.set_xticklabels(countdata.index, rotation=35, rotation_mode='anchor', ha='right', fontproperties=ticks_font)
for label in ax.get_yticklabels():
    label.set_fontproperties(ticks_font)
ax.set_title('Each day of the week\'s ratio of total rental listings posted', fontproperties=title_font)
ax.set_xlabel('', fontproperties=label_font)
ax.set_ylabel('Ratio of listings posted per day', fontproperties=label_font)

save_fig(plt.gcf(), 'day_of_week_ratio_listings_posted.png')
plt.show()

Here it is easy to see that Mondays account for a greater proportion of posted rental listings before we filter the data set for duplicates/re-posts and reasonable values. In contrast, Tuesdays account for a greater proportion of the listings after we filter the data set. It seems that Mondays suffer from more low quality postings, and Tuesdays have a greater ratio of high quality postings.

In [None]:
# plot the avg number of filtered rental listings, by day of week
countdata = avg_listings_per_day
ax = countdata.plot(kind='bar',
                    figsize=[8, 6],
                    width=0.6,
                    alpha=0.6,
                    color='g',
                    edgecolor='gray',
                    grid=False)

ax.yaxis.grid(True)
# one tick per bar; a plain range replaces the identity map(), which produced
# a lazy iterator rather than a sequence on Python 3
ax.set_xticks(range(len(countdata)))
ax.set_xticklabels(countdata.index, rotation=35, rotation_mode='anchor', ha='right', fontproperties=ticks_font)
for label in ax.get_yticklabels():
    label.set_fontproperties(ticks_font)
ax.set_title('Filtered rental listings posted, by day of the week', fontproperties=title_font)
ax.set_xlabel('', fontproperties=label_font)
ax.set_ylabel('Mean listings posted per day', fontproperties=label_font)

save_fig(plt.gcf(), 'day_of_week_listings_count_posted_filtered.png')
plt.show()

Sundays see only half as many (filtered) listings posted as Mondays and Tuesdays do.

## Now look at median rent/sqft by day of the week

In [None]:
# median rent/sqft per weekday for the original, unique, and filtered data sets, side by side
all_rent = combined_summaries['median_rent_original']
unique_rent = unique_day_summaries['median_rent_unique']
filtered_rent = combined_summaries['median_rent_filtered']

# combine the three series into one table with short column names
short_rent_names = {'median_rent_original':'original',
                    'median_rent_unique':'unique',
                    'median_rent_filtered':'filtered'}
median_rents = pd.concat([all_rent, unique_rent, filtered_rent], axis=1)
median_rents = median_rents.rename(columns=short_rent_names)
median_rents

In [None]:
# plot the median rent/sqft (original, unique, and filtered) by day of week
countdata = median_rents
ax = countdata.plot(kind='bar',
                    figsize=[8, 6],
                    ylim=[0, 1.4],
                    width=0.6,
                    alpha=0.5,
                    color=['r','b','g'],
                    edgecolor='gray',
                    grid=False)

ax.yaxis.grid(True)
# one tick per row of countdata; a plain range replaces the identity map(), which produced
# a lazy iterator rather than a sequence on Python 3
ax.set_xticks(range(len(countdata)))
ax.set_xticklabels(countdata.index, rotation=35, rotation_mode='anchor', ha='right', fontproperties=ticks_font)
for label in ax.get_yticklabels():
    label.set_fontproperties(ticks_font)
ax.set_title('Median rent per square foot, by day of the week posted', fontproperties=title_font)
ax.set_xlabel('', fontproperties=label_font)
ax.set_ylabel('Median rent per square foot (USD)', fontproperties=label_font)

# re-create the legend from the plotted series so it can be pinned to the upper left corner
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, loc='upper left')

save_fig(plt.gcf(), 'day_of_week_median_rent_sqft.png')
plt.show()

In [None]:
# plot the median rent per sq ft by day of the week for the filtered data set only
countdata = median_rent_per_day
ax = countdata.plot(kind='bar',
                    figsize=[8, 6],
                    width=0.6,
                    alpha=0.7,
                    color='g',
                    edgecolor='gray',
                    grid=False,
                    ylim=[0, 1.4])

ax.yaxis.grid(True)
# one tick per bar; a plain range replaces the identity map(), which produced
# a lazy iterator rather than a sequence on Python 3
ax.set_xticks(range(len(countdata)))
ax.set_xticklabels(countdata.index, rotation=35, rotation_mode='anchor', ha='right', fontproperties=ticks_font)
for label in ax.get_yticklabels():
    label.set_fontproperties(ticks_font)
ax.set_title('Median rent per square foot, by day of the week', fontproperties=title_font)
ax.set_xlabel('', fontproperties=label_font)
ax.set_ylabel('Median rent per square foot (USD)', fontproperties=label_font)

save_fig(plt.gcf(), 'day_of_week_median_rent_sqft_filtered.png')
plt.show()

Median rents per square foot are about 11.5% higher on Sundays (the most expensive day) than they are on Wednesdays (the least expensive day).

## Retain only the rows with lat-long data, then show descriptive stats for the different stages of the data set

In [None]:
# clean data further by only retaining rows with lat-long data
# dropna returns a new dataframe holding only the rows where both coordinates are present,
# leaving filtered_listings itself untouched
geolocated_filtered_listings = filtered_listings.dropna(subset=['latitude', 'longitude'])

# number of geolocated listings, then their share of all filtered listings; each print takes a
# single parenthesized argument so it behaves the same on Python 2 and Python 3
print(len(geolocated_filtered_listings))
print(len(geolocated_filtered_listings) / float(len(filtered_listings)))

There are 1,456,338 geolocated listings in the filtered data set.

To recap:

- There were 10,958,372 rental listings in the original, full data set.
- Of those total listings, 5,480,435 or 50.0% were unique.
- Of those unique listings, 2,947,761 or 53.8% had rent, sqft, and reasonable values.
- Of those filtered listings, 1,456,338 or 49.4% were geolocated.

Interestingly, each filtering step retained almost exactly half of the remaining data set.

In [None]:
# how many regions are in the data set? (first the full set, then the filtered set)
# single-argument parenthesized prints behave the same on Python 2 and Python 3
print(len(all_listings['region'].unique()))
print(len(filtered_listings['region'].unique()))

In [None]:
# row count, then descriptive statistics, for the all_listings data set
# (parenthesized print works on both Python 2 and Python 3)
print(len(all_listings))
all_listings.describe()

In [None]:
# row count, then descriptive statistics, for the unique_listings data set
# (parenthesized print works on both Python 2 and Python 3)
print(len(unique_listings))
unique_listings.describe()

In [None]:
# row count, then descriptive statistics, for the thorough_listings data set
# (parenthesized print works on both Python 2 and Python 3)
print(len(thorough_listings))
thorough_listings.describe()

In [None]:
# row count, then descriptive statistics, for the filtered_listings data set
# (parenthesized print works on both Python 2 and Python 3)
print(len(filtered_listings))
filtered_listings.describe()

In [None]:
# row count, then descriptive statistics, for the geolocated_filtered_listings data set
# (parenthesized print works on both Python 2 and Python 3)
print(len(geolocated_filtered_listings))
geolocated_filtered_listings.describe()

## Finally, save the geolocated filtered data to CSV for GIS mapping

In [None]:
# select the columns to export from the geolocated listings, then write them to csv
# without the row index
cols = [
    'pid', 'date', 'region', 'neighborhood',
    'rent', 'bedrooms', 'sqft', 'rent_sqft',
    'rent_sqft_cat', 'longitude', 'latitude',
]
data_output = geolocated_filtered_listings.loc[:, cols]
data_output.to_csv('processed-data/geolocated_filtered_listings.csv', index=False)

In [None]:
# write a second, smaller csv holding just the rent/sqft category and the coordinates,
# again without the row index
min_cols = [
    'rent_sqft_cat',
    'longitude',
    'latitude',
]
data_output_min = geolocated_filtered_listings.loc[:, min_cols]
data_output_min.to_csv('processed-data/geolocated_filtered_listings_min.csv', index=False)