In [99]:
import pandas as pd
pd.options.display.max_rows = 20

from utils import print_summary

# URL pattern for one Inside Airbnb Greater Manchester download.
# First placeholder: scrape date; second placeholder: file stem
# (the readers in this notebook use 'listings' and 'calendar').
TEMPLATE = 'http://data.insideairbnb.com/united-kingdom/england/greater-manchester/{}/data/{}.csv.gz'

# Scrape dates to download, newest first (one per month, Jul 2018 - Feb 2020).
MONTHS = [
    '2020-02-18', '2020-01-13',
    '2019-12-11', '2019-11-16', '2019-10-16', '2019-09-20',
    '2019-08-17', '2019-07-12', '2019-06-08', '2019-05-14',
    '2019-04-13', '2019-03-08', '2019-02-07', '2019-01-14',
    '2018-12-10', '2018-11-09', '2018-10-18', '2018-09-12',
    '2018-08-14', '2018-07-17',
]


def read_listings(date):
    """Read the listings file published for one scrape date.

    Parameters
    ----------
    date : str
        Scrape date, substituted into the first slot of ``TEMPLATE``.

    Returns
    -------
    pandas.DataFrame
        Only the columns named below, with ``last_scraped`` parsed as a
        datetime column.
    """
    wanted_columns = [
        'id',
        'last_scraped',
        'number_of_reviews',
        'room_type',
        'host_listings_count',
        'availability_365',
        'neighbourhood_cleansed',
        'zipcode',
    ]
    url = TEMPLATE.format(date, 'listings')
    return pd.read_csv(url, usecols=wanted_columns, parse_dates=['last_scraped'])


def read_calendar(date):
    """Read one calendar file and reduce it to a mean price per listing-month.

    Parameters
    ----------
    date : str
        Scrape date, substituted into the first slot of ``TEMPLATE`` and
        stored (as a ``pd.Timestamp``) in the ``last_scraped`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``last_scraped``, ``listing_id``, ``month``, ``price``: one
        row per (scrape, listing, calendar month), where ``price`` is the
        mean of that month's prices. Missing prices are skipped by the mean,
        so a month with no prices at all yields NaN.
    """
    df = pd.read_csv(TEMPLATE.format(date, 'calendar'),
                     parse_dates=['date'],
                     usecols='date listing_id price'.split())
    df['last_scraped'] = pd.Timestamp(date)

    # Strip the currency symbol and thousands separator with vectorized
    # string ops instead of a per-row Python lambda. astype(str) comes first
    # so a column that was parsed as numeric / all-NaN still has a .str
    # accessor; missing values survive the round trip as NaN.
    df['price'] = (df['price'].astype(str)
                   .str.replace('$', '', regex=False)
                   .str.replace(',', '', regex=False)
                   .astype(float))

    # Truncate each date to the first day of its month (midnight) in one
    # vectorized step; unlike building pd.Timestamp row by row this also
    # leaves NaT as NaT rather than raising.
    df['month'] = df['date'].dt.to_period('M').dt.to_timestamp()

    return df.groupby('last_scraped listing_id month'.split())['price'].mean().reset_index()

# Dispatch table: file stem -> function that loads that file for one scrape date.
READER = dict(
    listings=read_listings,
    calendar=read_calendar,
)

def get_df(filename):
    """Load ``filename`` for every date in ``MONTHS`` and stack the results.

    Parameters
    ----------
    filename : str
        Key into ``READER`` selecting the per-date loader; an unknown key
        raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-date frames, in ``MONTHS`` order,
        with a fresh 0..n-1 index.
    """
    load_one = READER[filename]
    frames = [load_one(scrape_date) for scrape_date in MONTHS]
    stacked = pd.concat(frames, axis=0)
    return stacked.reset_index(drop=True)

In [82]:
# Download the listings file for every scrape date in MONTHS and stack them
# into one frame (network-bound: one HTTP fetch per date).
listings = get_df('listings')
# print_summary comes from the local utils module; the output below shows one
# row per column with null count, unique count, dtype and mode/median.
print_summary(listings)

82931 lines - 8 columns
                column |    nulls |   unique | type           | mode/med
--------------------------------------------------------------------------------
                    id |      0   |   9214   |          int64 | 26451916 (20)
          last_scraped |      0   |     23   | datetime64[ns] | 2020-02-18 00:00
   host_listings_count |     73   |    173   |        float64 | 2.0
neighbourhood_cleansed |      0   |     41   |         object | Salford District
               zipcode |   1739   |   4102   |         object | M1 (834)
             room_type |      0   |      4   |         object | Private room (41
      availability_365 |      0   |    366   |          int64 | 0 (15332)
     number_of_reviews |      0   |    447   |          int64 | 0 (15325)


In [102]:
# Download the calendar file for every scrape date in MONTHS; each one is
# reduced by read_calendar to a mean price per (scrape, listing, month)
# before the per-date frames are stacked.
calendar = get_df('calendar')
# print_summary comes from the local utils module; the output below shows one
# row per column with null count, unique count, dtype and mode/median.
print_summary(calendar)

1078089 lines - 4 columns
      column |    nulls |   unique | type           | mode/med
--------------------------------------------------------------------------------
last_scraped |      0   |     20   | datetime64[ns] | 2020-02-18 00:00:00 (65546
  listing_id |      0   |   9214   |          int64 | 7231617 (260)
       month |      0   |     32   | datetime64[ns] | 2020-02-01 00:00:00 (58112
       price | 133326   |  40709   |        float64 | 55.0
