<a href="https://colab.research.google.com/github/xiaobo-fu/handson-ml/blob/master/02_House_Price_London.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# Seed NumPy's global random number generator so that this notebook's
# output is stable across runs.
np.random.seed(42)

# Render matplotlib figures inline in the notebook and set the font sizes
# used for axis labels (14) and for x/y tick labels (12).
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures: IMAGES_PATH resolves to
# ./images/end_to_end_project, relative to the current working directory.
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    The file is written to ``IMAGES_PATH/<fig_id>.<fig_extension>``.

    Parameters
    ----------
    fig_id : str
        Base name of the output file (without extension).
    tight_layout : bool
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        File extension, also passed to ``savefig`` as the output format.
    resolution : int
        Passed to ``savefig`` as ``dpi``.
    """
    # savefig does not create missing directories, so make sure the target
    # directory exists first (it does not on a fresh runtime).
    if not os.path.isdir(IMAGES_PATH):
        os.makedirs(IMAGES_PATH)
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
#import warnings
#warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [0]:
# Mount Google Drive at /content/gdrive. This only works inside Google Colab
# and prompts for an authorization code.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import os
import tarfile
from six.moves import urllib

# NOTE: despite its name, DOWNLOAD_ROOT holds the full URL of the CSV file,
# not a base URL; HOUSING_URL below is simply an alias for it.
DOWNLOAD_ROOT = "https://data.london.gov.uk/download/average-house-prices/59be940c-ffb8-426d-a833-6146ea77de5c/land-registry-house-prices-ward.csv"
# Local directory the CSV is stored in, relative to the working directory.
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the house price CSV into ``housing_path``.

    The directory is created when it is missing. The file is always
    downloaded again and saved as ``land-registry-house-prices-ward.csv``,
    replacing any earlier copy. Nothing is returned.
    """
    directory_missing = not os.path.isdir(housing_path)
    if directory_missing:
        os.makedirs(housing_path)
    csv_path = os.path.join(housing_path, "land-registry-house-prices-ward.csv")
    urllib.request.urlretrieve(housing_url, csv_path)

In [0]:
# Download the CSV into datasets/housing (this hits the network on every run).
fetch_housing_data()

In [0]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``land-registry-house-prices-ward.csv`` from ``housing_path``.

    Returns the DataFrame produced by ``pd.read_csv`` with its default
    settings.
    """
    return pd.read_csv(
        os.path.join(housing_path, "land-registry-house-prices-ward.csv")
    )

In [0]:
# Load the downloaded CSV into a DataFrame and preview its first rows.
housing = load_housing_data()
housing.head()

Unnamed: 0,Code,Ward_name,Borough,Year,Measure,Value
0,E09000001,City of London,City of London,Year ending Dec 1995,Median,-
1,E05000026,Abbey,Barking and Dagenham,Year ending Dec 1995,Median,53000
2,E05000027,Alibon,Barking and Dagenham,Year ending Dec 1995,Median,45000
3,E05000028,Becontree,Barking and Dagenham,Year ending Dec 1995,Median,49000
4,E05000029,Chadwell Heath,Barking and Dagenham,Year ending Dec 1995,Median,59000


In [0]:
# Column overview: every column, including Value, is loaded with dtype object.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168210 entries, 0 to 168209
Data columns (total 6 columns):
Code         168210 non-null object
Ward_name    168210 non-null object
Borough      168210 non-null object
Year         168210 non-null object
Measure      168210 non-null object
Value        168210 non-null object
dtypes: object(6)
memory usage: 7.7+ MB


In [0]:
# The Year labels are inconsistent: both "Year ending <Mon> <YYYY>" and bare
# "YYYY" forms occur.
housing["Year"].value_counts()

Year ending Sep 2013    1890
Year ending Dec 1997    1890
Year ending Jun 1998    1890
Year ending Jun 2001    1890
Year ending Dec 2002    1890
Year ending Jun 2016    1890
Year ending Dec 1999    1890
Year ending Sep 2002    1890
Year ending Dec 2010    1890
Year ending Sep 2003    1890
Year ending Jun 2009    1890
Year ending Dec 2000    1890
Year ending Jun 2007    1890
Year ending Dec 2015    1890
Year ending Sep 1999    1890
Year ending Sep 2008    1890
Year ending Sep 2001    1890
Year ending Jun 2000    1890
Year ending Sep 2017    1890
Year ending Jun 2003    1890
Year ending Dec 2008    1890
Year ending Jun 1996    1890
Year ending Sep 1996    1890
Year ending Sep 2016    1890
Year ending Jun 2010    1890
Year ending Sep 1998    1890
Year ending Jun 2012    1890
Year ending Jun 2004    1890
Year ending Jun 2017    1890
Year ending Dec 2014    1890
                        ... 
Year ending Mar 2017    1260
1997                    1260
2011                    1260
2004          

In [0]:
# With only object columns, describe() reports count/unique/top/freq rather
# than numeric statistics.
housing.describe()

Unnamed: 0,Code,Ward_name,Borough,Year,Measure,Value
count,168210,168210,168210,168210,168210,168210
unique,630,607,33,89,3,59768
top,E05000173,Village,Croydon,2007,Mean,250000
freq,267,801,6408,1890,56070,1723


In [0]:
# Convert "Year" labels such as "Year ending Dec 1995" to numeric years (1995.0)

def delyear(c):
    """Convert the year labels held in ``c`` to float years, in place.

    A text label is reduced to its last four characters (so
    "Year ending Dec 1995" becomes "1995"; labels of four characters or
    fewer are used whole) and converted with ``float``. Elements that are
    already numbers are only passed through ``float``, so running the
    function again on an already converted Series leaves it unchanged.

    Parameters
    ----------
    c : pandas.Series
        Series of year labels. Its values are overwritten; the dtype of the
        Series is not changed by this function.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the last four characters of a text label are not a valid number.
        In that case ``c`` is left untouched.
    """
    import numbers  # local import: only this function needs it

    def _to_year(label):
        # Already numeric (e.g. on a second run): nothing to slice off.
        if isinstance(label, numbers.Number):
            return float(label)
        return float(label[-4:])

    # Convert every element first, then write the results back with a single
    # positional slice assignment. This does not depend on the index labels
    # being 0..n-1 and never leaves the Series half-converted on error.
    c[:] = [_to_year(label) for label in c]
       

In [0]:
# Convert the Year labels to float years. delyear works on the Series it is
# given and returns nothing, so this cell shows no output.
# NOTE(review): this relies on housing.Year sharing its data with housing —
# confirm for the pandas version in use.
delyear(housing.Year)


In [0]:
# Value is stored as text: entries carry thousands separators ("250,000") and
# "-" appears as a placeholder, so the column needs cleaning before numeric use.
housing["Value"].value_counts()

250,000    1723
220,000     498
225,000     478
230,000     465
240,000     461
210,000     441
200,000     437
245,000     432
185,000     378
235,000     376
215,000     367
205,000     353
175,000     352
195,000     342
300,000     339
180,000     336
275,000     325
190,000     317
285,000     313
350,000     306
280,000     294
270,000     283
290,000     281
170,000     272
320,000     271
249,950     267
-           267
179         265
129         264
190         262
           ... 
105,300       1
115,233       1
239,154       1
328,829       1
81,125        1
71,383        1
209,566       1
257,762       1
67,660        1
316,788       1
390,053       1
187,999       1
623,203       1
139,623       1
239,303       1
616,846       1
282,824       1
435,493       1
260,563       1
268,319       1
631,754       1
76,751        1
817,647       1
315,328       1
98,142        1
499,670       1
96,777        1
221,338       1
445,157       1
451,101       1
Name: Value, Length: 597