In [5]:
import tensorflow as tf
import numpy as np 
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from IPython.display import clear_output, Image, display, HTML

###### Do not modify here ###### 
def strip_consts(graph_def, max_const_size=32):
    """Return a copy of ``graph_def`` with large constant payloads replaced.

    Every node of ``graph_def`` is merged into a node of a new ``tf.GraphDef``.
    For nodes whose op is ``'Const'``, a ``tensor_content`` longer than
    ``max_const_size`` is replaced by a ``<stripped N bytes>`` placeholder,
    where N is the original length.

    Args:
        graph_def: object exposing an iterable ``node`` attribute of graph nodes.
        max_const_size: largest ``tensor_content`` length that is kept as is.

    Returns:
        The newly built ``tf.GraphDef``; ``graph_def`` itself is not modified.
    """
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add()
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                # tensor_content is a protobuf bytes field: assigning a text
                # string raises TypeError on Python 3, so encode explicitly.
                tensor.tensor_content = tf.compat.as_bytes("<stripped %d bytes>" % size)
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Display a TensorFlow graph inside the notebook.

    ``graph_def`` may be a GraphDef or any object with an ``as_graph_def()``
    method. Its text form is embedded in a ``tf-graph-basic`` element, which is
    wrapped in an iframe and passed to ``display``.

    ``max_const_size`` is accepted but currently unused: the graph is rendered
    without passing it through ``strip_consts``.
    """
    if hasattr(graph_def, 'as_graph_def'):
        graph_proto = graph_def.as_graph_def()
    else:
        graph_proto = graph_def

    # Python-literal form of the graph text, dropped into the script below.
    pbtxt_literal = repr(str(graph_proto))
    # Random suffix for the element id used by the script and the widget.
    element_id = 'graph' + str(np.random.rand())

    widget_template = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """
    widget_html = widget_template.format(data=pbtxt_literal, id=element_id)

    iframe_template = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """
    # The widget markup sits inside a double-quoted srcdoc attribute, so its
    # own double quotes are replaced by the &quot; entity.
    escaped_widget = widget_html.replace('"', '&quot;')
    display(HTML(iframe_template.format(escaped_widget)))
    

###### Do not modify  here ######

###### Implement Data Preprocess here ######

# We use scikit-learn's LocalOutlierFactor to find the outliers in the
# training data and remove them, which can reduce the error rate.
def remove_outlier(X, Y, n_neighbors=85, remove_fraction=0.1):
    """Drop the samples that Local Outlier Factor ranks as most outlying.

    A ``LocalOutlierFactor`` model is fitted on ``X``. The samples with the
    lowest ``negative_outlier_factor_`` scores are removed from both ``X`` and
    ``Y``; the remaining rows keep their original order.

    Args:
        X: 2-D array-like of samples, one row per sample.
        Y: array-like of targets with one entry per row of ``X``.
        n_neighbors: neighbourhood size passed to ``LocalOutlierFactor``.
        remove_fraction: share of the samples to discard; the number removed
            is ``int(len(X) * remove_fraction)``.

    Returns:
        Tuple ``(X_kept, Y_kept)`` of NumPy arrays, with ``Y_kept`` reshaped
        to a single column.
    """
    from sklearn.neighbors import LocalOutlierFactor

    X = np.asarray(X)
    Y = np.asarray(Y)

    # NOTE(review): contamination should only set the inlier/outlier label
    # threshold, not the raw scores ranked below, so the share removed is
    # governed by remove_fraction alone - confirm against the scikit-learn docs.
    clf = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=0.25)
    clf.fit(X)
    scores = clf.negative_outlier_factor_

    # Lowest scores first: these are the samples to drop.
    n_remove = int(len(scores) * remove_fraction)
    drop = np.argsort(scores)[:n_remove]

    # A boolean mask filters both arrays in one pass, instead of testing
    # membership in the index array for every row.
    keep = np.ones(len(X), dtype=bool)
    keep[drop] = False
    return X[keep], Y[keep].reshape(-1, 1)

housing = fetch_california_housing()
m, n = housing.data.shape

# Add bias: append a column of ones so the intercept is learned as a weight.
housing.data = np.c_[housing.data, np.ones([m, 1])]

# 90% for training and 10% for testing. The split index is derived from the
# row count with integer arithmetic instead of being hard-coded.
n_train = m * 9 // 10
housing_data_training = housing.data[:n_train]
housing_data_testing = housing.data[n_train:]

housing_label_training = housing.target[:n_train].reshape(-1, 1)
housing_label_testing = housing.target[n_train:].reshape(-1, 1)

# Use mean +- 1.5 * standard deviation of the training labels as the bounds
# outside which a training sample counts as an outlier.
label_mean = np.mean(housing_label_training[:, 0])
label_std = np.std(housing_label_training[:, 0])
top = label_mean + 1.5 * label_std
buttom = label_mean - 1.5 * label_std

is_outlier = (housing_label_training[:, 0] < buttom) | (housing_label_training[:, 0] > top)
index = np.flatnonzero(is_outlier).tolist()
print(len(index))

# Remove the outliers from the training data (can reduce the error rate).
# One boolean-mask selection replaces a per-row np.delete loop, which copied
# the whole array for every removed row.
housing_data_training = housing_data_training[~is_outlier]
housing_label_training = housing_label_training[~is_outlier]
print(housing_data_training.shape)


1902
(16674, 9)


In [6]:
# Second filtering pass: drop the training rows that remove_outlier ranks as
# most outlying in feature space.
# NOTE: the results are bound back to the input names, so running this cell
# again removes a further slice of the already-filtered data.
housing_data_training, housing_label_training = remove_outlier(
    housing_data_training,
    housing_label_training,
)


In [7]:
# Closed-form least-squares weights (normal equation): w = (X^T X)^-1 X^T y.
# The ops are created in the same order as in a single nested expression
# (transpose, matmul, inverse, transpose, matmul, matmul).
gram_matrix = tf.matmul(tf.transpose(housing_data_training), housing_data_training)
gram_inverse = tf.matrix_inverse(gram_matrix)
pseudo_inverse = tf.matmul(gram_inverse, tf.transpose(housing_data_training))
weight = tf.matmul(pseudo_inverse, housing_label_training)
###### Implement Data Preprocess here ######


In [8]:
###### Start TF session ######
with tf.Session() as sess:
    # No tf.Variable is created in the cells shown here; the initializer is
    # kept so the graph contents stay as they were.
    sess.run(tf.global_variables_initializer())

    # Predictions for the held-out rows and their mean relative absolute error.
    predict = tf.matmul(housing_data_testing, weight)
    error_rate = tf.reduce_mean(tf.div(tf.abs(tf.subtract(housing_label_testing, predict)), housing_label_testing))

    # Fetch both tensors in one run: separate .eval() calls would each
    # re-execute the whole graph, including the matrix inverse behind `weight`.
    predict_value, error_rate_value = sess.run([predict, error_rate])
    print(predict_value)
    print(housing_label_testing)
    print(error_rate_value)
    show_graph(tf.get_default_graph().as_graph_def())
###### Start TF session ######
###### Meaning of graph ######
#The graph shows the execution flow of TensorFlow. The bottom of the graph shows the calculation of the
#weight formula: tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(tf.transpose(housing_data_training), housing_data_training)), tf.transpose(housing_data_training)), housing_label_training)
#So you can see the transpose, MatMul and MatrixInverse operations that appear in the formula.
#MatMul[1-3] means three MatMul operations (1 and 2 are for calculating the weight, 3 is for calculating the predicted value).
#After MatMul[1-3], the graph shows the calculation of error_rate, which is:
#tf.reduce_mean(tf.div(tf.abs(tf.subtract(housing_label_testing, predict)), housing_label_testing))
#So you can see Sub, Abs, Div and Mean in the graph.
###### Meaning of graph ######

[[ 1.85783562]
 [ 1.68001537]
 [ 1.9676785 ]
 ..., 
 [ 0.29953641]
 [ 0.4296291 ]
 [ 0.62960537]]
[[ 1.214]
 [ 1.904]
 [ 1.843]
 ..., 
 [ 0.923]
 [ 0.847]
 [ 0.894]]
0.299478426591
