In [16]:
import os
# Optional overrides for machines where Java/Spark are not already discoverable.
# NOTE(review): the paths below look like a Colab-style install — confirm
# before uncommenting.
#os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
#os.environ["SPARK_HOME"] = "/content/spark-2.3.1-bin-hadoop2.7"

import findspark
# Keep this call ahead of the pyspark imports below: findspark.init() locates
# the Spark installation and makes its pyspark package importable.
findspark.init()
from pyspark import SparkContext
# Reuse the already-running SparkContext if there is one; otherwise start one.
sc = SparkContext.getOrCreate()

import pyspark
from pyspark.sql import SparkSession
# Same get-or-create behaviour for the SparkSession; `spark` is the entry point
# used by the cells that read the CSV files.
spark = SparkSession.builder.getOrCreate() 
# Bare last expression so the notebook displays the session as the cell output.
spark

# Data
* Full list of real estate properties in three counties (Los Angeles, Orange, and Ventura, California). 
* The training data contains all transactions before October 15, 2016, plus a few transactions after that date.
* Testing data are transactions between October 15 and December 31, 2016.

### Objective
We are supposed to predict 6 time points for all properties: October 2016 (201610), November 2016 (201611), December 2016 (201612), October 2017 (201710), November 2017 (201711), and December 2017 (201712).

Our target variable is the log-error between their Zestimate and the actual sale price. 

`logerror = log(Zestimate) - log(SalePrice)`

This is given to us in the `logerror` column of the training files (`train_2016_v2.csv` and `train_2017.csv`).

### Where can we find the data?


Download the data from https://www.kaggle.com/c/zillow-prize-1/data and extract it into a folder called `zillow-prize-1`. This folder should be in the base level directory (the same directory as the .gitignore file)

All features can be found in `properties_2016.csv` and `properties_2017.csv` (this notebook loads `properties_2017.csv`).

Targets are found in `train_2016_v2.csv` and `train_2017.csv`

In [47]:
# Input files, relative to this notebook: the property features, the two
# yearly training-target files, and the sample submission used as the test frame.
features_location = "../zillow-prize-1/properties_2017.csv"
train1_location = "../zillow-prize-1/train_2016_v2.csv"
train2_location = "../zillow-prize-1/train_2017.csv"
test_location = "../zillow-prize-1/sample_submission.csv"


# One reader with the header option enabled, reused for every file. No schema
# inference is requested, so Spark keeps its default of reading each column as
# a string.
csv_reader = spark.read.option("header", True)

features = csv_reader.csv(features_location)
training1 = csv_reader.csv(train1_location)
training2 = csv_reader.csv(train2_location)
testing = csv_reader.csv(test_location)

### What do our features look like?

In [48]:
features.show(1) # Preview one row; show() prints every column, so the table is very wide

+--------+---------------------+------------------------+------------+-----------+----------+-------------------+---------------------+-----------------+----------+------------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-----+------------+-----------+------------+---------------+--------------+---------------------+--------+----------+-----------------+-------+-----------+------------+-----------+-----------+-------------------------+---------------------+------------------+----------------------+------------+--------------+--------------------+-----------+-------+-----------+-------------------+----------------------+-------+------------------+------------------+---------+---------------+-------------+--------------------------+-----------------+--------------+---------------------+---------+------------------+------------------+-------------------+
|parcelid|airconditioningtypeid|arc

In [49]:
# Dimensions of the features DataFrame as (row count, column count).
# count() runs a Spark job over the whole file; the column count is metadata only.
features_shape = (features.count(), len(features.columns))
print(features_shape)

(2985217, 58)


The size of our data is about 2.99 million rows (2,985,217) and 58 columns.

In [50]:
# Let's see all of the column names.
features.columns

['parcelid',
 'airconditioningtypeid',
 'architecturalstyletypeid',
 'basementsqft',
 'bathroomcnt',
 'bedroomcnt',
 'buildingclasstypeid',
 'buildingqualitytypeid',
 'calculatedbathnbr',
 'decktypeid',
 'finishedfloor1squarefeet',
 'calculatedfinishedsquarefeet',
 'finishedsquarefeet12',
 'finishedsquarefeet13',
 'finishedsquarefeet15',
 'finishedsquarefeet50',
 'finishedsquarefeet6',
 'fips',
 'fireplacecnt',
 'fullbathcnt',
 'garagecarcnt',
 'garagetotalsqft',
 'hashottuborspa',
 'heatingorsystemtypeid',
 'latitude',
 'longitude',
 'lotsizesquarefeet',
 'poolcnt',
 'poolsizesum',
 'pooltypeid10',
 'pooltypeid2',
 'pooltypeid7',
 'propertycountylandusecode',
 'propertylandusetypeid',
 'propertyzoningdesc',
 'rawcensustractandblock',
 'regionidcity',
 'regionidcounty',
 'regionidneighborhood',
 'regionidzip',
 'roomcnt',
 'storytypeid',
 'threequarterbathnbr',
 'typeconstructiontypeid',
 'unitcnt',
 'yardbuildingsqft17',
 'yardbuildingsqft26',
 'yearbuilt',
 'numberofstories',
 'firep

### What does our training set look like?

In [59]:
# Concatenate the 2016 and 2017 training sets into one DataFrame.
# union() matches columns by position, so both inputs must share the same column order.
training = training1.union(training2)

In [60]:
training.show(3) # Preview the first three rows of the combined training set

+--------+--------+---------------+
|parcelid|logerror|transactiondate|
+--------+--------+---------------+
|11016594|  0.0276|     2016-01-01|
|14366692| -0.1684|     2016-01-01|
|12098116|  -0.004|     2016-01-01|
+--------+--------+---------------+
only showing top 3 rows



In [63]:
# Dimensions (row count, column count) of the combined training set,
# followed by the same figures for each yearly input.
print((training.count(), len(training.columns)))
for label, yearly in (("2016 Count: ", training1), ("2017 Count: ", training2)):
    print(label, (yearly.count(), len(yearly.columns)))

(167888, 3)
2016 Count:  (90275, 3)
2017 Count:  (77613, 3)


In [62]:
# Let's see all of the column names.
training.columns

['parcelid', 'logerror', 'transactiondate']

## Things to note about training data
We can join the training data to the features on `parcelid`.

We are predicting the value of 'logerror'.

We may want to treat this as a time series (using `transactiondate`).

# What does our testing set look like?

In [55]:
testing.show(3) # Preview the first three rows of the sample-submission frame

+--------+------+------+------+------+------+------+
|ParcelId|201610|201611|201612|201710|201711|201712|
+--------+------+------+------+------+------+------+
|10754147|     0|     0|     0|     0|     0|     0|
|10759547|     0|     0|     0|     0|     0|     0|
|10843547|     0|     0|     0|     0|     0|     0|
+--------+------+------+------+------+------+------+
only showing top 3 rows



In [56]:
# Dimensions of the testing DataFrame as (row count, column count).
testing_shape = (testing.count(), len(testing.columns))
print(testing_shape)

(2985217, 7)


We would want to exclude the testing data. I think?

In [64]:
# Let's see all of the column names.
testing.columns

['ParcelId', '201610', '201611', '201612', '201710', '201711', '201712']

### Things to note about the testing set
1. We will want to exclude all things in the training set. 
2. The ideal `zscore` is 0. 
    * I'm not 100% sure what this means for us...
        * Do we just try to predict the `zscore` in the training set?
  
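**TODO:** I'm not sure if this is actually our testing set. The file loaded here is `sample_submission.csv`, and it has the same number of rows (2,985,217) as the features table.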