In [1]:
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import abs as psabs
from pyspark.sql.functions import from_utc_timestamp

In [2]:
df = sqlContext.read.csv('data/nyc/train-dev.csv', header=True, inferSchema=True)

In [3]:
# Using a smaller dataset of 10M rows
df = df.limit(10000000)

In [3]:
df.printSchema()

root
 |-- key: timestamp (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- passenger_count: integer (nullable = true)



In [4]:
# Keep only relevant columns
usecols = [c for c in df.columns if c != 'key']
df = df.select(*usecols)

In [5]:
# Removing outliers - filter fares that are negative or too large.
df = df.filter(df['fare_amount'] > 0)
df = df.filter(df['fare_amount'] <= 250)

In [6]:
# Removing outliers - filter passenger counts that are negative or too large.
df = df.filter(df['passenger_count'] > 0)
df = df.filter(df['passenger_count'] <= 10)

In [7]:
# Cleaning data - NYC lies between 73 and 75 degrees West, and 40 and 42 degrees north.
# Remove trips that have pickup or dropoff coordinates outside NYC.

LEFT, RIGHT, BOTTOM, TOP =  -75, -73, 40, 42
df = df.filter(df['pickup_longitude'] >= LEFT)
df = df.filter(df['pickup_longitude'] <= RIGHT)
df = df.filter(df['pickup_latitude'] <= TOP)
df = df.filter(df['pickup_latitude'] >= BOTTOM)
df = df.filter(df['dropoff_latitude'] >= BOTTOM)
df = df.filter(df['dropoff_latitude'] <= TOP)
df = df.filter(df['dropoff_longitude'] <= RIGHT)
df = df.filter(df['dropoff_longitude'] >= LEFT)

In [8]:
# Approximate the physical distance between pickup and dropoffs using the L1 norm.
# (Remember that Manhattan is in NYC!)

x1 = df['pickup_longitude']
y1 = df['pickup_latitude']
x2 = df['dropoff_longitude']
y2 = df['dropoff_latitude']

df = df.withColumn('l1', psabs(x1 - x2) + psabs(y1 - y2))

In [9]:
# Prices may depend on time of day! (Note that original timestamps are in UTC)
df = df.withColumn('datetime', df['pickup_datetime'].substr(0, 19))

In [10]:
# Convert the timestamps to New York time.
df = df.withColumn('NYTime', from_utc_timestamp('datetime', 'EST'))

In [11]:
# Extract features from the timestamps.

from pyspark.sql.functions import year, month, dayofweek, hour
df = df.withColumn('year', year(df['NYTime']))
df = df.withColumn('month', month(df['NYTime']))
df = df.withColumn('day', dayofweek(df['NYTime']))
df = df.withColumn('hour', hour(df['NYTime']))

In [12]:
df.select('year', 'month', 'day', 'hour', 'NYTime').show()

+----+-----+---+----+-------------------+
|year|month|day|hour|             NYTime|
+----+-----+---+----+-------------------+
|2009|    6|  2|  12|2009-06-15 12:26:21|
|2010|    1|  3|  11|2010-01-05 11:52:16|
|2011|    8|  4|  19|2011-08-17 19:35:00|
|2012|    4|  6|  23|2012-04-20 23:30:42|
|2010|    3|  3|   2|2010-03-09 02:51:00|
|2011|    1|  5|   4|2011-01-06 04:50:45|
|2012|   11|  3|  15|2012-11-20 15:35:00|
|2012|    1|  4|  12|2012-01-04 12:22:00|
|2012|   12|  2|   8|2012-12-03 08:10:00|
|2009|    9|  3|  20|2009-09-01 20:11:00|
|2012|    4|  1|   2|2012-04-08 02:30:50|
|2009|   11|  5|  20|2009-11-05 20:04:03|
|2013|    7|  3|  14|2013-07-02 14:54:00|
|2011|    4|  3|  12|2011-04-05 12:11:05|
|2014|    2|  4|   2|2014-02-19 02:22:00|
|2009|    7|  4|  11|2009-07-22 11:08:00|
|2010|    7|  4|   9|2010-07-07 09:52:00|
|2014|   12|  7|  15|2014-12-06 15:36:22|
|2010|    9|  3|   8|2010-09-07 08:18:00|
|2013|    2|  3|   7|2013-02-12 07:15:46|
+----+-----+---+----+-------------

In [13]:
# Drop rows containing null values
df = df.dropna()

In [14]:
featureCols = [c for c in df.columns if c not in ['datetime', 'NYTime', 'pickup_datetime']]

In [15]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=featureCols, outputCol='features')
df = assembler.transform(df)

In [16]:
train, test = df.randomSplit([0.66, 0.33])

In [17]:
from pyspark.ml.regression import LinearRegression

In [18]:
lr = LinearRegression(featuresCol='features', labelCol='fare_amount')
model = lr.fit(train)
summary = model.evaluate(test)

In [19]:
summary.rootMeanSquaredError

2.561848262197059e-12

In [20]:
summary.r2

1.0