# Using Spark map & reduce methods on Uber-Jan-Feb-FOIL.csv for basic analysis, compared with the equivalent pandas DataFrame operations

# -MapReduce
# -Pandas dataframe
# -PySpark SQL

- ref https://www.supergloo.com/fieldnotes/apache-spark-quick-start-with-python-new-york-city-uber-trips/

In [3]:
# Record the Python version and the run date alongside the results.
!python --version
! date

Python 3.4.5 :: Continuum Analytics, Inc.
Thu Nov  3 18:57:45 CST 2016


In [1]:
# Display the SparkContext. It is not created anywhere in this notebook;
# presumably the PySpark kernel/shell injects it as `sc` (TODO confirm).
sc

<pyspark.context.SparkContext at 0x1047e5940>

In [2]:
from pyspark.sql import SQLContext
import datetime as dt   
import time
import pandas as pd, numpy as np
import pprint
import matplotlib.pyplot as plt
import seaborn as sns
%pylab inline
import urllib

Populating the interactive namespace from numpy and matplotlib




#  Load csv as RDD

In [9]:
# Source data: https://github.com/yennanliu/uber-tlc-foil-response
# Read the CSV as an RDD of raw text lines (one element per line of the file).
uber_csv_path = "uber-tlc-foil-response-master/Uber-Jan-Feb-FOIL.csv"
ut = sc.textFile(uber_csv_path)

In [12]:
# Check what kind of object textFile() returned.
type(ut)

pyspark.rdd.RDD

In [11]:
# Number of lines in the file. The RDD holds raw text lines, so this count
# includes the CSV header line as well as the data rows.
ut.count()

355

In [14]:
# Peek at the first four elements; each one is a single unsplit CSV line.
ut.take(4)

['dispatching_base_number,date,active_vehicles,trips',
 'B02512,1/1/2015,190,1132',
 'B02765,1/1/2015,225,1765',
 'B02764,1/1/2015,3427,29421']

# Split rows 

### Each line of the file (e.g. 'dispatching_base_number,date,active_vehicles,trips' or 'B02512,1/1/2015,190,1132') is a single string element of the RDD; splitting each line on "," turns it into a list of fields

In [15]:
def split_fields(line):
    """Split one raw CSV line on commas and return the list of field strings."""
    return line.split(",")

# Apply the split to every line; the header line is split too and stays in `rows`.
rows = ut.map(split_fields)

In [26]:
# Peek at the first five split rows; the header row is still present here.
rows.take(5)

[['dispatching_base_number', 'date', 'active_vehicles', 'trips'],
 ['B02512', '1/1/2015', '190', '1132'],
 ['B02765', '1/1/2015', '225', '1765'],
 ['B02764', '1/1/2015', '3427', '29421'],
 ['B02682', '1/1/2015', '945', '7679']]

In [61]:
# Keep only the data rows (i.e. drop the header line), then split into fields.
# The header is identified by comparing each line with the file's first line
# instead of testing for the substring "base", which would also silently drop
# any data line that happened to contain that text.
header = ut.first()
ut.filter(lambda line: line != header).map(lambda line: line.split(",")).take(5)

[['B02512', '1/1/2015', '190', '1132'],
 ['B02765', '1/1/2015', '225', '1765'],
 ['B02764', '1/1/2015', '3427', '29421'],
 ['B02682', '1/1/2015', '945', '7679'],
 ['B02617', '1/1/2015', '1228', '9537']]

In [17]:
# Check the type of the RDD produced by the map() transformation.
type(rows)

pyspark.rdd.PipelinedRDD

# Get the distinct values in column 0

In [28]:
# Pull out column 0 of every row, then collect its distinct values.
# `rows` still contains the header row, so the header's first field
# ('dispatching_base_number') shows up in the result as well.
first_column = rows.map(lambda fields: fields[0])
first_column.distinct().collect()

['B02598',
 'B02682',
 'dispatching_base_number',
 'B02765',
 'B02617',
 'B02764',
 'B02512']

In [22]:
# How many rows belong to base "B02617"?
# Compare the base column (index 0) explicitly rather than testing membership
# across every field of the row, so a match in another column can never count.
rows.filter(lambda row: row[0] == "B02617").count()

59

In [47]:
# Count with conditions:
# on how many distinct dates did base "B02617" record more than 15000 trips?

# Select the base by its own column (index 0) instead of a membership test
# over all fields of the row.
base02617 = rows.filter(lambda row: row[0] == "B02617")
# Column 3 is the trip count (still a string, hence int()); column 1 is the date.
busy_dates = base02617.filter(lambda row: int(row[3]) > 15000).map(lambda row: row[1])
busy_dates.distinct().count()

6

# ReduceByKey

In [51]:
# Sum of the daily active_vehicles counts (column 2) per dispatching base.

# Drop the header by comparing with the file's first line rather than by a
# substring test on "base", which could also discard matching data lines.
header = ut.first()
filteredRows = ut.filter(lambda line: line != header).map(lambda line: line.split(","))
# reduceByKey combines two *values* that share the same key, so the reducer's
# arguments are named as values (not key/value).
filteredRows.map(lambda row: (row[0], int(row[2]))).reduceByKey(lambda a, b: a + b).collect()

[('B02598', 58653),
 ('B02682', 71431),
 ('B02765', 22575),
 ('B02617', 79758),
 ('B02764', 217290),
 ('B02512', 13125)]

In [54]:
# Total number of trips (column 3) per dispatching base.

# Drop the header by comparing with the file's first line rather than by a
# substring test on "base", which could also discard matching data lines.
header = ut.first()
filteredRows = ut.filter(lambda line: line != header).map(lambda line: line.split(","))
# reduceByKey combines two *values* that share the same key, so the reducer's
# arguments are named as values (not key/value).
filteredRows.map(lambda row: (row[0], int(row[3]))).reduceByKey(lambda a, b: a + b).collect()

[('B02598', 540791),
 ('B02682', 662509),
 ('B02765', 193670),
 ('B02617', 725025),
 ('B02764', 1914449),
 ('B02512', 93786)]

In [57]:
# Sanity check: the first three rows after the header line has been removed.
filteredRows.take(3)

[['B02512', '1/1/2015', '190', '1132'],
 ['B02765', '1/1/2015', '225', '1765'],
 ['B02764', '1/1/2015', '3427', '29421']]

In [62]:
# Rank the bases by total trips, largest first.
# takeOrdered() sorts ascending by the key function, so the total is negated
# to get a descending order.
trip_totals = filteredRows.map(lambda row: (row[0], int(row[3]))).reduceByKey(lambda a, b: a + b)
trip_totals.takeOrdered(10, key=lambda pair: -pair[1])

[('B02764', 1914449),
 ('B02617', 725025),
 ('B02682', 662509),
 ('B02598', 540791),
 ('B02765', 193670),
 ('B02512', 93786)]

# Read as pandas dataframe

In [63]:
# Load the same CSV file into a pandas DataFrame to compare with the RDD results.
df = pd.read_csv("uber-tlc-foil-response-master/Uber-Jan-Feb-FOIL.csv")

In [64]:
# Preview the first three rows of the pandas DataFrame.
df.head(3)

Unnamed: 0,dispatching_base_number,date,active_vehicles,trips
0,B02512,1/1/2015,190,1132
1,B02765,1/1/2015,225,1765
2,B02764,1/1/2015,3427,29421


In [70]:
# Per-base totals of active_vehicles and trips, largest active_vehicles total first.
# - DataFrame.sort() is deprecated (apparently the source of the warning fragment
#   in this cell's output) and was later removed; sort_values() is its replacement.
# - The numeric columns are selected explicitly so the 'date' strings are never
#   passed to sum(), whatever the pandas version.
(
    df.groupby('dispatching_base_number')[['active_vehicles', 'trips']]
    .sum()
    .reset_index()
    .sort_values('active_vehicles', ascending=False)
)

  if __name__ == '__main__':


Unnamed: 0,dispatching_base_number,active_vehicles,trips
4,B02764,217290,1914449
2,B02617,79758,725025
3,B02682,71431,662509
1,B02598,58653,540791
5,B02765,22575,193670
0,B02512,13125,93786


#  Load CSV with PySpark SQL 

In [71]:
# Load the CSV into a Spark DataFrame through the spark-csv data source.

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

# header='true': take the column names from the first line of the file.
# inferschema='true': ask the reader to infer column types instead of
# leaving every column as a string.
csv_reader = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true')
df_test = csv_reader.load('uber-tlc-foil-response-master/Uber-Jan-Feb-FOIL.csv')

In [72]:
# Print the leading rows of the Spark DataFrame as a text table.
df_test.show()

+-----------------------+--------+---------------+-----+
|dispatching_base_number|    date|active_vehicles|trips|
+-----------------------+--------+---------------+-----+
|                 B02512|1/1/2015|            190| 1132|
|                 B02765|1/1/2015|            225| 1765|
|                 B02764|1/1/2015|           3427|29421|
|                 B02682|1/1/2015|            945| 7679|
|                 B02617|1/1/2015|           1228| 9537|
|                 B02598|1/1/2015|            870| 6903|
|                 B02598|1/2/2015|            785| 4768|
|                 B02617|1/2/2015|           1137| 7065|
|                 B02512|1/2/2015|            175|  875|
|                 B02682|1/2/2015|            890| 5506|
|                 B02765|1/2/2015|            196| 1001|
|                 B02764|1/2/2015|           3147|19974|
|                 B02765|1/3/2015|            201| 1526|
|                 B02617|1/3/2015|           1188|10664|
|                 B02598|1/3/20