# PySpark Simple Example : WordCount

## findspark

In [1]:
# findspark.init() runs before any pyspark import in this notebook; it is
# expected to make the local Spark installation importable (see findspark docs).
import findspark
findspark.init()
print("Done")  # marker that initialisation finished without raising

Done


## import

In [70]:
import glob
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

## Create SparkContext and SparkSession Object

In [4]:
# Reuse the running SparkContext when one already exists, so re-running this
# cell does not fail with "Cannot run multiple SparkContexts at once";
# otherwise start a local context named 'WordCount App'.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster('local').setAppName('WordCount App')
)

In [6]:
# Obtain the session through the documented `SparkSession.builder` entry point
# rather than instantiating the Builder class by hand. getOrCreate() returns
# the existing session if there is one.
spark = SparkSession.builder.getOrCreate()

## Read an unstructured text file with the SparkContext object 'sc'

In [19]:
# Load the text file as an RDD of strings. The collect() output in the next
# cell shows a single element, i.e. the file holds one long line.
rdd1 = sc.textFile('input/python.txt')

In [55]:
# Bring the whole RDD back to the driver to inspect it (the input is tiny).
rdd1.collect()

['Python Lists allow us to hold items of heterogeneous types. In this article, we will learn how to create a list in Python; access the list items; find the number of items in the list, how to add an item to list; how to remove an item from the list; loop through list items; sorting a list, reversing a list; and many more transformation and aggregation actions on Python Lists']

## Processing Data with RDD MapReduce functions

In [52]:
# Word count: split each line into tokens, pair every token with 1, sum the
# ones per token, then order by count with the highest first.
# Tokens are split on single spaces only, so punctuation stays attached
# (the output lists 'list' and 'list;' as separate words).
# take(10) fetches only the first ten pairs of the sorted RDD instead of
# collecting every pair to the driver and slicing the list afterwards.
(
    rdd1.flatMap(lambda line: line.split(' '))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda kv: kv[1], ascending=False)
    .take(10)
)

[('to', 5),
 ('the', 4),
 ('how', 3),
 ('a', 3),
 ('list', 3),
 ('list;', 3),
 ('Python', 2),
 ('Lists', 2),
 ('items', 2),
 ('of', 2)]

In [21]:
# Same inspection as the earlier collect() cell; the output is identical,
# showing that the word-count pipeline did not modify rdd1.
rdd1.collect()

['Python Lists allow us to hold items of heterogeneous types. In this article, we will learn how to create a list in Python; access the list items; find the number of items in the list, how to add an item to list; how to remove an item from the list; loop through list items; sorting a list, reversing a list; and many more transformation and aggregation actions on Python Lists']

In [25]:
# Confirm that sc.textFile returned an RDD.
type(rdd1)

pyspark.rdd.RDD

In [54]:
# Same word count as above, one named RDD per stage so each intermediate
# result can be inspected on its own.
rdd2 = rdd1.flatMap(lambda line: line.split(' '))           # lines  -> tokens
rdd3 = rdd2.map(lambda word: (word, 1))                     # token  -> (token, 1)
rdd4 = rdd3.reduceByKey(lambda a, b: a + b)                 # sum counts per token
rdd5 = rdd4.sortBy(lambda kv: kv[1], ascending=False)       # highest count first
# take(5) returns only the first five pairs instead of collecting the whole
# RDD to the driver and slicing it.
rdd5.take(5)

[('to', 5), ('the', 4), ('how', 3), ('a', 3), ('list', 3)]

## Read a structured CSV file with the SparkSession object 'spark'

In [66]:
# Read the delimited file into a DataFrame, taking column names from the
# first row. No schema is supplied, so values come back as strings (see the
# quoted '961' further down).
df1 = spark.read.option('header', True).csv('input/traffic_sim.txt')

In [69]:
# Confirm that spark.read.csv returned a DataFrame.
type(df1)

pyspark.sql.dataframe.DataFrame

In [67]:
# Preview the first ten rows without truncating cell contents.
df1.show(n=10, truncate=False)

+-----------------------+--------+---------------+-----+
|dispatching_base_number|date    |active_vehicles|trips|
+-----------------------+--------+---------------+-----+
|B02512                 |1/1/2015|190            |1132 |
|B02765                 |1/1/2015|225            |1765 |
|B02764                 |1/1/2015|3427           |29421|
|B02682                 |1/1/2015|945            |7679 |
|B02617                 |1/1/2015|1228           |9537 |
|B02598                 |1/1/2015|870            |6903 |
|B02598                 |1/2/2015|785            |4768 |
|B02617                 |1/2/2015|1137           |7065 |
|B02512                 |1/2/2015|175            |875  |
|B02682                 |1/2/2015|890            |5506 |
+-----------------------+--------+---------------+-----+
only showing top 10 rows



In [68]:
# Total number of data rows in the file.
df1.count()

354

## Processing Data with DataFrame functions

In [92]:
from pyspark.sql.functions import to_timestamp

In [91]:
# Column names taken from the CSV header row.
df1.columns

['dispatching_base_number', 'date', 'active_vehicles', 'trips']

In [106]:
# Parse the textual 'date' column (pattern M/d/yyyy) into a new 'timestamp'
# column; the original 'date' column is kept.
df2_ts = df1.withColumn(
    "timestamp",
    F.to_timestamp(F.col("date"), "M/d/yyyy"),
)
df2_ts.show(n=5)

+-----------------------+--------+---------------+-----+-------------------+
|dispatching_base_number|    date|active_vehicles|trips|          timestamp|
+-----------------------+--------+---------------+-----+-------------------+
|                 B02512|1/1/2015|            190| 1132|2015-01-01 00:00:00|
|                 B02765|1/1/2015|            225| 1765|2015-01-01 00:00:00|
|                 B02764|1/1/2015|           3427|29421|2015-01-01 00:00:00|
|                 B02682|1/1/2015|            945| 7679|2015-01-01 00:00:00|
|                 B02617|1/1/2015|           1228| 9537|2015-01-01 00:00:00|
+-----------------------+--------+---------------+-----+-------------------+
only showing top 5 rows



In [149]:
# Derive calendar parts from 'timestamp' and order the rows by month,
# latest month first.
df3_ts_y_m_d = (
    df2_ts
    .withColumn('year', F.year('timestamp'))
    .withColumn('month', F.month('timestamp'))
    .withColumn('dayofmonth', F.dayofmonth('timestamp'))
    .withColumn('weekofyear', F.weekofyear('timestamp'))
    .sort(F.desc('month'))
)
df3_ts_y_m_d.show(n=5)

+-----------------------+--------+---------------+-----+-------------------+----+-----+----------+----------+
|dispatching_base_number|    date|active_vehicles|trips|          timestamp|year|month|dayofmonth|weekofyear|
+-----------------------+--------+---------------+-----+-------------------+----+-----+----------+----------+
|                 B02598|2/1/2015|            961| 9499|2015-02-01 00:00:00|2015|    2|         1|         5|
|                 B02764|2/1/2015|           3740|37468|2015-02-01 00:00:00|2015|    2|         1|         5|
|                 B02682|2/1/2015|           1214|12436|2015-02-01 00:00:00|2015|    2|         1|         5|
|                 B02512|2/1/2015|            193| 1377|2015-02-01 00:00:00|2015|    2|         1|         5|
|                 B02765|2/1/2015|            289| 2672|2015-02-01 00:00:00|2015|    2|         1|         5|
+-----------------------+--------+---------------+-----+-------------------+----+-----+----------+----------+
only showi

## How many distinct years are recorded in the traffic data?

In [150]:
# List the distinct values of 'year'; the output shows only 2015.
df3_ts_y_m_d.select('year').distinct().show()

+----+
|year|
+----+
|2015|
+----+



## Which month saw the most trips in 2015?

In [151]:
# NOTE: this counts rows, not distinct months - the result (354) equals
# df1.count(). It does not answer the question in the heading by itself.
df3_ts_y_m_d.select('month').count()

354

In [152]:
# Peek at one raw value from the first row. Position 2 is 'active_vehicles'.
# The quoted result shows the CSV values are strings, which is why 'trips'
# is cast to an integer in the next cell before summing.
df3_ts_y_m_d.collect()[0]['active_vehicles']

'961'

In [153]:
from pyspark.sql.types import IntegerType

# Add 'numtrips': the string column 'trips' cast to an integer so it can be summed.
df4_ts_y_m_d_int = df3_ts_y_m_d.withColumn(
    'numtrips',
    F.col('trips').cast(IntegerType()),
)

In [164]:
# Preview the frame with the new integer 'numtrips' column.
df4_ts_y_m_d_int.show(n=5)

+-----------------------+--------+---------------+-----+-------------------+----+-----+----------+----------+--------+
|dispatching_base_number|    date|active_vehicles|trips|          timestamp|year|month|dayofmonth|weekofyear|numtrips|
+-----------------------+--------+---------------+-----+-------------------+----+-----+----------+----------+--------+
|                 B02598|2/1/2015|            961| 9499|2015-02-01 00:00:00|2015|    2|         1|         5|    9499|
|                 B02764|2/1/2015|           3740|37468|2015-02-01 00:00:00|2015|    2|         1|         5|   37468|
|                 B02682|2/1/2015|           1214|12436|2015-02-01 00:00:00|2015|    2|         1|         5|   12436|
|                 B02512|2/1/2015|            193| 1377|2015-02-01 00:00:00|2015|    2|         1|         5|    1377|
|                 B02765|2/1/2015|            289| 2672|2015-02-01 00:00:00|2015|    2|         1|         5|    2672|
+-----------------------+--------+--------------

In [163]:
# Total trips per month, busiest month first.
(
    df4_ts_y_m_d_int
    .select('month', 'numtrips')
    .groupBy('month')
    .sum('numtrips')
    .sort(F.desc('sum(numtrips)'))
    .show()
)

+-----+-------------+
|month|sum(numtrips)|
+-----+-------------+
|    2|      2221581|
|    1|      1908649|
+-----+-------------+



## Which day saw the most trips in February?

In [177]:
# Total trips per day of month for February (month == 2), busiest day first.
df5 = (
    df4_ts_y_m_d_int
    .select('dayofmonth', 'numtrips')
    .where(df4_ts_y_m_d_int.month == 2)
    .groupBy('dayofmonth')
    .sum('numtrips')
    .sort(F.desc('sum(numtrips)'))
)
# Ask for up to 100 rows so every day is printed rather than the default top 20.
df5.show(n=100)

+----------+-------------+
|dayofmonth|sum(numtrips)|
+----------+-------------+
|        20|       100915|
|        14|       100345|
|        21|        98380|
|        13|        98024|
|        15|        89401|
|        27|        88806|
|        19|        88757|
|        28|        88181|
|         6|        85940|
|        26|        83568|
|        12|        83234|
|         7|        81157|
|         5|        80913|
|        24|        79115|
|         1|        76910|
|        25|        74691|
|        17|        73051|
|        11|        72470|
|        18|        72243|
|        16|        72098|
|        23|        71217|
|         3|        70188|
|         2|        68980|
|         4|        66835|
|        22|        66440|
|        10|        64766|
|         8|        63000|
|         9|        61956|
+----------+-------------+



In [178]:
# The aggregate column gets the auto-generated name 'sum(numtrips)'.
df5.columns

['dayofmonth', 'sum(numtrips)']

In [182]:
# Give the aggregate column a plain name.
# NOTE(review): df5 is grouped by 'dayofmonth', so these are per-day totals
# within February; 'monthly_trips' may be a misleading name - confirm intent.
df6 = df5.withColumnRenamed('sum(numtrips)', 'monthly_trips')

In [183]:
# show() with no arguments prints only the top 20 rows (see the output footer).
df6.show()

+----------+-------------+
|dayofmonth|monthly_trips|
+----------+-------------+
|        20|       100915|
|        14|       100345|
|        21|        98380|
|        13|        98024|
|        15|        89401|
|        27|        88806|
|        19|        88757|
|        28|        88181|
|         6|        85940|
|        26|        83568|
|        12|        83234|
|         7|        81157|
|         5|        80913|
|        24|        79115|
|         1|        76910|
|        25|        74691|
|        17|        73051|
|        11|        72470|
|        18|        72243|
|        16|        72098|
+----------+-------------+
only showing top 20 rows

