# Dataframes from CSV files in Spark

### Source
https://www.nodalpoint.com/spark-dataframes-from-csv-files/

In [1]:
from pyspark import SparkContext
sc = SparkContext("local", "test App")

from pyspark.sql import SQLContext
from pyspark.sql.types import *
sqlContext = SQLContext(sc)

In [2]:
# 讀檔方式 1
df1 = sqlContext.read\
                .format("csv")\
                .option("header", "true")\
                .load("nyctaxisub.csv")

In [3]:
# 讀檔方式 2
df = sqlContext.read.load("nyctaxisub.csv",
                          format="csv",
                          header="true",
                          inferSchema="true")
df.count()

249999

In [4]:
df.dtypes

[('_id', 'string'),
 ('_rev', 'string'),
 ('dropoff_datetime', 'timestamp'),
 ('dropoff_latitude', 'double'),
 ('dropoff_longitude', 'double'),
 ('hack_license', 'string'),
 ('medallion', 'string'),
 ('passenger_count', 'int'),
 ('pickup_datetime', 'timestamp'),
 ('pickup_latitude', 'double'),
 ('pickup_longitude', 'double'),
 ('rate_code', 'int'),
 ('store_and_fwd_flag', 'string'),
 ('trip_distance', 'double'),
 ('trip_time_in_secs', 'int'),
 ('vendor_id', 'string')]

In [5]:
# 對欄位做型態轉換 (其實這邊不必要，因為 timestamp 的判斷是正確的) 和更名
df = (df.withColumn("dropoff_datetime", df.dropoff_datetime.cast("timestamp"))
        .withColumn("pickup_datetime", df.pickup_datetime.cast("timestamp"))
        .withColumnRenamed("_id", "id")
        .withColumnRenamed("_rev", "rev")
        .withColumnRenamed("dropoff_latitude", "dropoff_lat")
        .withColumnRenamed("dropoff_longitude", "dropoff_long")
        .withColumnRenamed("pickup_latitude", "pickup_lat")
        .withColumnRenamed("pickup_longitude", "pickup_long"))
df.dtypes

[('id', 'string'),
 ('rev', 'string'),
 ('dropoff_datetime', 'timestamp'),
 ('dropoff_lat', 'double'),
 ('dropoff_long', 'double'),
 ('hack_license', 'string'),
 ('medallion', 'string'),
 ('passenger_count', 'int'),
 ('pickup_datetime', 'timestamp'),
 ('pickup_lat', 'double'),
 ('pickup_long', 'double'),
 ('rate_code', 'int'),
 ('store_and_fwd_flag', 'string'),
 ('trip_distance', 'double'),
 ('trip_time_in_secs', 'int'),
 ('vendor_id', 'string')]

In [6]:
df.describe().show() # 對數值欄位做 summary statistics

+-------+--------------------+--------------------+-----------------+------------------+--------------------+--------------------+------------------+-----------------+------------------+------------------+------------------+-----------------+-----------------+---------+
|summary|                  id|                 rev|      dropoff_lat|      dropoff_long|        hack_license|           medallion|   passenger_count|       pickup_lat|       pickup_long|         rate_code|store_and_fwd_flag|    trip_distance|trip_time_in_secs|vendor_id|
+-------+--------------------+--------------------+-----------------+------------------+--------------------+--------------------+------------------+-----------------+------------------+------------------+------------------+-----------------+-----------------+---------+
|  count|              249999|              249999|           249999|            249999|              249999|              249999|            249999|           249999|            249999| 

In [7]:
type(df.describe())

pyspark.sql.dataframe.DataFrame

In [8]:
# 注意每個欄位的型態其實都是 string
df.describe().dtypes

[('summary', 'string'),
 ('id', 'string'),
 ('rev', 'string'),
 ('dropoff_lat', 'string'),
 ('dropoff_long', 'string'),
 ('hack_license', 'string'),
 ('medallion', 'string'),
 ('passenger_count', 'string'),
 ('pickup_lat', 'string'),
 ('pickup_long', 'string'),
 ('rate_code', 'string'),
 ('store_and_fwd_flag', 'string'),
 ('trip_distance', 'string'),
 ('trip_time_in_secs', 'string'),
 ('vendor_id', 'string')]

In [9]:
# 修改 describe() 傳回的精度
def prettySummary(df):
    '''
    Neat summary statistics of a Spark dataframe
    Args:
    pyspark.sql.dataframe.DataFrame (df): input dataframe
    Returns:
    pandas.core.frame.DataFrame: a pandas dataframe with the summary statistics of df
    '''
    import pandas as pd
    temp = df.describe().toPandas()
    temp.iloc[1:3, 1:] = temp.iloc[1:3, 1:].convert_objects(convert_numeric=True)
    pd.options.display.float_format = "{:, .2f}".format
    return temp

In [10]:
prettySummary(df)

AttributeError: 'DataFrame' object has no attribute 'convert_objects'

In [11]:
df.filter(df.dropoff_lat < 0).count()

3

In [12]:
df.filter(df.dropoff_lat < 10).count()

4715

In [13]:
df.filter(df.trip_distance == 0.0).count()

290

In [14]:
df.filter(df.dropoff_long > -50).count()

4751

In [15]:
df.filter(df[13] == 0.0).count() # 用欄位的數值 index 也行

290

In [16]:
# 兩個以上的條件要用 & 合併且每個條件要用括號括起來
df.filter((df.dropoff_lat < 10) & (df.dropoff_long > -50)).count()

4710

In [17]:
df.filter(df.dropoff_lat < 10)\
  .filter(df.dropoff_long > -50)\
  .count() # 也可以用連續的 filter 組合在一起

4710

In [18]:
from pyspark.sql.functions import *
df.select(max("pickup_datetime")).show() # max(), min() 也可以用在 timestamp 上面

+--------------------+
|max(pickup_datetime)|
+--------------------+
| 2013-11-26 23:46:38|
+--------------------+



In [19]:
df.select(max("dropoff_datetime")).show()

+---------------------+
|max(dropoff_datetime)|
+---------------------+
|  2013-11-26 23:59:57|
+---------------------+



In [20]:
df.select(min("pickup_datetime")).show()

+--------------------+
|min(pickup_datetime)|
+--------------------+
| 2013-01-10 21:27:01|
+--------------------+



In [21]:
df.select(min("dropoff_datetime")).show()

+---------------------+
|min(dropoff_datetime)|
+---------------------+
|  2013-01-11 00:00:00|
+---------------------+



In [22]:
# 檢查時間 pickup_datetime 必須要比 dropoff_datetime 還早才行
df_dates = df.select(df["pickup_datetime"], df["dropoff_datetime"])
df_dates.show(5, truncate=False)

+-------------------+-------------------+
|pickup_datetime    |dropoff_datetime   |
+-------------------+-------------------+
|2013-01-11 21:48:00|2013-01-11 22:03:00|
|2013-01-11 04:07:00|2013-01-11 04:28:00|
|2013-01-11 21:46:00|2013-01-11 22:02:00|
|2013-01-11 09:44:00|2013-01-11 10:03:00|
|2013-01-11 21:48:00|2013-01-11 22:02:00|
+-------------------+-------------------+
only showing top 5 rows



In [23]:
df_dates = df_dates.withColumn("pickup_lst", df_dates.pickup_datetime < df_dates.dropoff_datetime)

In [24]:
df_dates.filter(~df_dates.pickup_lst).count() # 用 ~ 來表示 not

0

In [25]:
df.groupBy(month(df.dropoff_datetime)).count().show() # 用 month() 可以取出月份的資訊

+-----------------------+-----+
|month(dropoff_datetime)|count|
+-----------------------+-----+
|                      1|91075|
|                     11|94007|
|                      2|64917|
+-----------------------+-----+



In [26]:
type(df.select(max("pickup_datetime")).show()) # show() 是用來看結果的，不會回傳任何東西

+--------------------+
|max(pickup_datetime)|
+--------------------+
| 2013-11-26 23:46:38|
+--------------------+



NoneType

In [27]:
df.select(max("pickup_datetime")).collect() # 要回傳東西的話，需要用 collect() 會回傳一個 Row 的 list

[Row(max(pickup_datetime)=datetime.datetime(2013, 11, 26, 23, 46, 38))]

In [28]:
type(df.select(max("pickup_datetime")).collect())

list

In [29]:
max_pickup = df.select(max("pickup_datetime")).collect()[0][0]
max_pickup

datetime.datetime(2013, 11, 26, 23, 46, 38)

In [30]:
from datetime import *
max_pickup < datetime(2013, 12, 31)

True

In [31]:
df.registerTempTable("taxi") # 註冊一個暫時的表格，這樣就可以直接使用 SQL query
sqlContext.sql("SELECT vendor_id, COUNT(*) FROM taxi GROUP BY vendor_id").show()

+---------+--------+
|vendor_id|count(1)|
+---------+--------+
|      CMT|  114387|
|      VTS|  135612|
+---------+--------+



In [32]:
sqlContext.sql("SELECT vendor_id, COUNT(*) FROM taxi GROUP BY vendor_id").collect()

[Row(vendor_id='CMT', count(1)=114387), Row(vendor_id='VTS', count(1)=135612)]

In [33]:
type(sqlContext.sql("SELECT vendor_id, COUNT(*) FROM taxi GROUP BY vendor_id").collect())

list

In [34]:
# withColumnRenamed() 也可以用在 sql context
sqlContext.sql("SELECT vendor_id, COUNT(*) FROM taxi GROUP BY vendor_id").withColumnRenamed("_c1", "count").show()

+---------+--------+
|vendor_id|count(1)|
+---------+--------+
|      CMT|  114387|
|      VTS|  135612|
+---------+--------+

