## Importing dependencies

In [30]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

import seaborn as sns
import pandas as pd
import numpy as np

## Loading dataset

In [2]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [3]:
data = sns.load_dataset('taxis')

In [4]:
data.head()

Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,tolls,total,color,payment,pickup_zone,dropoff_zone,pickup_borough,dropoff_borough
0,2019-03-23 20:21:09,2019-03-23 20:27:24,1,1.6,7.0,2.15,0.0,12.95,yellow,credit card,Lenox Hill West,UN/Turtle Bay South,Manhattan,Manhattan
1,2019-03-04 16:11:55,2019-03-04 16:19:00,1,0.79,5.0,0.0,0.0,9.3,yellow,cash,Upper West Side South,Upper West Side South,Manhattan,Manhattan
2,2019-03-27 17:53:01,2019-03-27 18:00:25,1,1.37,7.5,2.36,0.0,14.16,yellow,credit card,Alphabet City,West Village,Manhattan,Manhattan
3,2019-03-10 01:23:59,2019-03-10 01:49:51,1,7.7,27.0,6.15,0.0,36.95,yellow,credit card,Hudson Sq,Yorkville West,Manhattan,Manhattan
4,2019-03-30 13:27:42,2019-03-30 13:37:14,3,2.16,9.0,1.1,0.0,13.4,yellow,credit card,Midtown East,Yorkville West,Manhattan,Manhattan


## Creating a spark session and converting to a spark dataframe

In [23]:
spark = SparkSession.builder.appName("Learning Spark").getOrCreate()

data_spark = spark.createDataFrame(data)

  if should_localize and is_datetime64tz_dtype(s.dtype) and s.dt.tz is not None:


In [24]:
data_spark.show(1)

+-------------------+-------------------+----------+--------+----+----+-----+-----+------+-----------+---------------+-------------------+--------------+---------------+
|             pickup|            dropoff|passengers|distance|fare| tip|tolls|total| color|    payment|    pickup_zone|       dropoff_zone|pickup_borough|dropoff_borough|
+-------------------+-------------------+----------+--------+----+----+-----+-----+------+-----------+---------------+-------------------+--------------+---------------+
|2019-03-23 20:21:09|2019-03-23 20:27:24|         1|     1.6| 7.0|2.15|  0.0|12.95|yellow|credit card|Lenox Hill West|UN/Turtle Bay South|     Manhattan|      Manhattan|
+-------------------+-------------------+----------+--------+----+----+-----+-----+------+-----------+---------------+-------------------+--------------+---------------+
only showing top 1 row



## Comparing operations in pandas and pyspark:
##### 1. pandasDataFrame.info() vs pysparkDataFrame.printSchema()

In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6433 entries, 0 to 6432
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   pickup           6433 non-null   datetime64[ns]
 1   dropoff          6433 non-null   datetime64[ns]
 2   passengers       6433 non-null   int64         
 3   distance         6433 non-null   float64       
 4   fare             6433 non-null   float64       
 5   tip              6433 non-null   float64       
 6   tolls            6433 non-null   float64       
 7   total            6433 non-null   float64       
 8   color            6433 non-null   object        
 9   payment          6389 non-null   object        
 10  pickup_zone      6407 non-null   object        
 11  dropoff_zone     6388 non-null   object        
 12  pickup_borough   6407 non-null   object        
 13  dropoff_borough  6388 non-null   object        
dtypes: datetime64[ns](2), float64(5), int64(

In [26]:
data_spark.printSchema()

root
 |-- pickup: timestamp (nullable = true)
 |-- dropoff: timestamp (nullable = true)
 |-- passengers: long (nullable = true)
 |-- distance: double (nullable = true)
 |-- fare: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- total: double (nullable = true)
 |-- color: string (nullable = true)
 |-- payment: string (nullable = true)
 |-- pickup_zone: string (nullable = true)
 |-- dropoff_zone: string (nullable = true)
 |-- pickup_borough: string (nullable = true)
 |-- dropoff_borough: string (nullable = true)



## String split operations in one column

In [34]:
data["dropoff_zone"].value_counts(dropna=False).to_dict()

{'Upper East Side North': 245,
 'Murray Hill': 220,
 'Midtown Center': 215,
 'Upper East Side South': 177,
 'Midtown East': 176,
 'Times Sq/Theatre District': 160,
 'Lincoln Square East': 156,
 'Clinton East': 151,
 'East Village': 150,
 'Penn Station/Madison Sq West': 148,
 'Union Sq': 139,
 'Upper West Side South': 135,
 'Lenox Hill West': 134,
 'Midtown North': 131,
 'Upper West Side North': 130,
 'East Chelsea': 120,
 'Midtown South': 116,
 'Lenox Hill East': 112,
 'Gramercy': 112,
 'Yorkville West': 107,
 'East Harlem South': 104,
 'West Village': 101,
 'West Chelsea/Hudson Yards': 101,
 'Sutton Place/Turtle Bay North': 99,
 'Garment District': 97,
 'UN/Turtle Bay South': 95,
 'TriBeCa/Civic Center': 81,
 'Central Park': 81,
 'Greenwich Village North': 78,
 'Kips Bay': 76,
 'Flatiron': 74,
 'Lower East Side': 73,
 'Central Harlem North': 69,
 'Lincoln Square West': 69,
 'Clinton West': 66,
 'East Harlem North': 65,
 'LaGuardia Airport': 65,
 'Yorkville East': 65,
 'Central Harlem'

In [40]:
def split_function(x):
    try:
        try:
            if type(x)!= float:
                (v1,v2) = x.split('/')
                return v1,v2
        except Exception as e:
            if type(x)!= float:
                v1 = x.split('/')
                v2 = np.nan
                return v1,v2
    except TypeError as te:
        return (np.nan,np.nan)
        
data["area"]=data["dropoff_zone"].apply(lambda x:split_function(x)[0])
data["sub_zone"]=data["dropoff_zone"].apply(lambda x: split_function(x)[1])

TypeError: 'NoneType' object is not subscriptable

In [27]:
split_col = f.split(data_spark["dropoff_zone"],pattern="/")
data_spark=data_spark.withColumn("area",split_col.getItem(0)).withColumn("sub-zone", split_col.getItem(1))

In [28]:
data_spark.show(1)

+-------------------+-------------------+----------+--------+----+----+-----+-----+------+-----------+---------------+-------------------+--------------+---------------+----+----------------+
|             pickup|            dropoff|passengers|distance|fare| tip|tolls|total| color|    payment|    pickup_zone|       dropoff_zone|pickup_borough|dropoff_borough|area|        sub-zone|
+-------------------+-------------------+----------+--------+----+----+-----+-----+------+-----------+---------------+-------------------+--------------+---------------+----+----------------+
|2019-03-23 20:21:09|2019-03-23 20:27:24|         1|     1.6| 7.0|2.15|  0.0|12.95|yellow|credit card|Lenox Hill West|UN/Turtle Bay South|     Manhattan|      Manhattan|  UN|Turtle Bay South|
+-------------------+-------------------+----------+--------+----+----+-----+-----+------+-----------+---------------+-------------------+--------------+---------------+----+----------------+
only showing top 1 row

