In [27]:
import csv

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *


def IntegerSafe(value):
    """Convert ``value`` to an ``int``, or return ``None`` if it cannot be converted.

    Args:
        value: Anything accepted by ``int()``, e.g. a numeric string.

    Returns:
        The integer value, or ``None`` when ``int(value)`` fails because the
        input has an unsupported type, is not numeric text, or is an
        infinite float.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are mapped to None; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

# Reuse the SparkContext already active in this kernel, or start one if none exists.
sc = SparkContext.getOrCreate()
# Obtain (or create) the SparkSession; it is used below to build DataFrames.
ss = SparkSession.builder.getOrCreate()


# business = sc.textFile("../Data/filtered_registered_business_sf.csv")\
#              .map(lambda x : x.split(','))\
#              .map(lambda x : (IntegerSafe(x[0]), x[1], x[2], x[3], x[4]))
              

# schema = StructType([ StructField("zip", IntegerType(), True),
#                       StructField("name", StringType(), False),
#                       StructField("street", StringType(), True),
#                       StructField("city", StringType(), True),
#                       StructField("state", StringType(), True)
#                     ])

# business_df = ss.createDataFrame(business, schema)

In [28]:
# Load the registered-business CSV as an RDD of [zip, name, street, city, state].
# Each line is parsed with csv.reader so that quoted fields containing commas
# (e.g. "Smith, John") stay in one column instead of shifting the row.
# NOTE(review): textFile splits on newlines, so quoted fields spanning several
# lines are still not supported — confirm the data has none.
business = (
    sc.textFile('../Data/SF_business/filtered_registered_business_sf.csv')
    # Blank lines carry no record and would otherwise fail on fields[0].
    .filter(lambda line: line.strip() != '')
    .map(lambda line: next(csv.reader([line])))
    # zip is converted defensively: non-numeric values become None (nullable column).
    .map(lambda fields: [IntegerSafe(fields[0]), fields[1],
                         fields[2], fields[3], fields[4]])
)

# Explicit schema: only `name` is declared non-nullable.
schema = StructType([
    StructField('zip', IntegerType(), True),
    StructField('name', StringType(), False),
    StructField('street', StringType(), True),
    StructField('city', StringType(), True),
    StructField('state', StringType(), True),
])

business_df = ss.createDataFrame(business, schema)
business_df.show(5)

+-----+--------------------+--------------------+-------------+-----+
|  zip|                name|              street|         city|state|
+-----+--------------------+--------------------+-------------+-----+
|94123|   Tournahu George L|   3301 Broderick St|San Francisco|   CA|
|94124|Stephens Institut...|    2225 Jerrold Ave|San Francisco|   CA|
|94105|Stephens Institut...|180 New Montgomer...|San Francisco|   CA|
|94108|Stephens Institut...|       540 Powell St|San Francisco|   CA|
|94107|Stephens Institut...|     460 Townsend St|San Francisco|   CA|
+-----+--------------------+--------------------+-------------+-----+
only showing top 5 rows



## Print the 5 zip codes with the most businesses

In [36]:
# Count business names per zip code and show the five largest groups.
(
    business_df
    .groupBy('zip')
    .agg(count('name').alias('count'))
    .orderBy('count', ascending=False)
    .show(5)
)

                                                                                

+-----+-----+
|  zip|count|
+-----+-----+
|94110|12459|
|94103|10919|
|94109| 9623|
|94107| 9394|
|94102| 7962|
+-----+-----+
only showing top 5 rows



In [19]:
# Same ranking using the grouped count() shortcut, with the output columns
# selected explicitly before display.
(
    business_df
    .groupBy('zip')
    .count()
    .orderBy('count', ascending=False)
    .select('zip', 'count')
    .show(5)
)

+-----+-----+
|  zip|count|
+-----+-----+
|94110|12459|
|94103|10919|
|94109| 9623|
|94107| 9394|
|94102| 7962|
+-----+-----+
only showing top 5 rows



In [22]:
# Add a boolean `onHoward` column (street contains the substring 'Howard')
# and keep only the rows where it is true. The column is already boolean,
# so it is used as the predicate directly rather than compared to 1.
(
    business_df
    .withColumn('onHoward', business_df['street'].contains('Howard'))
    .where(col('onHoward'))
    .show(5)
)

+-----+--------------------+--------------+-------------+-----+--------+
|  zip|                name|        street|         city|state|onHoward|
+-----+--------------------+--------------+-------------+-----+--------+
|94105|Stephens Institut...| 631 Howard St|San Francisco|   CA|    true|
|94103|Anderson Enterpri...|1525 Howard St|San Francisco|   CA|    true|
|94103|Avis Rent A Car S...| 821 Howard St|San Francisco|   CA|    true|
|94103|German Motors Cor...|1675 Howard St|San Francisco|   CA|    true|
|94103|German Motors Cor...|1675 Howard St|San Francisco|   CA|    true|
+-----+--------------------+--------------+-------------+-----+--------+
only showing top 5 rows



In [16]:
# Businesses per zip code, largest first, top five rows.
(
    business_df
    .groupBy('zip')
    .count()
    .sort('count', ascending=False)
    .show(5)
)

[Stage 14:>                                                         (0 + 2) / 2]

+-----+-----+
|  zip|count|
+-----+-----+
|94110|12459|
|94103|10919|
|94109| 9623|
|94107| 9394|
|94102| 7962|
+-----+-----+
only showing top 5 rows



                                                                                

## Create a column named "onHoward" indicating whether the business is on Howard Street

In [22]:
# Add a boolean `onHoward` column (street contains the substring 'Howard')
# and show the first matching rows. The boolean column is the filter
# predicate itself; no comparison against 1 is needed.
(
    business_df
    .withColumn('onHoward', business_df['street'].contains('Howard'))
    .filter(col('onHoward'))
    .show(5)
)

+-----+--------------------+--------------+-------------+-----+--------+
|  zip|                name|        street|         city|state|onHoward|
+-----+--------------------+--------------+-------------+-----+--------+
|94105|Stephens Institut...| 631 Howard St|San Francisco|   CA|    true|
|94103|Anderson Enterpri...|1525 Howard St|San Francisco|   CA|    true|
|94103|Avis Rent A Car S...| 821 Howard St|San Francisco|   CA|    true|
|94103|German Motors Cor...|1675 Howard St|San Francisco|   CA|    true|
|94103|German Motors Cor...|1675 Howard St|San Francisco|   CA|    true|
+-----+--------------------+--------------+-------------+-----+--------+
only showing top 5 rows



In [37]:
# Shut down the SparkContext now that the analysis is done. Cells that use
# `sc`, `ss` or `business_df` need the context recreated before they can run again.
sc.stop()