In [1]:
# Use findspark to locate the correct pyspark installation.
import findspark
findspark.init()
# import pyspark


In [36]:
import databricks.koalas as ks
import numpy as np
import pandas as pd
import os
import sys

### Read CSV File.

In [4]:
# Location of the sample data: a CSV file inside the relative 'data' folder.
data_dir = 'data'
data_file = 'nyc_restaurant_inspection_results_sample1.csv'
data_path = os.path.join(data_dir, data_file)

In [5]:
# Load the CSV into a koalas DataFrame.
df = ks.read_csv(data_path)

In [6]:
# Load the same CSV into a pandas DataFrame.
pddf = pd.read_csv(data_path)

### Memory usage 

In [7]:
# sys.getsizeof reports the size of the Python object it is handed.
# NOTE(review): for the koalas frame this is presumably only the thin wrapper
# object, not the data held by Spark, so the two figures are not directly
# comparable - confirm.
koalas_bytes = sys.getsizeof(df)
pandas_megabytes = sys.getsizeof(pddf)/10**6  # bytes -> megabytes

print('koalas memory usage is {m} bytes.'.format(m=koalas_bytes))
print('pandas memory usage is {m:.2f} megabytes.'.format(m=pandas_megabytes))

koalas memory usage is 56 bytes.
pandas memory usage is 1.27 megabytes.


##  Selecting Rows and Columns
### Using __loc__

In [8]:
# Label-based selection with loc: both slice endpoints (90 and 100) are included.
df.loc[90:100,'DBA']

90           XIANG ZHI RESTAURANT
91             10 BELOW ICE CREAM
92         NEW H.K. TEA AND SUSHI
93     ELLIE'S DINER & RESTAURANT
94      FORDHAM SEAFOOD & CHICKEN
95                       SAPIENZA
96                SAHARA NEW YORK
97              JERRY'S 637 DINER
98                  SUPREME DINER
99                      PORTOFINO
100               MURPHY'S TAVERN
Name: DBA, dtype: object

### Difference between pandas and koalas in __iloc__ usage

__iloc__ in koalas does not accept an explicit start position in the row slice. 

i.e. __df.iloc[0:100,1]__ or __df.iloc[20:30,1:4]__ will not work in koalas. 

However, __df.iloc[:100,1]__ or __df.iloc[:30,1:4]__ will work.

In [9]:
# koalas iloc does not accept a row start (e.g. df.iloc[0:100,1:4]), so only the stop is given.
df.iloc[:100,1:4]

Unnamed: 0,DBA,BORO,BUILDING
0,GOLDEN KRUST CARIBBEAN BAKERY & GRILL,Brooklyn,1887
1,CARIBBEAN JERKEE'S DELIGHT,Brooklyn,932
2,CARVEL ICE CREAM,Brooklyn,7517
3,ROYAL KING'S PIZZA,Brooklyn,5211
4,NACHO MACHO TACO,Brooklyn,82
5,JIAN ON CHINESE RESTAURANT,Queens,26805
6,BAGEL CLUB,Queens,20521
7,LUNA PIZZERIA,Brooklyn,1115
8,BOHAUS COFFEE AND FLOWERS,Brooklyn,406
9,IL MULINO,Manhattan,86


In [10]:
# loc also accepts a list of column labels; rows up to and including label 5 are returned.
df.loc[:5,['INSPECTION DATE','Census Tract']]

Unnamed: 0,INSPECTION DATE,Census Tract
0,08/16/2017,102200.0
1,05/17/2017,86000.0
2,01/16/2019,6600.0
3,12/19/2018,10000.0
4,05/24/2017,12901.0
5,07/10/2017,157901.0


In [11]:
# Build each boolean mask on its own, then combine them for the row selection.
tract_above_10000 = df['Census Tract'] > 10000
in_brooklyn = df['BORO'] == 'Brooklyn'

df.loc[tract_above_10000 & in_brooklyn,
       ['INSPECTION DATE', 'Census Tract', 'BORO']]

Unnamed: 0,INSPECTION DATE,Census Tract,BORO
0,08/16/2017,102200.0,Brooklyn
1,05/17/2017,86000.0,Brooklyn
4,05/24/2017,12901.0,Brooklyn
7,04/25/2017,55400.0,Brooklyn
8,10/23/2017,27500.0,Brooklyn
15,03/28/2019,41800.0,Brooklyn
26,08/16/2016,42300.0,Brooklyn
27,01/19/2019,31300.0,Brooklyn
30,10/26/2016,50500.0,Brooklyn
31,06/25/2018,33700.0,Brooklyn


## Column Manipulations

### Change column names

In [12]:
# Show the original column labels before they are renamed.
print(df.columns)

Index(['CAMIS', 'DBA', 'BORO', 'BUILDING', 'STREET', 'ZIPCODE', 'PHONE',
       'CUISINE DESCRIPTION', 'INSPECTION DATE', 'ACTION', 'VIOLATION CODE',
       'VIOLATION DESCRIPTION', 'CRITICAL FLAG', 'SCORE', 'GRADE',
       'GRADE DATE', 'RECORD DATE', 'INSPECTION TYPE', 'Latitude', 'Longitude',
       'Community Board', 'Council District', 'Census Tract', 'BIN', 'BBL',
       'NTA'],
      dtype='object')


In [15]:
# Normalise the column labels: lower-case them and turn spaces into underscores.
snake_case_names = [label.lower().replace(' ', '_') for label in df.columns]
df.columns = snake_case_names

# Display the labels to confirm the rename.
df.columns

Index(['camis', 'dba', 'boro', 'building', 'street', 'zipcode', 'phone',
       'cuisine_description', 'inspection_date', 'action', 'violation_code',
       'violation_description', 'critical_flag', 'score', 'grade',
       'grade_date', 'record_date', 'inspection_type', 'latitude', 'longitude',
       'community_board', 'council_district', 'census_tract', 'bin', 'bbl',
       'nta'],
      dtype='object')

In [16]:
# Preview the first rows with the renamed columns.
df.head()

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,cuisine_description,inspection_date,action,violation_code,violation_description,critical_flag,score,grade,grade_date,record_date,inspection_type,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta
0,41687715,GOLDEN KRUST CARIBBEAN BAKERY & GRILL,Brooklyn,1887,ROCKAWAY PARKWAY,11236.0,7182094242,Caribbean,08/16/2017,Violations were cited in the following area(s).,02H,Food not cooled by an approved method whereby ...,Y,43.0,C,08/16/2017,09/18/2019,Cycle Inspection / Re-inspection,40.636304,-73.892472,318.0,46.0,102200.0,3233728.0,3082810000.0,BK50
1,50001450,CARIBBEAN JERKEE'S DELIGHT,Brooklyn,932,UTICA AVE,11203.0,5163438092,Caribbean,05/17/2017,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructe...,N,11.0,A,05/17/2017,09/18/2019,Cycle Inspection / Re-inspection,40.650458,-73.930256,317.0,45.0,86000.0,3102839.0,3046970000.0,BK91
2,50013528,CARVEL ICE CREAM,Brooklyn,7517,3RD AVE,11209.0,7187455200,American,01/16/2019,Violations were cited in the following area(s).,04K,Evidence of rats or live rats present in facil...,Y,10.0,A,01/16/2019,09/18/2019,Cycle Inspection / Initial Inspection,40.631268,-74.027856,310.0,43.0,6600.0,3148708.0,3059400000.0,BK31
3,40396492,ROYAL KING'S PIZZA,Brooklyn,5211,5 AVENUE,11220.0,7184923846,Pizza,12/19/2018,Violations were cited in the following area(s).,04L,Evidence of mice or live mice present in facil...,Y,12.0,A,12/19/2018,09/18/2019,Cycle Inspection / Initial Inspection,40.64385,-74.011603,307.0,38.0,10000.0,3013939.0,3008080000.0,BK34
4,50058248,NACHO MACHO TACO,Brooklyn,82,5TH AVE,11217.0,7186228282,Mexican,05/24/2017,Violations were cited in the following area(s).,06F,Wiping cloths soiled or not stored in sanitizi...,Y,25.0,,,09/18/2019,Cycle Inspection / Initial Inspection,40.680557,-73.977661,306.0,39.0,12901.0,3018668.0,3009340000.0,BK37


### Change column type

In [17]:
# Make sure the inspection_date column is held as str.
df['inspection_date'] = df['inspection_date'].astype(str)

### Creating New Columns 
__DataFrame.assign__ creates a new column, but it does not modify the original dataframe: it returns a new dataframe in which the new column is attached to the previous dataframe's columns. In the following, we use it to convert the *inspection_date* column from __str__ to a __datetime__ column. 

In [132]:
# Parse the MM/DD/YYYY strings; errors='coerce' is passed so unparseable values
# do not raise.
inspection_dates = ks.to_datetime(df['inspection_date'],
                                  format='%m/%d/%Y',
                                  errors='coerce')

# assign returns a new dataframe with the parsed column attached; df is left as is.
df_new = df.assign(inspection_date_dt=inspection_dates)
df_new.head(3)

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,cuisine_description,inspection_date,action,violation_code,violation_description,critical_flag,score,grade,grade_date,record_date,inspection_type,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,inspection_date_dt
0,41687715,GOLDEN KRUST CARIBBEAN BAKERY & GRILL,Brooklyn,1887,ROCKAWAY PARKWAY,11236.0,7182094242,Caribbean,08/16/2017,Violations were cited in the following area(s).,02H,Food not cooled by an approved method whereby ...,Y,43.0,C,08/16/2017,09/18/2019,Cycle Inspection / Re-inspection,40.636304,-73.892472,318.0,46.0,102200.0,3233728.0,3082810000.0,BK50,2017-08-16
1,50001450,CARIBBEAN JERKEE'S DELIGHT,Brooklyn,932,UTICA AVE,11203.0,5163438092,Caribbean,05/17/2017,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructe...,N,11.0,A,05/17/2017,09/18/2019,Cycle Inspection / Re-inspection,40.650458,-73.930256,317.0,45.0,86000.0,3102839.0,3046970000.0,BK91,2017-05-17
2,50013528,CARVEL ICE CREAM,Brooklyn,7517,3RD AVE,11209.0,7187455200,American,01/16/2019,Violations were cited in the following area(s).,04K,Evidence of rats or live rats present in facil...,Y,10.0,A,01/16/2019,09/18/2019,Cycle Inspection / Initial Inspection,40.631268,-74.027856,310.0,43.0,6600.0,3148708.0,3059400000.0,BK31,2019-01-16


In [19]:
# Display the parsed datetime column on its own.
df_new['inspection_date_dt']

0     2017-08-16
1     2017-05-17
2     2019-01-16
3     2018-12-19
4     2017-05-24
5     2017-07-10
6     2019-08-12
7     2017-04-25
8     2017-10-23
9     2016-09-26
10    2016-04-15
11    2019-01-25
12    2018-01-03
13    2019-02-20
14    2017-10-30
15    2019-03-28
16    2016-11-02
17    2018-12-07
18    2019-07-31
19    2019-08-01
20    2019-04-26
21    2017-12-06
22    2019-08-31
23    2018-10-03
24    2017-01-05
25    2018-11-07
26    2016-08-16
27    2019-01-19
28    2019-04-16
29    2016-11-09
30    2016-10-26
31    2018-06-25
32    2019-06-13
33    2017-05-16
34    2016-04-17
35    2018-05-24
36    2016-08-10
37    2016-08-15
38    2019-01-04
39    2018-10-30
40    2018-10-04
41    2019-02-25
42    2018-11-05
43    2018-12-04
44    2016-09-07
45    2018-12-05
46    2019-08-08
47    2018-06-28
48    2019-04-12
49    2018-05-23
50    2018-12-31
51    2019-08-02
52    2019-09-12
53    2017-11-06
54    2019-02-07
55    2018-07-12
56    2018-04-12
57    2017-08-11
58    2019-05-

### Filter By Datetime

In [20]:
# Keep only inspections whose year is later than 2017, using the .dt accessor.
inspected_after_2017 = df_new['inspection_date_dt'].dt.year > 2017
df_new.loc[inspected_after_2017].head()

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,cuisine_description,inspection_date,action,violation_code,violation_description,critical_flag,score,grade,grade_date,record_date,inspection_type,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,inspection_date_dt
2,50013528,CARVEL ICE CREAM,Brooklyn,7517,3RD AVE,11209.0,7187455200,American,01/16/2019,Violations were cited in the following area(s).,04K,Evidence of rats or live rats present in facil...,Y,10.0,A,01/16/2019,09/18/2019,Cycle Inspection / Initial Inspection,40.631268,-74.027856,310.0,43.0,6600.0,3148708.0,3059400000.0,BK31,2019-01-16
3,40396492,ROYAL KING'S PIZZA,Brooklyn,5211,5 AVENUE,11220.0,7184923846,Pizza,12/19/2018,Violations were cited in the following area(s).,04L,Evidence of mice or live mice present in facil...,Y,12.0,A,12/19/2018,09/18/2019,Cycle Inspection / Initial Inspection,40.64385,-74.011603,307.0,38.0,10000.0,3013939.0,3008080000.0,BK34,2018-12-19
6,50092588,BAGEL CLUB,Queens,20521,35TH AVE,11361.0,7184236106,Bagels/Pretzels,08/12/2019,Violations were cited in the following area(s).,06F,Wiping cloths soiled or not stored in sanitizi...,Y,30.0,Z,08/12/2019,09/18/2019,Pre-permit (Operational) / Re-inspection,40.766748,-73.782586,411.0,19.0,109900.0,4135513.0,4060900000.0,QN46,2019-08-12
11,41586413,CORNERSTONE CAFE,Manhattan,17,AVENUE B,10009.0,2122281260,American,01/25/2019,Violations were cited in the following area(s).,08A,Facility not vermin proof. Harborage or condit...,N,10.0,A,01/25/2019,09/18/2019,Cycle Inspection / Re-inspection,40.722106,-73.983361,103.0,2.0,2202.0,1004587.0,1003850000.0,MN28,2019-01-25
12,50018980,MOKJA,Queens,3519,BROADWAY,11106.0,7187210654,Korean,01/03/2018,Violations were cited in the following area(s).,04N,Filth flies or food/refuse/sewage-associated (...,Y,10.0,,,09/18/2019,Cycle Inspection / Initial Inspection,40.760372,-73.921968,401.0,22.0,6100.0,4009761.0,4006480000.0,QN70,2018-01-03


In [21]:
# Number of rows per borough.
df_new['boro'].value_counts()

Manhattan        396
Brooklyn         272
Queens           213
Bronx             88
Staten Island     31
Name: boro, dtype: int64

### Converting an index to a list is currently not available in koalas. The workaround is to convert the index to a column and then convert that column to a list. 
We have 59 different descriptions for `cuisine_description` (`['American', 'Chinese', 'Pizza', 'Italian',...` ) and we are going to keep the top five descriptions and replace the rest with 'other'. 

In [63]:
# value_counts() has one entry per distinct value, so its shape gives the number
# of unique cuisine descriptions (59).
# ks.unique is not working right now, hence this workaround.
df_new['cuisine_description'].value_counts().shape

(59,)

In [125]:
# Count the rows per cuisine description and keep the first five entries.
cuisine_counts = df_new['cuisine_description'].value_counts()
top5_counts = cuisine_counts.head(5)

# Workaround for the missing index-to-list conversion: move the index into a
# column, then turn that first column into a plain Python list.
top5_as_frame = top5_counts.reset_index()
top5_cuisines = top5_as_frame.iloc[:, 0].tolist()
print(top5_cuisines)

['American', 'Chinese', 'Pizza', 'Italian', 'Latin (Cuban, Dominican, Puerto Rican, South & Central American)']


In [133]:
# Function has to have return type hint. This is different from pandas.
def replace_cuisines(x,list2exclude) -> str:
    """Return ``x`` if it is in ``list2exclude``, otherwise the string 'other'.

    Despite its name, ``list2exclude`` holds the values that are kept
    unchanged; every value not found in it is replaced.

    The return type hint is required when the function is applied to a
    koalas Series (unlike pandas).
    """
    return x if x in list2exclude else 'other'

In [175]:
# Replace every cuisine outside the top five with 'other' ...
cuisine_top5_or_other = df_new['cuisine_description'].apply(
    replace_cuisines, args=(top5_cuisines,))

# ... then add the result as a new column and reassign to the previous dataframe df_new.
df_new = df_new.assign(cuisine_mod=cuisine_top5_or_other)

## Save File

In [179]:
# Write the modified dataframe as CSV under the folder named 'data'.
# NOTE(review): koalas may create a directory of part files at this path rather
# than a single CSV file - confirm against the installed version.
output_path = os.path.join('data', 'sample_mod.csv')
df_new.to_csv(output_path)

## Merge, Join & Concatenate

In [214]:
# First sample extract.
file1 = os.path.join('data','nyc_restaurant_inspection_results_sample1.csv')
df1 = ks.read_csv(file1)

# Second sample extract.
file2 = os.path.join('data','nyc_restaurant_inspection_results_sample2.csv')
df2 = ks.read_csv(file2)

# Report the (rows, columns) shape of each frame.
print('df1 dimension = {}'.format(df1.shape))
print('df2 dimension = {}'.format(df2.shape))

df1 dimension = (600, 26)
df2 dimension = (500, 26)


In [215]:
# Stack df2 underneath df1; ignore_index=True renumbers the rows of the result.
# ks.concat replaces DataFrame.append, which is deprecated (and removed in
# pandas 2.0); the resulting row-wise concatenation is the same.
join_df = ks.concat([df1, df2], ignore_index=True)

In [216]:
# Confirm the combined shape: the rows of df1 plus the rows of df2, same columns.
print(join_df.shape)

(1100, 26)


In [219]:
# Left side: first 100 rows and first three columns of df1.
left_part = df1.iloc[:100, :3]
# Right side: first 100 rows of df2, columns at positions 2 and 3.
right_part = df2.iloc[:100, 2:4]

# Join on the index; column names present on both sides get '_right' appended
# on the right-hand copy.
left_part.join(right_part, rsuffix='_right')

Unnamed: 0,CAMIS,DBA,BORO,BORO_right,BUILDING
0,41687715,GOLDEN KRUST CARIBBEAN BAKERY & GRILL,Brooklyn,Manhattan,34
1,50001450,CARIBBEAN JERKEE'S DELIGHT,Brooklyn,Queens,4021
2,50013528,CARVEL ICE CREAM,Brooklyn,Staten Island,27
3,40396492,ROYAL KING'S PIZZA,Brooklyn,Manhattan,68
4,50058248,NACHO MACHO TACO,Brooklyn,Queens,9507
5,40871752,JIAN ON CHINESE RESTAURANT,Queens,Manhattan,151
6,50092588,BAGEL CLUB,Queens,Brooklyn,1902
7,50060077,LUNA PIZZERIA,Brooklyn,Brooklyn,1789
8,50050890,BOHAUS COFFEE AND FLOWERS,Brooklyn,Manhattan,2286
9,40373888,IL MULINO,Manhattan,Brooklyn,1020
