In [1]:
import pandas as pd
from sodapy import Socrata
from configparser import ConfigParser

In [2]:
# Setup configuration
CONFIG_PATH = './config.ini'
config = ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means the config file was not found.
# Fail here with a clear message instead of a later KeyError on 'socrata'.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(f"Could not read configuration file: {CONFIG_PATH}")
app_token = config['socrata']['APP_TOKEN']

# Create client to Socrata
client = Socrata(domain='data.cityofnewyork.us', app_token=app_token, timeout=60)

# NYC 311 Calls (2010-Present)
dataset = 'erm2-nwe9'

In [3]:
def get_query(dataset_identifier: str, query: str, return_df: bool=False) -> pd.DataFrame | list:
    results = client.get(dataset_identifier=dataset_identifier, query=query)
    if return_df:
        return pd.DataFrame.from_records(results)
    else:
        return results

In [10]:
# "Dead Animal" complaints that have a ZIP code, community board and
# coordinates, newest first, capped at 20000 rows.
query = """
    SELECT
        unique_key,
        created_date,
        descriptor,
        incident_zip,
        community_board,
        latitude,
        longitude
    WHERE
        complaint_type = "Dead Animal"
        AND incident_zip IS NOT NULL
        AND community_board IS NOT NULL
        AND latitude IS NOT NULL
        AND longitude IS NOT NULL
    ORDER BY
        created_date DESC,
        unique_key DESC
    LIMIT
        20000
    """

# Fetch the matching records as a pandas DataFrame.
df = get_query(dataset, query, return_df=True)

In [11]:
# Preview the first three rows to sanity-check the query result.
df.head(3)

Unnamed: 0,unique_key,created_date,descriptor,incident_zip,community_board,latitude,longitude
0,56525294,2023-01-15T00:07:57.000,Cat,11223,15 BROOKLYN,40.60753845957142,-73.96466250125461
1,56525565,2023-01-14T20:32:14.000,Squirrel,11417,10 QUEENS,40.67045386926805,-73.84068337208743
2,56530227,2023-01-14T18:22:23.000,Squirrel,10306,02 STATEN ISLAND,40.57037481976175,-74.11126273634564


In [12]:
# Inspect column dtypes and non-null counts of the frame as fetched.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12075 entries, 0 to 12074
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   unique_key       12075 non-null  object
 1   created_date     12075 non-null  object
 2   descriptor       12075 non-null  object
 3   incident_zip     12075 non-null  object
 4   community_board  12075 non-null  object
 5   latitude         12075 non-null  object
 6   longitude        12075 non-null  object
dtypes: object(7)
memory usage: 660.5+ KB


In [13]:
# Parse the creation timestamp strings into datetime values
df['created_date'] = pd.to_datetime(df['created_date'])

# Parse both coordinate columns into numeric values
for coord in ('latitude', 'longitude'):
    df[coord] = pd.to_numeric(df[coord])

In [14]:
# Re-check dtypes to confirm the date and coordinate conversions took effect.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12075 entries, 0 to 12074
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   unique_key       12075 non-null  object        
 1   created_date     12075 non-null  datetime64[ns]
 2   descriptor       12075 non-null  object        
 3   incident_zip     12075 non-null  object        
 4   community_board  12075 non-null  object        
 5   latitude         12075 non-null  float64       
 6   longitude        12075 non-null  float64       
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 660.5+ KB


In [15]:
# Relative path of the parquet file used to hand the data from pandas to Spark.
filename = './test.parquet'

In [17]:
# Write the DataFrame to parquet at `filename` (an existing file is overwritten).
# NOTE(review): created_date is datetime64[ns]; confirm the parquet writer's
# timestamp unit is one the Spark version in use can read.
df.to_parquet(filename)

In [18]:
from pyspark.sql import SparkSession

In [19]:
# Reuse the active Spark session if one exists, otherwise start one with default settings.
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/16 20:44:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [20]:
# Load the parquet file at `filename` as a Spark DataFrame.
new_df = spark.read.parquet(filename)

                                                                                

In [22]:
# Print the schema Spark read from the parquet file, then show the first
# 5 rows with each cell truncated to 10 characters.
new_df.printSchema()
new_df.show(n=5, truncate=10)

root
 |-- unique_key: string (nullable = true)
 |-- created_date: timestamp (nullable = true)
 |-- descriptor: string (nullable = true)
 |-- incident_zip: string (nullable = true)
 |-- community_board: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)

+----------+------------+----------+------------+---------------+----------+----------+
|unique_key|created_date|descriptor|incident_zip|community_board|  latitude| longitude|
+----------+------------+----------+------------+---------------+----------+----------+
|  56525294|  2023-01...|       Cat|       11223|     15 BROO...|40.6075...|-73.964...|
|  56525565|  2023-01...|  Squirrel|       11417|      10 QUEENS|40.6704...|-73.840...|
|  56530227|  2023-01...|  Squirrel|       10306|     02 STAT...|40.5703...|-74.111...|
|  56527404|  2023-01...|       Cat|       11420|      10 QUEENS|40.6790...|-73.809...|
|  56526670|  2023-01...|       Cat|       10310|     01 STAT...|40.6297...