# **Web scraping para extraer los datos comprimidos en los archivos Parquet de la página de la TLC de NYC**
# ☝

In [None]:
# Connect Google Drive to this Colab environment by mounting it at
# /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Upload a local file into the Colab session through the browser widget.
# The result of files.upload() is kept in `uploaded`
# (presumably a mapping of file name -> content; confirm against the Colab docs).
from google.colab import files

uploaded = files.upload()

Saving fhv_tripdata_2018-08.parquet to fhv_tripdata_2018-08.parquet


In [None]:
import pandas as pd

#### Filtrar los registros del archivo Parquet cuya fecha está fuera de rango por un error de registro o digitación (por ejemplo, año 3019 en vez de 2019). Lo más sencillo es ignorar esos registros, porque corregir un archivo Parquet puede afectar su estructura. Esto ocurre en los archivos de FHV de febrero, marzo, abril y julio de 2019, y también en los de junio y agosto de 2018.

#### pyarrow es una biblioteca de Python que se utiliza principalmente para el procesamiento eficiente de datos y la interoperabilidad de datos entre diferentes sistemas

#### Aquí vamos a utilizarla para ignorar las filas que están fuera de rango por fecha

In [None]:
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc

# Read the month's FHV trips as an Arrow table first; the out-of-range rows are
# removed at the Arrow level, before converting to pandas.
table = pq.read_table("fhv_tripdata_2018-08.parquet")

# Upper bound used for the comparison: the largest value pd.Timestamp can hold.
max_pandas_timestamp = pa.scalar(pd.Timestamp.max)

# Keep only rows whose drop-off time is <= that bound (a mistyped year such as
# 3019 is discarded), then materialize the result as a DataFrame.
dropoff_in_range = pc.less_equal(table["dropOff_datetime"], max_pandas_timestamp)
df = table.filter(dropoff_in_range).to_pandas()

### Código para averiguar la fecha en formato yyyy-mm-dd a partir de un timestamp en microsegundos

In [None]:
import datetime

In [None]:
# Raw value taken from the parquet column: microseconds since the Unix epoch.
timestamp_in_microseconds = 33106123800000000
timestamp_in_seconds = timestamp_in_microseconds / 1_000_000  # Same instant in seconds (kept for reference)

# Decode by adding the offset to the epoch instead of calling
# datetime.fromtimestamp(): fromtimestamp() converts into the machine's local
# timezone (so the result changes with the host's TZ) and relies on the platform's
# C time functions, which may reject far-future years. Integer timedelta
# arithmetic is exact, timezone-independent and works for any representable date.
unix_epoch = datetime.datetime(1970, 1, 1)
datetime_obj = unix_epoch + datetime.timedelta(microseconds=timestamp_in_microseconds)
print(datetime_obj)

3019-02-03 17:30:00


In [None]:
# Summarize the loaded frame: number of rows, column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22120591 entries, 0 to 22120590
Data columns (total 7 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   dispatching_base_num    object        
 1   pickup_datetime         datetime64[ns]
 2   dropOff_datetime        datetime64[ns]
 3   PUlocationID            float64       
 4   DOlocationID            float64       
 5   SR_Flag                 float64       
 6   Affiliated_base_number  object        
dtypes: datetime64[ns](2), float64(3), object(2)
memory usage: 1.2+ GB


In [None]:
# Preview the frame obtained from the parquet file.
df

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00013,2018-08-01 00:24:39,2018-08-01 00:27:22,,,,B00013
1,B00013,2018-08-01 00:53:02,2018-08-01 01:24:19,,,,B00013
2,B00013,2018-08-01 00:51:52,2018-08-01 01:18:37,,,,B00013
3,B00013,2018-08-01 00:40:33,2018-08-01 01:30:15,,,,B00013
4,B00013,2018-08-01 00:24:38,2018-08-01 00:26:13,,,,B00013
...,...,...,...,...,...,...,...
22120586,B03065,2018-08-31 23:19:19,2018-08-31 23:33:44,,265.0,,B02058
22120587,B03069,2018-08-31 23:05:00,2018-08-31 23:51:00,,,,B03069
22120588,B03069,2018-08-31 23:50:00,2018-09-01 01:01:00,,,,B03069
22120589,B03106,2018-08-31 23:07:32,2018-08-31 23:39:39,,265.0,,B03106


In [None]:
# Location IDs used below to keep only trips that start and end in Manhattan.
manhattan_zones = [
    4, 12, 13, 24, 41, 42, 43, 45, 48, 50,
    68, 74, 75, 79, 87, 88, 90, 100, 103, 104,
    105, 107, 113, 114, 116, 120, 125, 127, 128, 137,
    140, 141, 142, 143, 144, 148, 151, 152, 153, 158,
    161, 162, 163, 164, 166, 170, 186, 194, 202, 209,
    211, 224, 229, 230, 231, 232, 233, 234, 236, 237,
    238, 239, 243, 244, 246, 249, 261, 262, 263,
]

In [None]:
# Keep only the trips whose pickup AND drop-off location IDs are both listed in
# manhattan_zones; rows with a missing ID in either column do not match.
df_filtrado = df.loc[
    df[['PUlocationID', 'DOlocationID']].isin(manhattan_zones).all(axis=1)
]


In [None]:
# Preview the rows kept by the Manhattan pickup/drop-off filter.
df_filtrado

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
60,B00236,2018-08-01 00:40:18,2018-08-01 01:13:27,237.0,238.0,,B00236
83,B00254,2018-08-01 00:41:51,2018-08-01 01:09:52,161.0,24.0,,B00254
85,B00254,2018-08-01 00:10:22,2018-08-01 00:31:11,162.0,87.0,,B00254
86,B00254,2018-08-01 00:32:29,2018-08-01 00:59:33,87.0,262.0,,B00254
87,B00254,2018-08-01 00:45:20,2018-08-01 01:01:37,162.0,43.0,,B00254
...,...,...,...,...,...,...,...
22118709,B02889,2018-08-31 23:11:09,2018-08-31 23:32:23,144.0,48.0,,B02889
22118710,B02889,2018-08-31 23:36:12,2018-08-31 23:48:17,48.0,90.0,,B02889
22118719,B02889,2018-08-31 23:03:05,2018-08-31 23:17:45,233.0,142.0,,B02889
22118720,B02889,2018-08-31 23:20:28,2018-08-31 23:35:52,142.0,229.0,,B02889


In [None]:
# Summarize the filtered frame: remaining row count, dtypes and memory usage.
df_filtrado.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6095325 entries, 60 to 22118722
Data columns (total 7 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   dispatching_base_num    object        
 1   pickup_datetime         datetime64[ns]
 2   dropOff_datetime        datetime64[ns]
 3   PUlocationID            float64       
 4   DOlocationID            float64       
 5   SR_Flag                 float64       
 6   Affiliated_base_number  object        
dtypes: datetime64[ns](2), float64(3), object(2)
memory usage: 372.0+ MB


In [None]:
# Display df_filtrado again. NOTE(review): this repeats the earlier preview cell
# and nothing modifies the frame in between, so it could be dropped.
df_filtrado

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
60,B00236,2018-08-01 00:40:18,2018-08-01 01:13:27,237.0,238.0,,B00236
83,B00254,2018-08-01 00:41:51,2018-08-01 01:09:52,161.0,24.0,,B00254
85,B00254,2018-08-01 00:10:22,2018-08-01 00:31:11,162.0,87.0,,B00254
86,B00254,2018-08-01 00:32:29,2018-08-01 00:59:33,87.0,262.0,,B00254
87,B00254,2018-08-01 00:45:20,2018-08-01 01:01:37,162.0,43.0,,B00254
...,...,...,...,...,...,...,...
22118709,B02889,2018-08-31 23:11:09,2018-08-31 23:32:23,144.0,48.0,,B02889
22118710,B02889,2018-08-31 23:36:12,2018-08-31 23:48:17,48.0,90.0,,B02889
22118719,B02889,2018-08-31 23:03:05,2018-08-31 23:17:45,233.0,142.0,,B02889
22118720,B02889,2018-08-31 23:20:28,2018-08-31 23:35:52,142.0,229.0,,B02889
