In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
import seaborn as sns
import json

In [80]:
plt.style.use('seaborn-v0_8-pastel')
pd.set_option('display.max_columns', None)

In [81]:
with open('../config/lyft_conection.json', 'r') as file:
    credenciales_lyft = json.load(file)

In [82]:

engine = create_engine(f"mysql+pymysql://{credenciales_lyft['USERNAME']}:{credenciales_lyft['PASSWORD']}@{credenciales_lyft['SERVER']}/{credenciales_lyft['DATABASE']}")

In [83]:
sql_travels = f'''

SELECT
    member_accountNumber,
    startStation_id,
    endStation_id,
    duration,
    bike_id,
    subscriptionId,
    totalDurationMs,
    startTimeMs,
    endTimeMs
FROM BikeRentalFact
WHERE 
    (startTimeMs BETWEEN UNIX_TIMESTAMP(CONVERT_TZ("2025-01-01 00:00:00", "America/Mexico_City", 'UTC'))*1000 
    AND UNIX_TIMESTAMP(CONVERT_TZ("2025-12-31 23:59:59", "America/Mexico_City", 'UTC'))*1000)
    AND NOT ((startStation_id = endStation_id) AND (duration/60 < 2))
LIMIT 10000;

'''

df_travels = pd.read_sql(sql_travels, engine)
df_travels

Unnamed: 0,member_accountNumber,startStation_id,endStation_id,duration,bike_id,subscriptionId,totalDurationMs,startTimeMs,endTimeMs
0,S33JUHC9,356,530,1351,6431,452082,1351076,1735711214114,1735712565190
1,DD2W7K22,465,292,258,1749,683345,258537,1735711269604,1735711528141
2,AZ3M3VMZ,540,456,1157,6212,419168,1157726,1735711249565,1735712407291
3,B2YB53P3,349,349,925,9274,670236,925588,1735711250636,1735712176224
4,A4QAJUTK,265,418,1548,3370,353671,1548125,1735711253272,1735712801397
...,...,...,...,...,...,...,...,...,...
9995,W6NDUNKU,152,296,2530,1553,436364,2530488,1735773111740,1735775642228
9996,C37C8DPV,269,101,1729,1852,690643,1729407,1735773126406,1735774855813
9997,AE3MYG3R,339,363,648,9290,690590,648501,1735773147347,1735773795848
9998,3ERY9VN7,269,101,1705,3326,690006,1705507,1735773117006,1735774822513


In [84]:
sql_stations = f'''

SELECT 
    id,
    name,
    longitude,
    latitude
FROM BikeStationDim;

'''

df_stations_sql = pd.read_sql(sql_stations, engine)
df_stations_sql.head()

Unnamed: 0,id,name,longitude,latitude
0,1,CE-710 Molino del Rey - Glorieta de la Lealtad,-99.192508,19.416795
1,2,DELETED,-73.5690450668335,45.5059375789777
2,3,MEX Warehouse Test Station,-99.18185421964152,19.443094933698728
3,4,XDEV-DELETE-2,0.0,0.0
4,5,CE-407 Prolongación Xochicalco-General Emilia...,-99.15865559132271,19.36726591257277


In [85]:
# Merge de dataframes para obtener la información de las estaciones de inicio y fin
df_stations_start = df_stations_sql.add_prefix('start_')
df_stations_end = df_stations_sql.add_prefix('end_')
df_travels = df_travels.merge(df_stations_start, left_on='startStation_id', right_on='start_id', how='left')
df_travels = df_travels.merge(df_stations_end, left_on='endStation_id', right_on='end_id', how='left')
df_travels.head(2)

Unnamed: 0,member_accountNumber,startStation_id,endStation_id,duration,bike_id,subscriptionId,totalDurationMs,startTimeMs,endTimeMs,start_id,start_name,start_longitude,start_latitude,end_id,end_name,end_longitude,end_latitude
0,S33JUHC9,356,530,1351,6431,452082,1351076,1735711214114,1735712565190,356,CE-052 Hidalgo - Trujano,-99.141931,19.436208,530,CE-620 Fernando Montes de Oca - Priv. Lago,-99.142997,19.381804
1,DD2W7K22,465,292,258,1749,683345,258537,1735711269604,1735711528141,465,CE-004 Río Nilo - Río Panuco,-99.171693,19.428491,292,CE-001 Río Sena-Río Balsas,-99.16778985412292,19.43347819859695


In [86]:
df_travels['start_datetime'] = pd.to_datetime(df_travels['startTimeMs'], unit='ms').dt.tz_localize('UTC').dt.tz_convert('America/Mexico_City')
df_travels['end_datetime'] = pd.to_datetime(df_travels['endTimeMs'], unit='ms').dt.tz_localize('UTC').dt.tz_convert('America/Mexico_City')
df_travels.head(2)

Unnamed: 0,member_accountNumber,startStation_id,endStation_id,duration,bike_id,subscriptionId,totalDurationMs,startTimeMs,endTimeMs,start_id,start_name,start_longitude,start_latitude,end_id,end_name,end_longitude,end_latitude,start_datetime,end_datetime
0,S33JUHC9,356,530,1351,6431,452082,1351076,1735711214114,1735712565190,356,CE-052 Hidalgo - Trujano,-99.141931,19.436208,530,CE-620 Fernando Montes de Oca - Priv. Lago,-99.142997,19.381804,2025-01-01 00:00:14.114000-06:00,2025-01-01 00:22:45.190000-06:00
1,DD2W7K22,465,292,258,1749,683345,258537,1735711269604,1735711528141,465,CE-004 Río Nilo - Río Panuco,-99.171693,19.428491,292,CE-001 Río Sena-Río Balsas,-99.16778985412292,19.43347819859695,2025-01-01 00:01:09.604000-06:00,2025-01-01 00:05:28.141000-06:00


In [87]:
df_travels['id_ruta'] = df_travels['startStation_id'].astype(str) + '_' + df_travels['endStation_id'].astype(str)
df_travels['ruta'] = 'De ' + df_travels['start_name'] + ' a ' + df_travels['end_name']
df_travels['order_start'] = 1
df_travels['order_end'] = 2
df_travels.head(2)

Unnamed: 0,member_accountNumber,startStation_id,endStation_id,duration,bike_id,subscriptionId,totalDurationMs,startTimeMs,endTimeMs,start_id,start_name,start_longitude,start_latitude,end_id,end_name,end_longitude,end_latitude,start_datetime,end_datetime,id_ruta,ruta,order_start,order_end
0,S33JUHC9,356,530,1351,6431,452082,1351076,1735711214114,1735712565190,356,CE-052 Hidalgo - Trujano,-99.141931,19.436208,530,CE-620 Fernando Montes de Oca - Priv. Lago,-99.142997,19.381804,2025-01-01 00:00:14.114000-06:00,2025-01-01 00:22:45.190000-06:00,356_530,De CE-052 Hidalgo - Trujano a CE-620 Fernando ...,1,2
1,DD2W7K22,465,292,258,1749,683345,258537,1735711269604,1735711528141,465,CE-004 Río Nilo - Río Panuco,-99.171693,19.428491,292,CE-001 Río Sena-Río Balsas,-99.16778985412292,19.43347819859695,2025-01-01 00:01:09.604000-06:00,2025-01-01 00:05:28.141000-06:00,465_292,De CE-004 Río Nilo - Río Panuco a CE-001 Río S...,1,2


In [88]:
nodes_start = df_travels[['member_accountNumber','start_id', 'start_name', 'start_longitude', 'start_latitude', 'id_ruta', 'order_start']].copy()
nodes_start.columns = ['member_accountNumber','station_id', 'station_name', 'longitude', 'latitude', 'id_ruta', 'order']
nodes_start['node_type'] = 'start'

# Crear nodos de fin
nodes_end = df_travels[['member_accountNumber','end_id', 'end_name', 'end_longitude', 'end_latitude', 'id_ruta', 'order_end']].copy()
nodes_end.columns = ['member_accountNumber','station_id', 'station_name', 'longitude', 'latitude', 'id_ruta', 'order']
nodes_end['node_type'] = 'end'

# Combinar nodos de inicio y fin
nodes = pd.concat([nodes_start, nodes_end]).reset_index(drop=True)
nodes.head()

Unnamed: 0,member_accountNumber,station_id,station_name,longitude,latitude,id_ruta,order,node_type
0,S33JUHC9,356,CE-052 Hidalgo - Trujano,-99.141931,19.436208,356_530,1,start
1,DD2W7K22,465,CE-004 Río Nilo - Río Panuco,-99.171693,19.428491,465_292,1,start
2,AZ3M3VMZ,540,CE-560 Doctor Liceaga - Eje Central,-99.143408,19.421599,540_456,1,start
3,B2YB53P3,349,CE-549 Sor Juana Ines de la Cruz - Dr. Atl,-99.1571333,19.4450444,349_349,1,start
4,A4QAJUTK,265,CE-023 Reforma - Praga,-99.171099,19.42528,265_418,1,start


In [96]:
nodes.to_csv('nodos_viajes.csv', index=False)