In [12]:
import json
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

In [13]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None
# Use the 'Solarize_Light2' matplotlib style sheet for all figures.
plt.style.use('Solarize_Light2')

In [14]:
# Read the database credentials from a JSON file kept outside the notebook so
# that no secret is hard-coded here. The file must provide the keys USERNAME,
# PASSWORD, SERVER and DATABASE. JSON is read explicitly as UTF-8 instead of
# relying on the platform default encoding.
with open('../../../utilities/configurations/lyft_conection.json', 'r', encoding='utf-8') as file:
    credenciales_lyft = json.load(file)

# Percent-encode the user name and password before embedding them in the URL:
# a raw '@', ':', '/' or '%' inside a credential would otherwise be parsed as
# a URL delimiter and break (or misroute) the connection.
# NOTE(review): this assumes the JSON stores the raw, un-encoded credentials.
#
# `init_command` is passed through to the driver; it raises the session's
# network read/write timeouts and the statement execution limit for long
# extraction queries.
# NOTE(review): per the MySQL docs the net_* timeouts are in seconds and
# max_execution_time is in milliseconds — confirm for the server in use.
engine = create_engine(
    "mysql+pymysql://"
    f"{quote(str(credenciales_lyft['USERNAME']), safe='')}:"
    f"{quote(str(credenciales_lyft['PASSWORD']), safe='')}@"
    f"{credenciales_lyft['SERVER']}/{credenciales_lyft['DATABASE']}",
    connect_args={
        'init_command': "SET SESSION net_read_timeout=600, net_write_timeout=600, max_execution_time=3000000"
    },
)

In [15]:
# Extraction window as local wall-clock strings ('YYYY-MM-DD HH:MM:SS');
# both values are interpolated into the SQL query that loads the data.
start_date, end_date = (
    '2022-11-01 00:00:00',
    '2025-11-30 23:59:59',
)

In [16]:
# Translate the local (America/Mexico_City) window bounds into Unix epoch
# milliseconds, the unit in which the `purchase` column is compared.
#
# The bounds are computed here rather than with
# UNIX_TIMESTAMP(CONVERT_TZ(...)) in SQL because, per the MySQL docs,
# UNIX_TIMESTAMP interprets its argument in the *session* time zone (so the
# SQL form is only right when the session runs in UTC) and CONVERT_TZ returns
# NULL for named zones when the server's time zone tables are not loaded,
# which would silently yield zero rows.
#
# The upper bound is exclusive and placed one second after `end_date`, so
# purchases made during the final second (e.g. 23:59:59.500) are kept; an
# inclusive BETWEEN against 23:59:59.000 would drop them.
purchase_from_ms = pd.Timestamp(start_date, tz='America/Mexico_City').value // 10**6
purchase_until_ms = (
    pd.Timestamp(end_date, tz='America/Mexico_City') + pd.Timedelta(seconds=1)
).value // 10**6

# Both interpolated values are plain Python integers, so the f-string cannot
# inject SQL.
sql_BikeSubscriptionFact = f'''

SELECT
    id,
    purchase,
    member_accountNumber,
    purchasePrice,
    creditedAmount,
    subsidizeAmount,
    totalPaid,
    status_id,
    subscriptionType_id,
    paymentTerms_id,
    purchaseStation_id
FROM BikeSubscriptionFact
WHERE
    purchase >= {purchase_from_ms}
    AND purchase < {purchase_until_ms};

'''

# Load the selected subscription rows into a DataFrame.
df_sql_BikeSubscriptionFact = pd.read_sql(sql_BikeSubscriptionFact, engine)


In [17]:
# Preview the first five rows exactly as they came back from the database.
df_sql_BikeSubscriptionFact.head(n=5)

Unnamed: 0,id,purchase,member_accountNumber,purchasePrice,creditedAmount,subsidizeAmount,totalPaid,status_id,subscriptionType_id,paymentTerms_id,purchaseStation_id
0,78652,1667282400521,221077,449.14,0.0,0.0,521.0,5,4,0,
1,78653,1667282404573,222017,449.14,0.0,0.0,521.0,5,4,0,
2,78654,1667282406299,279374,449.14,0.0,0.0,521.0,5,4,0,
3,78655,1667282412014,411927,449.14,0.0,0.0,521.0,5,4,0,
4,78656,1667282415271,412196,449.14,0.0,0.0,521.0,5,4,0,


- id: Primary key
- purchase: Purchase date and time, stored as a Unix epoch timestamp in milliseconds.
- member_accountNumber: Member’s account number
- purchasePrice: Initial price of the subscription prior to processing installments and applying discount code.
- creditedAmount: Amount of the discount code redeemed for the subscription.
- subsidizeAmount: Subsidized amount for a subscription if it is associated with a program.
- totalPaid: Final price of the subscription after processing installments and applying discount code.
- status_id: Identifier of the subscription status (BikeSubscriptionStatusDim)
- subscriptionType_id: Identifier of the type of subscription (BikeSubscriptionTypeDim)
- paymentTerms_id: Identifier of the payment terms for the subscription. (BikeSubscriptionPaymentTermsDim)
- purchaseStation_id: Identifier of the station at which the purchase is made.

## Limpiar Data

In [18]:
# Inspect the dtype pandas inferred for each column before any cleaning.
df_sql_BikeSubscriptionFact.dtypes

id                        int64
purchase                  int64
member_accountNumber     object
purchasePrice           float64
creditedAmount          float64
subsidizeAmount         float64
totalPaid               float64
status_id                 int64
subscriptionType_id       int64
paymentTerms_id           int64
purchaseStation_id       object
dtype: object

In [19]:
# `purchase` is read as Unix epoch milliseconds. Parse it directly as UTC and
# convert it to America/Mexico_City local time.
# `utc=True` replaces the separate `.dt.tz_localize('UTC')` step: the result
# is the same on the first run, but re-running this cell no longer raises
# TypeError (tz_localize refuses a column that is already tz-aware).
df_sql_BikeSubscriptionFact['purchase'] = (
    pd.to_datetime(df_sql_BikeSubscriptionFact['purchase'], unit='ms', utc=True)
    .dt.tz_convert('America/Mexico_City')
)
df_sql_BikeSubscriptionFact.head()

Unnamed: 0,id,purchase,member_accountNumber,purchasePrice,creditedAmount,subsidizeAmount,totalPaid,status_id,subscriptionType_id,paymentTerms_id,purchaseStation_id
0,78652,2022-11-01 00:00:00.521000-06:00,221077,449.14,0.0,0.0,521.0,5,4,0,
1,78653,2022-11-01 00:00:04.573000-06:00,222017,449.14,0.0,0.0,521.0,5,4,0,
2,78654,2022-11-01 00:00:06.299000-06:00,279374,449.14,0.0,0.0,521.0,5,4,0,
3,78655,2022-11-01 00:00:12.014000-06:00,411927,449.14,0.0,0.0,521.0,5,4,0,
4,78656,2022-11-01 00:00:15.271000-06:00,412196,449.14,0.0,0.0,521.0,5,4,0,


In [20]:
# Derive calendar features from the purchase timestamp: day, month, year,
# hour, time of day, week of year, month of year, week of month and quarter.
# `purchase` must already be a datetime column (the `.dt` accessor is used).
purchase_ts = df_sql_BikeSubscriptionFact['purchase']

df_sql_BikeSubscriptionFact['day'] = purchase_ts.dt.day
df_sql_BikeSubscriptionFact['month'] = purchase_ts.dt.month
df_sql_BikeSubscriptionFact['year'] = purchase_ts.dt.year
df_sql_BikeSubscriptionFact['hour'] = purchase_ts.dt.hour
# Wall-clock time of the purchase as datetime.time objects.
df_sql_BikeSubscriptionFact['hour_minute_second'] = purchase_ts.dt.time
# ISO-8601 week number. It sits next to the *calendar* `year` column, so the
# first/last days of a year can carry a week number that belongs to the
# adjacent ISO year.
df_sql_BikeSubscriptionFact['week_of_year'] = purchase_ts.dt.isocalendar().week
# Same value as `month`; kept as a separate column so the output schema is unchanged.
df_sql_BikeSubscriptionFact['month_of_year'] = purchase_ts.dt.month
# Week of month as 7-day buckets of the day number: days 1-7 -> 1,
# 8-14 -> 2, ..., 29-31 -> 5. Vectorized integer arithmetic replaces the
# per-row Python lambda and yields the same values.
df_sql_BikeSubscriptionFact['week_of_month'] = (purchase_ts.dt.day - 1) // 7 + 1
df_sql_BikeSubscriptionFact['quarter_of_year'] = purchase_ts.dt.quarter
df_sql_BikeSubscriptionFact.head()

Unnamed: 0,id,purchase,member_accountNumber,purchasePrice,creditedAmount,subsidizeAmount,totalPaid,status_id,subscriptionType_id,paymentTerms_id,purchaseStation_id,day,month,year,hour,hour_minute_second,week_of_year,month_of_year,week_of_month,quarter_of_year
0,78652,2022-11-01 00:00:00.521000-06:00,221077,449.14,0.0,0.0,521.0,5,4,0,,1,11,2022,0,00:00:00.521000,44,11,1,4
1,78653,2022-11-01 00:00:04.573000-06:00,222017,449.14,0.0,0.0,521.0,5,4,0,,1,11,2022,0,00:00:04.573000,44,11,1,4
2,78654,2022-11-01 00:00:06.299000-06:00,279374,449.14,0.0,0.0,521.0,5,4,0,,1,11,2022,0,00:00:06.299000,44,11,1,4
3,78655,2022-11-01 00:00:12.014000-06:00,411927,449.14,0.0,0.0,521.0,5,4,0,,1,11,2022,0,00:00:12.014000,44,11,1,4
4,78656,2022-11-01 00:00:15.271000-06:00,412196,449.14,0.0,0.0,521.0,5,4,0,,1,11,2022,0,00:00:15.271000,44,11,1,4


In [21]:
# Rename every column to its Spanish name, modifying the frame in place.
df_sql_BikeSubscriptionFact.rename(
    columns=dict(
        id='id',  # unchanged; listed so the mapping covers every column
        purchase='fecha_compra',
        member_accountNumber='numero_cuenta_miembro',
        purchasePrice='precio_compra',
        creditedAmount='monto_acreditado',
        subsidizeAmount='monto_subsidiado',
        totalPaid='total_pagado',
        status_id='id_estado',
        subscriptionType_id='id_tipo_suscripcion',
        paymentTerms_id='id_terminos_pago',
        purchaseStation_id='id_estacion_compra',
        day='dia',
        month='mes',
        year='año',
        hour='hora',
        hour_minute_second='hora_minuto_segundo',
        week_of_year='semana_año',
        month_of_year='mes_año',
        week_of_month='semana_mes',
        quarter_of_year='trimestre_año',
    ),
    inplace=True,
)
df_sql_BikeSubscriptionFact.head()

Unnamed: 0,id,fecha_compra,numero_cuenta_miembro,precio_compra,monto_acreditado,monto_subsidiado,total_pagado,id_estado,id_tipo_suscripcion,id_terminos_pago,id_estacion_compra,dia,mes,año,hora,hora_minuto_segundo,semana_año,mes_año,semana_mes,trimestre_año
0,78652,2022-11-01 00:00:00.521000-06:00,221077,449.14,0.0,0.0,521.0,5,4,0,,1,11,2022,0,00:00:00.521000,44,11,1,4
1,78653,2022-11-01 00:00:04.573000-06:00,222017,449.14,0.0,0.0,521.0,5,4,0,,1,11,2022,0,00:00:04.573000,44,11,1,4
2,78654,2022-11-01 00:00:06.299000-06:00,279374,449.14,0.0,0.0,521.0,5,4,0,,1,11,2022,0,00:00:06.299000,44,11,1,4
3,78655,2022-11-01 00:00:12.014000-06:00,411927,449.14,0.0,0.0,521.0,5,4,0,,1,11,2022,0,00:00:12.014000,44,11,1,4
4,78656,2022-11-01 00:00:15.271000-06:00,412196,449.14,0.0,0.0,521.0,5,4,0,,1,11,2022,0,00:00:15.271000,44,11,1,4


In [22]:
# Persist the cleaned frame as CSV, leaving out the row index.
df_sql_BikeSubscriptionFact.to_csv(
    path_or_buf='../data/bike_subscription_fact_cleaned.csv',
    index=False,
)