In [6]:
import pandas as pd
import numpy as np
import random
from faker import Faker

# Fake-data generator configuration.
# Seed every random source this cell relies on (Faker, the stdlib `random`
# module and NumPy's legacy global RNG) so that a Restart & Run All produces
# the same synthetic dataset each time.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)
np.random.seed(SEED)
fake = Faker()

# Create users: one (user_id, name, rut) tuple per user. `fake.unique` keeps the
# fixed-length 8-digit numbers distinct across the 1000 users.
users = [(i, fake.name(), fake.unique.random_number(digits=8, fix_len=True)) for i in range(1000)]
users_df = pd.DataFrame(users, columns=["user_id", "name_user", "rut_user"])

# Create vehicles: one (vehicle_id, body type, year) tuple per vehicle.
# NOTE(review): fake.year() appears to return the year as text — confirm whether
# vehicle_year should be cast to int before it is used numerically.
vehicles = [(i, fake.random_element(elements=('Sedan', 'SUV', 'Hatchback', 'Pickup')), fake.year()) for i in range(100)]
vehicles_df = pd.DataFrame(vehicles, columns=["vehicle_id", "vehicle_type", "vehicle_year"])

# Create trips
N_TRIPS = 10000

# Bounding box for trip coordinates, written as (low, high). The bounds were
# previously passed to np.random.uniform in reversed order (low > high), which
# NumPy documents as undefined behaviour.
LAT_RANGE = (-34.5, -33.5)
LON_RANGE = (-71.5, -70.5)

# Loop-invariant window in which trip start times are drawn.
window_start = pd.to_datetime('2022-01-01')
window_end = pd.to_datetime('2023-01-01')

# Draw every trip's user and vehicle once (with replacement) instead of calling
# DataFrame.sample() twice per iteration.
trip_users = users_df.sample(n=N_TRIPS, replace=True)
trip_vehicles = vehicles_df.sample(n=N_TRIPS, replace=True)

trips = []
for i, (user, vehicle) in enumerate(
    zip(trip_users.itertuples(index=False), trip_vehicles.itertuples(index=False))
):
    # Net price, 19% tax on top, and the total truncated to an integer.
    price_amount = int(np.random.uniform(1000, 30000))
    price_tax = price_amount * 0.19
    price_total = int(price_amount + price_tax)

    # The trip starts inside the window, ends 10 minutes to 24 hours later,
    # and is booked 1 to 15 minutes before it starts.
    start_time = fake.date_time_between_dates(datetime_start=window_start, datetime_end=window_end)
    end_time = start_time + pd.Timedelta(minutes=random.randint(10, 24*60))
    booking_time = start_time - pd.Timedelta(minutes=random.randint(1, 15))

    # Independent start and end points inside the bounding box.
    start_lat = np.random.uniform(*LAT_RANGE)
    start_lon = np.random.uniform(*LON_RANGE)
    end_lat = np.random.uniform(*LAT_RANGE)
    end_lon = np.random.uniform(*LON_RANGE)

    trips.append([
        i, user.user_id, user.name_user, user.rut_user, vehicle.vehicle_id, booking_time,
        start_time, end_time, fake.random_element(elements=(1,2,3,4,5,6)), int(np.random.uniform(100, 100000)),
        fake.random_element(elements=(1,2,3)), price_amount, price_tax, price_total, start_lat, start_lon,
        end_lat, end_lon
    ])

# One row per trip; the column order matches the order of the values appended above.
trips_df = pd.DataFrame(trips, columns=[
    "trip_id", "user_id", "name_user", "rut_user", "vehicle_id", "booking_time",
    "start_time", "end_time", "status_id", "travel_dist",
    "membership_id", "price_amount", "price_tax", "price_total", "start_lat",
    "start_lon", "end_lat", "end_lon"
])


In [7]:
# Write the generated trips to 'trips.csv', leaving the row index out of the file.
trips_df.to_csv(path_or_buf='trips.csv', index=False)

In [8]:
# Display the generated trips frame; the recorded output is a truncated preview
# (first and last rows separated by an ellipsis row).
trips_df

Unnamed: 0,trip_id,user_id,name_user,rut_user,vehicle_id,booking_time,start_time,end_time,status_id,travel_dist,membership_id,price_amount,price_tax,price_total,start_lat,start_lon,end_lat,end_lon
0,0,796,Tanya Barron,12507013,37,2022-08-04 19:02:55,2022-08-04 19:16:55,2022-08-05 17:43:55,6,29710,1,24117,4582.23,28699,-33.707709,-70.748669,-34.223779,-70.838174
1,1,963,Valerie Serrano,66945220,4,2022-10-11 21:49:09,2022-10-11 22:00:09,2022-10-12 13:43:09,4,65402,1,23489,4462.91,27951,-34.313279,-70.853196,-34.444853,-71.127732
2,2,261,Kara Reyes,66327332,81,2022-11-19 00:18:15,2022-11-19 00:21:15,2022-11-19 23:46:15,4,65681,2,2781,528.39,3309,-33.627854,-70.612821,-33.914429,-70.650521
3,3,268,Megan Hughes,83985539,63,2022-07-20 17:02:58,2022-07-20 17:06:58,2022-07-20 19:55:58,4,14031,2,9037,1717.03,10754,-34.475023,-71.305649,-34.369743,-70.618111
4,4,712,Shane Cain,16916542,16,2022-07-16 09:01:06,2022-07-16 09:13:06,2022-07-16 19:31:06,6,30351,3,26469,5029.11,31498,-33.993025,-71.449659,-34.099402,-71.138914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,282,David Shepard,62388013,58,2022-07-08 15:12:52,2022-07-08 15:13:52,2022-07-08 18:53:52,1,33165,3,19535,3711.65,23246,-34.367133,-70.700291,-33.772864,-71.309506
9996,9996,894,Jeremy Hooper,86007156,63,2022-11-24 02:51:20,2022-11-24 03:04:20,2022-11-24 18:08:20,6,40188,3,27958,5312.02,33270,-33.505628,-70.773155,-34.271872,-70.536114
9997,9997,407,Sherry Perez,43485470,19,2022-01-04 17:23:36,2022-01-04 17:32:36,2022-01-05 16:28:36,5,11901,2,27318,5190.42,32508,-34.374940,-71.063867,-34.379142,-70.566617
9998,9998,908,Lindsey Gallegos,28276158,96,2022-09-25 13:43:45,2022-09-25 13:49:45,2022-09-26 00:22:45,2,11277,2,27541,5232.79,32773,-34.434681,-71.160620,-33.818820,-70.910235


### Verificar valores nulos


In [9]:
# Count the missing values in each column of the trips frame.
trips_df.isna().sum()

trip_id          0
user_id          0
name_user        0
rut_user         0
vehicle_id       0
booking_time     0
start_time       0
end_time         0
status_id        0
travel_dist      0
membership_id    0
price_amount     0
price_tax        0
price_total      0
start_lat        0
start_lon        0
end_lat          0
end_lon          0
dtype: int64

### Verificar identificadores únicos

In [21]:
# Duplicate check for a single column.
def verifica_repetidos(df, columna):
    """Print whether a column of a DataFrame holds any repeated value.

    Args:
        df: DataFrame that contains the column to inspect.
        columna: Label of the column to check for duplicates.

    Returns:
        None. The verdict is written to standard output only.
    """
    # duplicated() flags every occurrence after the first, so a single True
    # is enough to know the column is not unique.
    duplicados = df[columna].duplicated()
    mensaje = (
        f"La columna '{columna}' contiene valores repetidos."
        if duplicados.any()
        else f"La columna '{columna}' no contiene valores repetidos."
    )
    print(mensaje)

# Identifier columns whose uniqueness is checked below.
list_columnas = [
    'trip_id',
    'user_id',
    'vehicle_id',
    'status_id',
    'membership_id',
]

# Report, one column at a time, whether any value appears more than once.
for columna in list_columnas:
    verifica_repetidos(trips_df, columna)


# Most of these ID columns contain repeated values; the original note flags this as a
# problem on the assumption that they are primary keys.
# NOTE(review): when the trips are generated, only trip_id is assigned one per row;
# user_id, vehicle_id, status_id and membership_id are drawn from much smaller pools,
# so repeats look expected for a trips table — confirm which columns are meant to be keys.


La columna 'trip_id' no contiene valores repetidos.
La columna 'user_id' contiene valores repetidos.
La columna 'vehicle_id' contiene valores repetidos.
La columna 'status_id' contiene valores repetidos.
La columna 'membership_id' contiene valores repetidos.


In [31]:
# List the distinct booking timestamps. In the recorded run this yields 9996
# values for the 10000 generated trips, so a few bookings share a timestamp.
trips_df.loc[:, 'booking_time'].unique()

<DatetimeArray>
['2022-08-04 19:02:55', '2022-10-11 21:49:09', '2022-11-19 00:18:15',
 '2022-07-20 17:02:58', '2022-07-16 09:01:06', '2022-09-14 14:40:54',
 '2022-06-11 03:32:59', '2022-06-24 08:38:16', '2022-05-06 19:58:30',
 '2022-01-31 15:25:22',
 ...
 '2022-03-12 20:11:28', '2022-07-26 23:00:55', '2022-03-14 00:35:46',
 '2022-05-25 09:00:28', '2022-10-02 17:49:36', '2022-07-08 15:12:52',
 '2022-11-24 02:51:20', '2022-01-04 17:23:36', '2022-09-25 13:43:45',
 '2022-02-28 01:28:34']
Length: 9996, dtype: datetime64[ns]