In [1]:
import pandas as pd
import numpy as np
import random

from tqdm.notebook import tqdm
from geopy.distance import geodesic

In [2]:
import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' also suppresses pandas warnings (e.g. about
# chained assignment); consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [30]:
# Load the London postcode table (comma-separated); it supplies the postcodes
# and coordinates used as ride endpoints.
data = pd.read_csv('London postcodes.csv', sep=",")

In [31]:
# Sanity check: (rows, columns) of the loaded postcode table.
data.shape

(314746, 29)

In [32]:
# Preview the first rows of the postcode table.
data.head()

Unnamed: 0,Postcode,In Use?,Latitude,Longitude,Easting,Northing,GridRef,County,District,Ward,...,Population,Households,Built up area,Built up sub-division,Lower layer super output area,Rural/urban,Region,Altitude,London zone,LSOA Code
0,BR1 1AA,Yes,51.401546,0.015415,540291,168873,TQ402688,Greater London,Bromley,Bromley Town,...,,,Greater London,Bromley,Bromley 018B,Urban major conurbation,London,71,5.0,E01000675
1,BR1 1AB,Yes,51.406333,0.015208,540262,169405,TQ402694,Greater London,Bromley,Bromley Town,...,,,Greater London,Bromley,Bromley 008B,Urban major conurbation,London,71,4.0,E01000676
2,BR1 1AD,Yes,51.400057,0.016715,540386,168710,TQ403687,Greater London,Bromley,Bromley Town,...,,,Greater London,Bromley,Bromley 018B,Urban major conurbation,London,53,5.0,E01000675
3,BR1 1AE,Yes,51.404543,0.014195,540197,169204,TQ401692,Greater London,Bromley,Bromley Town,...,34.0,21.0,Greater London,Bromley,Bromley 018C,Urban major conurbation,London,71,4.0,E01000677
4,BR1 1AF,Yes,51.401392,0.014948,540259,168855,TQ402688,Greater London,Bromley,Bromley Town,...,,,Greater London,Bromley,Bromley 018B,Urban major conurbation,London,58,5.0,E01000675


In [33]:
# List the available columns; only Postcode, Latitude and Longitude are used to build rides.
data.columns

Index(['Postcode', 'In Use?', 'Latitude', 'Longitude', 'Easting', 'Northing',
       'GridRef', 'County', 'District', 'Ward', 'DistrictCode', 'WardCode',
       'Country', 'CountyCode', 'Constituency', 'Introduced', 'Terminated',
       'Parish', 'NationalPark', 'Population', 'Households', 'Built up area',
       'Built up sub-division', 'Lower layer super output area', 'Rural/urban',
       'Region', 'Altitude', 'London zone', 'LSOA Code'],
      dtype='object')

In [50]:
# Number of synthetic rides to generate.
RIDES_AMOUNT = 2000000

# Seed both random generators used by this notebook (numpy's global generator
# and the stdlib `random` module) so that a fresh "Restart & Run All"
# reproduces the same dataset.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Empty frame that fixes the column order of the generated dataset; the cells
# below fill it column by column.
rides = pd.DataFrame(columns=[
    'driver_id', 'client_id',
    'start', 'start_latitude', 'start_longtitude',
    'finish', 'finish_latitude', 'finish_longtitude',
    'distance', 'road_time', 'start_time', 'finish_time', 'cost',
    'driver_rate', 'category_driver_feedback', 'text_driver_feedback',
    'client_rate', 'category_client_feedback', 'text_client_feedback',
])

Generating driver and client IDs

In [51]:
# Assign every ride a random driver (ids 0..1999) and a random client
# (ids 0..3999); drivers are drawn first, then clients.
for id_column, population in (('driver_id', 2000), ('client_id', 4000)):
    rides[id_column] = np.random.randint(low=0, high=population, size=RIDES_AMOUNT)

Generating start and finish points

In [52]:
# Draw the start point and then the finish point of each ride independently
# (with replacement) from the postcode table; the sampled frame is re-indexed
# 0..N-1 so that it lines up with the rows of `rides`.
postcode_fields = ['Postcode', 'Latitude', 'Longitude']
for endpoint in ('start', 'finish'):
    sampled_points = data[postcode_fields].sample(n=RIDES_AMOUNT, replace=True).reset_index(drop=True)
    rides[[endpoint, f'{endpoint}_latitude', f'{endpoint}_longtitude']] = sampled_points

Generating start time

In [53]:
def randomDates(start, end, n=10):
    """Draw ``n`` random timestamps with one-second resolution from [start, end).

    Parameters
    ----------
    start, end : pandas.Timestamp
        Lower (inclusive) and upper (exclusive) bounds; their ``.value``
        (nanoseconds since the epoch) is truncated to whole seconds.
    n : int, default 10
        Number of timestamps to generate.

    Returns
    -------
    pandas.DatetimeIndex
        ``n`` timestamps drawn with ``np.random.randint`` over epoch seconds.
    """
    nanos_per_second = 10**9
    low, high = (bound.value // nanos_per_second for bound in (start, end))
    epoch_seconds = np.random.randint(low, high, n)
    return pd.to_datetime(epoch_seconds, unit='s')

# Rides start at a random second between 2012-01-01 (inclusive) and
# 2023-01-01 (exclusive).
start, end = (pd.to_datetime(day) for day in ('2012-01-01', '2023-01-01'))
rides['start_time'] = randomDates(start, end, n=RIDES_AMOUNT)

Start-finish distances

In [56]:
# Geodesic distance in kilometres between the start and finish coordinates
# of every ride, rounded to two decimals.
coordinate_columns = ['start_latitude', 'start_longtitude', 'finish_latitude', 'finish_longtitude']
distances_km = []
for lat_from, lon_from, lat_to, lon_to in tqdm(zip(*(rides[col] for col in coordinate_columns)), total=RIDES_AMOUNT):
    distances_km.append(geodesic((lat_from, lon_from), (lat_to, lon_to)).km)
rides['distance'] = pd.Series(distances_km, index=rides.index).round(2)

  0%|          | 0/2000000 [00:00<?, ?it/s]

Calculate road time

In [57]:
# Travel time in minutes = a random fixed part |N(0, 10)| plus the distance
# multiplied by a random per-kilometre factor |N(1, 0.25)|. The fixed part is
# drawn first, then the factor.
fixed_minutes = np.abs(np.random.normal(scale=10, size=RIDES_AMOUNT))
minutes_per_km = np.abs(np.random.normal(loc=1, scale=0.25, size=RIDES_AMOUNT))

# Truncate to whole minutes before converting to a timedelta.
whole_minutes = (fixed_minutes + rides['distance'] * minutes_per_km).astype('int')
rides['road_time'] = pd.to_timedelta(whole_minutes, unit='m')

Calculate finish time

In [58]:
# A ride ends when its start time plus its travel time has elapsed.
rides['finish_time'] = rides['start_time'].add(rides['road_time'])

Calculate cost of the ride

In [59]:
def costCalculating(start_time, distance):
    """Return the price of a ride (not rounded).

    The base fare is ``2 + 0.5 * distance``. It is multiplied by 1.5 when the
    ride starts during hours 8-9 or 18-19, and by 1.3 when it starts at hour
    22 or later, or at hour 6 or earlier.

    Parameters
    ----------
    start_time : object with an ``hour`` attribute (e.g. a timestamp)
        Moment the ride starts.
    distance : float
        Ride distance, in the same unit as the ``distance`` column.
    """
    hour = start_time.hour
    cost = 2 + 0.5 * distance

    in_rush_hours = 8 <= hour <= 9 or 18 <= hour <= 19
    if in_rush_hours:
        cost *= 1.5

    in_night_hours = hour >= 22 or hour <= 6
    if in_night_hours:
        cost *= 1.3

    return cost
    
# Price every ride from its start time and distance, then round to two decimals.
ride_costs = []
for ride_start, ride_distance in tqdm(zip(rides.start_time, rides.distance), total=RIDES_AMOUNT):
    ride_costs.append(costCalculating(ride_start, ride_distance))
rides['cost'] = pd.Series(ride_costs, index=rides.index).round(2)

  0%|          | 0/2000000 [00:00<?, ?it/s]

Generating driver ratings

In [60]:
# Pick the rides that receive a driver rating.
# NOTE(review): indices are drawn with replacement, so duplicates make the
# share of rated rides smaller than the nominal 30% - confirm this is intended.
driver_rate_idx = np.random.randint(low=0, high=RIDES_AMOUNT, size=int(RIDES_AMOUNT*0.3))

# One multinomial trial per rating gives a one-hot row over the ratings 1..5
# with the listed probabilities.
driver_rate_distribution_arr = np.random.multinomial(1, [0.2, 0.05, 0.1, 0.25, 0.4], size=int(RIDES_AMOUNT*0.3))

# Column position of the 1 in each row (0..4), shifted to the rating 1..5.
# Written with .loc: the chained form rides['driver_rate'][idx] = ... assigns
# through an intermediate Series and is not guaranteed to update `rides`.
rides.loc[driver_rate_idx, 'driver_rate'] = np.where(driver_rate_distribution_arr == 1)[1] + 1

In [61]:
# Tags that can be attached to a driver rating: one list for positive
# feedback, one for negative feedback.
driver_feedback_categories_good = [
    'great service',
    'nice car',
    'wonderful companion',
    'neat and tidy',
    'expert navigation',
    'recommend',
]
driver_feedback_categories_bad = [
    'awful service',
    'bad car',
    'unpleasant companion',
    'dirty',
    'non-expert navigation',
    'not recommend',
]

In [62]:
# Attach a single category tag to a random selection of rated rides.
# Rides rated above 3 get a positive tag, rides rated below 4 a negative one.
# np.random.choice draws with replacement, so an index may be picked more than
# once (the last write wins).
# Assignments use .loc: the chained form rides[col][idx] = ... writes through
# an intermediate Series and is not guaranteed to update `rides`.
category_driver_good_feedback_idx = np.random.choice(rides[rides.driver_rate > 3].index, size=int(RIDES_AMOUNT*0.3*0.2))
rides.loc[category_driver_good_feedback_idx, "category_driver_feedback"] = np.random.choice(driver_feedback_categories_good, size=int(RIDES_AMOUNT*0.3*0.2))

category_driver_bad_feedback_idx = np.random.choice(rides[rides.driver_rate < 4].index, size=int(RIDES_AMOUNT*0.3*0.2))
rides.loc[category_driver_bad_feedback_idx, "category_driver_feedback"] = np.random.choice(driver_feedback_categories_bad, size=int(RIDES_AMOUNT*0.3*0.2))

In [63]:
# Free-text feedback is modelled as a list of 0..6 distinct category tags,
# written to the same rides that received a category tag.
# The per-ride lists are packed into a 1-D object array (via pd.Series) so
# that .loc stores each list as one cell instead of interpreting the ragged
# list-of-lists as a 2-D value. .loc replaces the chained assignment
# rides[col][idx] = ..., which is not guaranteed to update `rides`.
text_good_feedback_driver_length = np.random.randint(low=0, high=7, size=len(category_driver_good_feedback_idx))
text_good_feedback_driver_sample = [random.sample(driver_feedback_categories_good, i) for i in text_good_feedback_driver_length]
rides.loc[category_driver_good_feedback_idx, 'text_driver_feedback'] = pd.Series(text_good_feedback_driver_sample, dtype=object).to_numpy()

text_bad_feedback_driver_length = np.random.randint(low=0, high=7, size=len(category_driver_bad_feedback_idx))
text_bad_feedback_driver_sample = [random.sample(driver_feedback_categories_bad, i) for i in text_bad_feedback_driver_length]
rides.loc[category_driver_bad_feedback_idx, 'text_driver_feedback'] = pd.Series(text_bad_feedback_driver_sample, dtype=object).to_numpy()

Generating client ratings

In [64]:
# Pick the rides that receive a client rating.
# NOTE(review): indices are drawn with replacement, so duplicates make the
# share of rated rides smaller than the nominal 50% - confirm this is intended.
client_rate_idx = np.random.randint(low=0, high=RIDES_AMOUNT, size=int(RIDES_AMOUNT*0.5))

# One multinomial trial per rating gives a one-hot row over the ratings 1..5
# with the listed probabilities.
client_rate_distribution_arr = np.random.multinomial(1, [0.2, 0.05, 0.1, 0.25, 0.4], size=int(RIDES_AMOUNT*0.5))

# Column position of the 1 in each row (0..4), shifted to the rating 1..5.
# Written with .loc: the chained form rides['client_rate'][idx] = ... assigns
# through an intermediate Series and is not guaranteed to update `rides`.
rides.loc[client_rate_idx, 'client_rate'] = np.where(client_rate_distribution_arr == 1)[1] + 1

In [65]:
# Tags that can be attached to a client rating: one list for positive
# feedback, one for negative feedback.
client_feedback_categories_good = [
    'polite',
    'pleasant',
    'quiet',
    'neat and tidy',
    'recommend',
]
client_feedback_categories_bad = [
    'unpolite',
    'unpleasant',
    'loud',
    'dirty',
    'not recommend',
]

In [66]:
# Attach a single category tag to a random selection of rated rides.
# Rides rated above 3 get a positive tag, rides rated below 4 a negative one.
# np.random.choice draws with replacement, so an index may be picked more than
# once (the last write wins).
# Assignments use .loc: the chained form rides[col][idx] = ... writes through
# an intermediate Series and is not guaranteed to update `rides`.
category_client_good_feedback_idx = np.random.choice(rides[rides.client_rate > 3].index, size=int(RIDES_AMOUNT*0.3*0.2))
rides.loc[category_client_good_feedback_idx, "category_client_feedback"] = np.random.choice(client_feedback_categories_good, size=int(RIDES_AMOUNT*0.3*0.2))

category_client_bad_feedback_idx = np.random.choice(rides[rides.client_rate < 4].index, size=int(RIDES_AMOUNT*0.3*0.2))
rides.loc[category_client_bad_feedback_idx, "category_client_feedback"] = np.random.choice(client_feedback_categories_bad, size=int(RIDES_AMOUNT*0.3*0.2))

In [67]:
# Free-text feedback is modelled as a list of 0..5 distinct category tags,
# written to the same rides that received a category tag.
# The per-ride lists are packed into a 1-D object array (via pd.Series) so
# that .loc stores each list as one cell instead of interpreting the ragged
# list-of-lists as a 2-D value. .loc replaces the chained assignment
# rides[col][idx] = ..., which is not guaranteed to update `rides`.
text_good_feedback_client_length = np.random.randint(low=0, high=6, size=len(category_client_good_feedback_idx))
text_good_feedback_client_sample = [random.sample(client_feedback_categories_good, i) for i in text_good_feedback_client_length]
rides.loc[category_client_good_feedback_idx, 'text_client_feedback'] = pd.Series(text_good_feedback_client_sample, dtype=object).to_numpy()

# Negative text must be sampled from the *bad* tags and written to the *bad*
# rides; sampling from the good tags and writing to the good indices would
# overwrite the positive text and leave low-rated rides without any text.
text_bad_feedback_client_length = np.random.randint(low=0, high=6, size=len(category_client_bad_feedback_idx))
text_bad_feedback_client_sample = [random.sample(client_feedback_categories_bad, i) for i in text_bad_feedback_client_length]
rides.loc[category_client_bad_feedback_idx, 'text_client_feedback'] = pd.Series(text_bad_feedback_client_sample, dtype=object).to_numpy()

In [68]:
# Preview the first generated rides.
rides.head()

Unnamed: 0,driver_id,client_id,start,start_latitude,start_longtitude,finish,finish_latitude,finish_longtitude,distance,road_time,start_time,finish_time,cost,driver_rate,category_driver_feedback,text_driver_feedback,client_rate,category_client_feedback,text_client_feedback
0,1357,52,SW18 2FN,51.448231,-0.191526,SM4 6LP,51.386111,-0.198575,6.93,0 days 00:27:00,2021-06-24 23:20:05,2021-06-24 23:47:05,7.1,,,,,,
1,1453,3104,W1W 5PP,51.523789,-0.143961,W1B 4JZ,51.512585,-0.140598,1.27,0 days 00:24:00,2014-04-05 11:23:31,2014-04-05 11:47:31,2.64,,,,1.0,,
2,1506,1865,N2 0JZ,51.589306,-0.181742,SW19 6LL,51.445,-0.207209,16.15,0 days 00:23:00,2020-11-23 17:33:11,2020-11-23 17:56:11,10.07,2.0,bad car,"[not recommend, dirty, non-expert navigation]",3.0,,
3,508,33,N3 3RN,51.597496,-0.199102,NW10 6ZJ,51.528389,-0.246594,8.36,0 days 00:13:00,2012-07-10 08:11:54,2012-07-10 08:24:54,9.27,,,,,,
4,1048,543,SW16 9EY,51.430919,-0.130694,DA1 4FF,51.459906,0.179389,21.8,0 days 00:31:00,2017-07-23 08:02:25,2017-07-23 08:33:25,19.35,4.0,,,5.0,,


In [69]:
# Persist the generated rides; the row index is written as the first
# (unnamed) CSV column.
rides.to_csv(path_or_buf="rides.csv")