In [1]:
# Editor: Tearsyu
# PDS
# Generate history records
# Estimate number : 123774/356 = 347
import pandas as pd
import datetime
import random
import math

# Constraints: - 3000 clients (how many active clients?) and 173 parkings
#              - Paris receives the largest number of visitors in July, roughly following a normal distribution
#              - On an ordinary day there are fewer trips between 00:00 and 06:00, and more trips from 7 to 8, 12 to 13 and 19 to 20
#              - We need a large amount of highly random data that satisfies the constraints above.


# Sizes of the simulated population: ids are drawn from 1..clients, 1..parkings and 1..vehicles.
clients = 3000
parkings = 173
vehicles = parkings * 11  # presumably 11 vehicles per parking -- TODO confirm
# weathers holds the 2017 daily weather data of Paris; getWeatherSeed() looks rows up by DATE.
weathers = pd.read_csv("export-paris2017.csv", sep=',')
def getOneDayData(day, base_time_day):
    """Generate the simulated trip records of one day.

    Args:
        day: datetime.datetime of the day to simulate, e.g. 2017-09-10. It is
            passed unchanged to getWeatherSeed() and getTravelTime().
        base_time_day: baseline number of trips per day (347 according to the
            vision doc).

    Returns:
        A list with one row per trip:
        [histId, userId, depId, arrId, vehicleId, depDateTime, arrDateTime,
        lateMin, basePrice, supPrice]. histId restarts at 1 on every call.

    Requires at least two parkings, because the arrival parking always
    differs from the departure parking.
    """
    # One sign per day: every hour of the day is pushed in the same direction,
    # so some days are busier than the base and some are quieter.
    seedSign = random.choice(["+", "-"])
    res = []
    histId = 1
    precip_stat, visib_stat = getWeatherSeed(day)
    seedBaseHourTimes = int(base_time_day/24)
    spread = int(math.sqrt(seedBaseHourTimes))
    for currHours in range(0, 24):
        # baseHourTimes is the basic number of trips of this hour:
        #   '+' day: seedBaseHourTimes .. 2 * seedBaseHourTimes + spread
        #   '-' day: spread .. seedBaseHourTimes
        if seedSign == '+':
            baseHourTimes = seedBaseHourTimes + random.randint(0, seedBaseHourTimes + spread)
        else:
            baseHourTimes = seedBaseHourTimes - random.randint(0, seedBaseHourTimes - spread)

        # taux scales the hour: night hours keep 10%-30% of the base, rush
        # hours add 120%-310% on top of it. Per the original author the hourly
        # profile comes from Google Maps metro usage.
        if 0 <= currHours <= 6:
            taux = random.uniform(0.1, 0.3)
            hourTimes = int(baseHourTimes * taux)
        elif currHours in [7, 8, 12, 13, 19, 20]:
            taux = random.uniform(1.2, 3.1)
            hourTimes = baseHourTimes + int(taux * baseHourTimes)
        else:
            hourTimes = baseHourTimes

        for _ in range(hourTimes):
            userId = random.randint(1, clients)
            depId = random.randint(1, parkings)
            # Uniform draw among all *other* parkings. Drawing from
            # 1..parkings-1 and shifting values >= depId up by one covers
            # 1..parkings without depId, so the last parking is reachable too.
            arrId = random.randint(1, parkings - 1)
            if arrId >= depId:
                arrId += 1
            vehicleId = random.randint(1, vehicles)
            depDateTime, arrDateTime = getTravelTime(day, currHours)
            lateMin = getLateMin(depDateTime, arrDateTime, precip_stat, visib_stat)
            travelTime = arrDateTime - depDateTime
            basePrice, supPrice = getPrice(lateMin, travelTime)
            res.append([histId, userId, depId, arrId, vehicleId, depDateTime,
                        arrDateTime, lateMin, basePrice, supPrice])
            histId = histId + 1
    return res
def getTravelTime(day, currHours):
    """Draw a random departure inside the given hour and a random trip duration.

    Args:
        day: datetime.datetime the hour/minute offsets are added to.
        currHours: hour offset of the departure.

    Returns:
        (depDateTime, arrDateTime): the departure falls on a random minute
        0..59 of the hour, the arrival follows 1..60 minutes later.
    """
    minute_offset = random.randint(0, 59)
    trip_minutes = random.randint(1, 60)
    departure = day + datetime.timedelta(hours=currHours, minutes=minute_offset)
    return departure, departure + datetime.timedelta(minutes=trip_minutes)

def getLateMin(depDateTime, arrDateTime, precip_stat, visib_stat):
    """Decide whether a trip is late and, if so, by how many minutes.

    A score is built from a random draw in 1..1000 plus the two weather
    bonuses, plus 80 when the trip departs in July/August and plus 80 when it
    departs during hour 8, 9, 18 or 19. The trip is late only when the score
    exceeds 990, so without any bonus about 1% of trips are late.

    Args:
        depDateTime: departure datetime.
        arrDateTime: arrival datetime.
        precip_stat: precipitation bonus added to the score.
        visib_stat: visibility bonus added to the score.

    Returns:
        0 when the trip is on time, otherwise a random number of late minutes
        between 1 and the trip duration scaled by score/1000.
    """
    score = random.randint(1, 1000) + precip_stat + visib_stat
    score += 80 * (depDateTime.month in [7, 8])
    score += 80 * (depDateTime.hour in [8, 9, 18, 19])
    if score <= 990:
        return 0

    trip = arrDateTime - depDateTime
    upper = int((trip.total_seconds()/60)*score/1000)
    # A bound that truncates to zero is replaced by 2 so the draw below stays valid.
    if upper == 0:
        upper = 2
    return random.randint(1, upper)
def getPrice(lateMin, travelTime):
    """Compute the base price and the late surcharge of a trip.

    Args:
        lateMin: number of late minutes (0 when the trip is on time).
        travelTime: datetime.timedelta of the trip duration.

    Returns:
        (basePrice, supPrice): basePrice is 1.00 per minute minus a random
        discount, rounded to 2 decimals; supPrice is 0 for an on-time trip and
        lateMin/2 otherwise.
    """
    minutes = int(travelTime.total_seconds()/60)
    # Compare with '==': 'is' tests object identity and only works on string
    # literals by accident of interning (SyntaxWarning since Python 3.8).
    # A '+' draw gives the smaller discount (divided by 4 instead of 2).
    divisor = 4 if random.choice(["+", "-"]) == "+" else 2
    basePrice = minutes * 1.00 - random.uniform(0.3, 1.2) * minutes / divisor
    supPrice = 0 if lateMin == 0 else lateMin/2
    return round(basePrice, 2), supPrice

def getWeatherSeed(currDate):
    """Translate the weather of a day into bonuses for the lateness score.

    The row of `weathers` whose DATE equals currDate (formatted YYYY-MM-DD) is
    read; its PRECIP_TOTAL_DAY_MM and VISIBILITY_AVG_KM values are mapped to
    two integer bonuses:

        precipitation (mm):  (0.2, 1.0) -> 50, [1.0, 3.0) -> 100,
                             [3.0, 5.0) -> 220, >= 5.0 -> 350, otherwise 0
        visibility (km):     (6.0, 8.0) -> 50, (5.0, 6.0] -> 100,
                             <= 5.0 -> 170, otherwise 0

    Args:
        currDate: datetime.datetime of the day to look up.

    Returns:
        (precip_stat, visib_stat)

    Raises:
        ValueError: if `weathers` has no row for that date.
    """
    dateKey = datetime.datetime.strftime(currDate, '%Y-%m-%d')
    getLine = weathers.loc[weathers['DATE'] == dateKey]
    if getLine.empty:
        raise ValueError("no weather record for " + dateKey)
    # Take the scalar explicitly: float() on a one-element Series is
    # deprecated in recent pandas versions.
    precipitation = float(getLine['PRECIP_TOTAL_DAY_MM'].iloc[0])
    visibility = float(getLine['VISIBILITY_AVG_KM'].iloc[0])

    precip_stat = 0
    if 0.2 < precipitation < 1.0:
        precip_stat = 50
    elif 1.0 <= precipitation < 3.0:
        precip_stat = 100
    elif 3.0 <= precipitation < 5.0:
        precip_stat = 220
    elif precipitation >= 5.0:
        precip_stat = 350

    visib_stat = 0
    if 6.0 < visibility < 8.0:
        visib_stat = 50
    elif 5.0 < visibility <= 6.0:
        visib_stat = 100
    elif visibility <= 5.0:
        visib_stat = 170
    return precip_stat, visib_stat

    
    
def generateYearHistory(base_time_day, year):
    """Generate the trip history of a whole calendar year as a DataFrame.

    Every day from 1 January to 31 December of `year` is simulated with
    getOneDayData(). July and August days get a raised daily baseline; the
    other days are all raised or all lowered, depending on one random sign
    drawn once per call.

    Args:
        base_time_day: baseline number of trips per day.
        year: calendar year to simulate.

    Returns:
        pandas.DataFrame with columns id, client_id, departure_id, arrival_id,
        vehicle_id, dep_time, arr_time, late_time, base_price, sup_price.
        `id` runs from 1 to the number of rows.
    """
    cols = ["id", "client_id", "departure_id", "arrival_id", "vehicle_id", "dep_time", "arr_time",
            "late_time", "base_price", "sup_price"]

    oneYearHist = []
    currDay = datetime.datetime(year, 1, 1)
    # Cover the complete year (365 or 366 days) rather than a fixed day count.
    daysInYear = (datetime.datetime(year + 1, 1, 1) - currDay).days
    seedSign = random.choice(["+", "-"])

    for _ in range(daysInYear):
        if currDay.month in [7, 8]:
            currBaseTimeDay = base_time_day + int(random.uniform(2, 4) * math.sqrt(base_time_day))
        elif seedSign == "+":  # '==', not 'is': identity on str literals is unreliable
            currBaseTimeDay = base_time_day + int(random.uniform(0.3, 1.5) * math.sqrt(base_time_day))
        else:
            currBaseTimeDay = base_time_day - int(random.uniform(3, 8) * math.sqrt(base_time_day))
        # extend() appends in place instead of re-copying the whole list every day.
        oneYearHist.extend(getOneDayData(currDay, currBaseTimeDay))
        currDay = currDay + datetime.timedelta(days=1)

    # getOneDayData() numbers its rows from 1 for every day; renumber so the
    # id column is unique over the whole year.
    oneYearHist = [[newId, *row[1:]] for newId, row in enumerate(oneYearHist, start=1)]
    return pd.DataFrame(oneYearHist, columns=cols)
    
def testOnedayTimes(times):
    res = []
    for i in range(0, times):
        res.append(getOneDayData(347, 2017))
    print(res)

# Generate the 2017 history with a baseline of 347 trips per day.
yearHistory = generateYearHistory(347, 2017)
#yearHistory.loc[yearHistory['late_time'] != 0]
# Write the generated history to CSV, then show it as the cell output.
yearHistory.to_csv("yearHistoryTemplate.csv", index=False, encoding='utf8')
yearHistory

Unnamed: 0,id,client_id,departure_id,arrival_id,vehicle_id,dep_time,arr_time,late_time,base_price,sup_price
0,1,1730,82,71,1772,2017-01-01 00:15:00,2017-01-01 00:19:00,0,1.84,0.0
1,2,260,26,76,1441,2017-01-01 00:23:00,2017-01-01 00:27:00,0,3.04,0.0
2,3,726,150,91,38,2017-01-01 02:04:00,2017-01-01 02:31:00,0,12.52,0.0
3,4,1913,162,172,1791,2017-01-01 02:04:00,2017-01-01 02:18:00,0,8.36,0.0
4,5,1994,38,41,1874,2017-01-01 03:18:00,2017-01-01 04:17:00,0,49.36,0.0
5,6,791,145,25,307,2017-01-01 04:08:00,2017-01-01 05:07:00,0,42.51,0.0
6,7,1110,57,135,1504,2017-01-01 05:38:00,2017-01-01 06:02:00,0,21.91,0.0
7,8,484,41,7,524,2017-01-01 05:30:00,2017-01-01 06:30:00,0,34.75,0.0
8,9,1737,54,38,1082,2017-01-01 05:26:00,2017-01-01 06:16:00,0,23.65,0.0
9,10,1379,82,129,1067,2017-01-01 06:48:00,2017-01-01 06:57:00,0,7.97,0.0


In [2]:
# Editor: Tearsyu
# PDS
# Generate records of booking
# Constraints: - BIMyCar accepts bookings in 10 days
#              - Is it running in real time? 

import datetime
import pandas as pd
import random
import math

# Upper bounds of the client and parking ids drawn for the generated bookings.
clients = 3000
parkings = 173

def generateOneDayBookings(startTime, currBaseTimeDay):
    """Generate the simulated bookings of one day, from startTime's hour on.

    Args:
        startTime: datetime.datetime; hours before startTime.hour are skipped
            and startTime is passed unchanged to getTravelTime().
        currBaseTimeDay: baseline number of bookings for this day.

    Returns:
        A list with one row per booking:
        [histId, userId, depId, arrId, depDateTime, arrDateTime, basePrice].
        histId restarts at 1 on every call.

    Requires at least two parkings, because the arrival parking always
    differs from the departure parking.
    """
    # One sign per day: every hour of the day is pushed in the same direction,
    # so some days are busier than the base and some are quieter.
    seedSign = random.choice(["+", "-"])
    res = []
    histId = 1
    seedBaseHourTimes = int(currBaseTimeDay/24)
    spread = int(math.sqrt(seedBaseHourTimes))
    for currHours in range(startTime.hour, 24):
        # baseHourTimes is the basic number of bookings of this hour:
        #   '+' day: seedBaseHourTimes .. 2 * seedBaseHourTimes + spread
        #   '-' day: spread .. seedBaseHourTimes
        if seedSign == '+':
            baseHourTimes = seedBaseHourTimes + random.randint(0, seedBaseHourTimes + spread)
        else:
            baseHourTimes = seedBaseHourTimes - random.randint(0, seedBaseHourTimes - spread)

        # taux scales the hour: night hours keep 10%-30% of the base, rush
        # hours add 140%-350% on top of it. Per the original author the hourly
        # profile comes from Google Maps metro usage.
        if 0 <= currHours <= 6:
            taux = random.uniform(0.1, 0.3)
            hourTimes = int(baseHourTimes * taux)
        elif currHours in [7, 8, 12, 13, 19, 20]:
            taux = random.uniform(1.4, 3.5)
            hourTimes = baseHourTimes + int(taux * baseHourTimes)
        else:
            hourTimes = baseHourTimes

        for _ in range(hourTimes):
            userId = random.randint(1, clients)
            depId = random.randint(1, parkings)
            # Uniform draw among all *other* parkings. Drawing from
            # 1..parkings-1 and shifting values >= depId up by one covers
            # 1..parkings without depId, so the last parking is reachable too.
            arrId = random.randint(1, parkings - 1)
            if arrId >= depId:
                arrId += 1
            depDateTime, arrDateTime = getTravelTime(startTime, currHours)
            depDateTime = depDateTime.replace(second=0, microsecond=0)
            arrDateTime = arrDateTime.replace(second=0, microsecond=0)
            travelTime = arrDateTime - depDateTime
            basePrice = getPriceBooking(travelTime)
            res.append([histId, userId, depId, arrId, depDateTime, arrDateTime, basePrice])
            histId = histId + 1
    return res

def getTravelTime(startTime, currHours):
    """Draw a random departure inside hour `currHours` of startTime's date.

    Args:
        startTime: datetime.datetime marking the earliest allowed departure.
        currHours: hour of the day the departure falls in.

    Returns:
        (depDateTime, arrDateTime): the arrival follows the departure by a
        random 1..60 minutes.
    """
    # startTime's minute only limits departures inside startTime's own hour;
    # every later hour may use the full 0..59 range.
    firstMinute = startTime.minute if currHours == startTime.hour else 0
    depMin = random.randint(firstMinute, 59)
    duration = random.randint(1, 60)
    newDay = datetime.datetime(startTime.year, startTime.month, startTime.day)
    depDateTime = newDay + datetime.timedelta(hours=currHours, minutes=depMin)
    arrDateTime = depDateTime + datetime.timedelta(minutes=duration)
    return depDateTime, arrDateTime

def getPriceBooking(travelTime):
    """Compute the price of a booking from its estimated duration.

    Args:
        travelTime: datetime.timedelta of the estimated trip duration.

    Returns:
        0.70 per minute minus a random discount, never below 2, rounded to
        2 decimals.
    """
    minutes = int(travelTime.total_seconds()/60)
    # Compare with '==': 'is' tests object identity and only works on string
    # literals by accident of interning (SyntaxWarning since Python 3.8).
    # A '+' draw gives the smaller discount (divided by 4 instead of 2).
    divisor = 4 if random.choice(["+", "-"]) == "+" else 2
    basePrice = minutes * 0.70 - random.uniform(0.3, 1.2) * minutes / divisor
    if basePrice < 2:
        basePrice = 2
    return round(basePrice, 2)
    
def generateBookings(startTime, baseDayTimes, days=60):
    """Generate bookings for `days` consecutive days starting at startTime.

    The first day starts at startTime itself; every following day starts at
    midnight. July and August days get a raised daily baseline; the other
    days are all raised or all lowered, depending on one random sign drawn
    once per call. Each simulated day is printed as a progress trace.

    Args:
        startTime: datetime.datetime of the first possible booking.
        baseDayTimes: baseline number of bookings per day.
        days: number of days to generate (default 60).

    Returns:
        pandas.DataFrame with columns id, client_id, departure_id, arrival_id,
        dep_time, estimate_arr_time, price. `id` runs from 1 to the number of
        rows.
    """
    cols = ["id", "client_id", "departure_id", "arrival_id", "dep_time", "estimate_arr_time", "price"]
    allBookings = []
    seedSign = random.choice(["+", "-"])
    currDay = startTime
    for _ in range(days):
        if currDay.month in [7, 8]:
            currBaseTimeDay = baseDayTimes + int(random.uniform(2, 4) * math.sqrt(baseDayTimes))
        elif seedSign == "+":  # '==', not 'is': identity on str literals is unreliable
            currBaseTimeDay = baseDayTimes + int(random.uniform(0.3, 1.5) * math.sqrt(baseDayTimes))
        else:
            currBaseTimeDay = baseDayTimes - int(random.uniform(3, 8) * math.sqrt(baseDayTimes))
        # extend() appends in place instead of re-copying the whole list every day.
        allBookings.extend(generateOneDayBookings(currDay, currBaseTimeDay))
        print(currDay)
        # The next day starts at midnight, whatever time of day startTime had.
        currDay = datetime.datetime(currDay.year, currDay.month, currDay.day) + datetime.timedelta(days=1)

    # generateOneDayBookings() numbers its rows from 1 for every day; renumber
    # so the id column is unique over the whole period.
    allBookings = [[newId, *row[1:]] for newId, row in enumerate(allBookings, start=1)]
    return pd.DataFrame(allBookings, columns=cols)


# Generate bookings starting on 2018-08-04 with a baseline of 347 bookings per day.
startTime = datetime.datetime.strptime("2018-08-04", "%Y-%m-%d")
bookings = generateBookings(startTime, 347)
#bookings.to_csv("bookings.csv", index=False, encoding='utf8')
# Show the generated bookings as the cell output.
bookings

2018-08-04 00:00:00
2018-08-05 00:00:00
2018-08-06 00:00:00
2018-08-07 00:00:00
2018-08-08 00:00:00
2018-08-09 00:00:00
2018-08-10 00:00:00
2018-08-11 00:00:00
2018-08-12 00:00:00
2018-08-13 00:00:00
2018-08-14 00:00:00
2018-08-15 00:00:00
2018-08-16 00:00:00
2018-08-17 00:00:00
2018-08-18 00:00:00
2018-08-19 00:00:00
2018-08-20 00:00:00
2018-08-21 00:00:00
2018-08-22 00:00:00
2018-08-23 00:00:00
2018-08-24 00:00:00
2018-08-25 00:00:00
2018-08-26 00:00:00
2018-08-27 00:00:00
2018-08-28 00:00:00
2018-08-29 00:00:00
2018-08-30 00:00:00
2018-08-31 00:00:00
2018-09-01 00:00:00
2018-09-02 00:00:00
2018-09-03 00:00:00
2018-09-04 00:00:00
2018-09-05 00:00:00
2018-09-06 00:00:00
2018-09-07 00:00:00
2018-09-08 00:00:00
2018-09-09 00:00:00
2018-09-10 00:00:00
2018-09-11 00:00:00
2018-09-12 00:00:00
2018-09-13 00:00:00
2018-09-14 00:00:00
2018-09-15 00:00:00
2018-09-16 00:00:00
2018-09-17 00:00:00
2018-09-18 00:00:00
2018-09-19 00:00:00
2018-09-20 00:00:00
2018-09-21 00:00:00
2018-09-22 00:00:00


Unnamed: 0,id,client_id,departure_id,arrival_id,dep_time,estimate_arr_time,price
0,1,2219,92,16,2018-08-04 00:11:00,2018-08-04 00:57:00,8.28
1,2,2701,139,62,2018-08-04 00:18:00,2018-08-04 00:23:00,2.00
2,3,1947,2,36,2018-08-04 00:47:00,2018-08-04 01:33:00,4.74
3,4,1389,125,35,2018-08-04 00:52:00,2018-08-04 01:36:00,21.95
4,5,681,68,165,2018-08-04 00:01:00,2018-08-04 00:03:00,2.00
5,6,2927,111,144,2018-08-04 00:13:00,2018-08-04 00:16:00,2.00
6,7,2079,129,98,2018-08-04 00:00:00,2018-08-04 00:15:00,3.30
7,8,1125,28,124,2018-08-04 01:40:00,2018-08-04 02:33:00,29.82
8,9,526,63,131,2018-08-04 01:17:00,2018-08-04 02:10:00,23.07
9,10,2968,2,99,2018-08-04 01:38:00,2018-08-04 02:07:00,6.69


In [4]:
# Write the generated bookings to bookings.csv, without the DataFrame index.
bookings.to_csv("bookings.csv", index=False,encoding='utf8')

In [138]:
def getWeatherSeed(currDate):
    """Translate the weather of a day into bonuses for the lateness score.

    The row of `weathers` whose DATE equals currDate (formatted YYYY-MM-DD) is
    read; its PRECIP_TOTAL_DAY_MM and VISIBILITY_AVG_KM values are printed and
    mapped to two integer bonuses:

        precipitation (mm):  (0.2, 1.0) -> 200, [1.0, 5.0) -> 300,
                             >= 5.0 -> 550, otherwise 0
        visibility (km):     (6.0, 8.0) -> 50, (5.0, 6.0] -> 100,
                             <= 5.0 -> 170, otherwise 0

    Args:
        currDate: datetime.datetime of the day to look up.

    Returns:
        (precip_stat, visib_stat)

    Raises:
        ValueError: if `weathers` has no row for that date.
    """
    dateKey = datetime.datetime.strftime(currDate, '%Y-%m-%d')
    getLine = weathers.loc[weathers['DATE'] == dateKey]
    if getLine.empty:
        raise ValueError("no weather record for " + dateKey)
    # Take the scalar explicitly: float() on a one-element Series is
    # deprecated in recent pandas versions.
    precipitation = float(getLine['PRECIP_TOTAL_DAY_MM'].iloc[0])
    visibility = float(getLine['VISIBILITY_AVG_KM'].iloc[0])
    print(str(precipitation) + " | " + str(visibility))

    precip_stat = 0
    if 0.2 < precipitation < 1.0:
        precip_stat = 200
    elif 1.0 <= precipitation < 5.0:
        precip_stat = 300
    elif precipitation >= 5.0:
        precip_stat = 550

    visib_stat = 0
    if 6.0 < visibility < 8.0:
        visib_stat = 50
    elif 5.0 < visibility <= 6.0:
        visib_stat = 100
    elif visibility <= 5.0:
        visib_stat = 170
    return precip_stat, visib_stat

# Spot check: weather bonuses for 2 October 2017.
getWeatherSeed(datetime.datetime(2017, 10, 2))

2.3 | 9.75


(300, 0)

In [6]:
# Weather table: preview the first five rows.
weathers.head(5)


Unnamed: 0,DATE,MAX_TEMPERATURE_C,MIN_TEMPERATURE_C,WINDSPEED_MAX_KMH,TEMPERATURE_MORNING_C,TEMPERATURE_NOON_C,TEMPERATURE_EVENING_C,PRECIP_TOTAL_DAY_MM,HUMIDITY_MAX_PERCENT,VISIBILITY_AVG_KM,PRESSURE_MAX_MB,CLOUDCOVER_AVG_PERCENT,HEATINDEX_MAX_C,DEWPOINT_MAX_C,WINDTEMP_MAX_C,WEATHER_CODE_MORNING,WEATHER_CODE_NOON,WEATHER_CODE_EVENING,OPINION
0,2017-01-01,3,-1,14,-2,2,0,0.0,67,10.0,1028,24.0,3,0,-5,113,113,116,météo très défavorable
1,2017-01-02,4,2,10,1,3,2,0.3,97,8.0,1029,64.5,4,2,0,122,143,113,météo très défavorable
2,2017-01-03,5,1,9,0,4,2,0.0,94,9.0,1031,12.125,5,2,-1,143,113,113,météo très défavorable
3,2017-01-04,6,3,18,2,5,4,0.6,96,7.0,1028,89.875,6,5,-1,122,266,143,météo très défavorable
4,2017-01-05,6,1,13,1,4,3,0.0,95,8.0,1036,24.125,6,3,-2,116,143,113,météo très défavorable


In [7]:
# Parking table: load new_parking.csv and preview the first five rows.

parking = pd.read_csv("new_parking.csv")
parking.head(5)

Unnamed: 0.1,Unnamed: 0,ID,NOM_PARC,ADRESS_GEO,Arrdt,TEL,geo_point_2d
0,161,1,AMPERE,93 TER RUE AMPERE,17,01 43 80 73 81,"48.885238365, 2.29863751292"
1,169,2,MALESHERBES ANJOU,20 TER BOULEVARD MALESHERBES,8,ND,"48.8724867829, 2.32181194243"
2,132,3,ECOLE DE MEDECINE,8 TER RUE DE L ECOLE DE MEDECINE,6,01 43 29 61 38,"48.8509460984, 2.34112737464"
3,13,4,VILLIERS,14 AVENUE DE VILLIERS,17,01 47 63 44 91,"48.8819558206, 2.3142426079"
4,111,6,VENDOME,26 TER PLACE VENDOME,1,01 42 60 50 00,"48.86789756, 2.33019790757"
