In [3]:
import requests
import datetime
import pandas as pd
from tqdm import tqdm

def scrape_trips(origin,destination,date,timeout=60):
    """
    Query the ONCF availability endpoint for one origin/destination/date.

    Parameters
    ----------
    origin, destination : station ids; they are sent as JSON strings.
    date : departure day formatted as YYYY-MM-DD; the fixed time suffix
        'T00:01:13+00:00' is appended to it.
    timeout : seconds passed to requests.post so that a stalled connection
        raises instead of blocking the scrape forever.

    Returns
    -------
    bytes : the raw response body (response.content). The HTTP status code
        is not checked.

    Raises
    ------
    requests.RequestException : on connection failure or timeout.
    """
    # Local import: the notebook only imports json in a later cell.
    import json

    url = 'https://www.oncf-voyages.ma:8443/availability'
    # Headers copied from a browser session. Content-Length is intentionally
    # not listed: it depends on the body and is computed by requests.
    headers_str = '''Accept: application/json, text/plain, */*
    Accept-Encoding: gzip, deflate, br
    Accept-Language: en-FR,en;q=0.9,fr-FR;q=0.8,fr;q=0.7,en-US;q=0.6
    Connection: keep-alive
    Content-Type: application/json
    Host: www.oncf-voyages.ma:8443
    Origin: https://www.oncf-voyages.ma
    Referer: https://www.oncf-voyages.ma/
    Sec-Fetch-Dest: empty
    Sec-Fetch-Mode: cors
    Sec-Fetch-Site: same-site
    User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36
    X-XSRF-TOKEN: null
    sec-ch-ua: "Google Chrome";v="111", "Not(A:Brand";v="8", "Chromium";v="111"
    sec-ch-ua-mobile: ?0
    sec-ch-ua-platform: "Windows"'''
    # Split the headers into a dictionary (only the first ':' separates the
    # name from the value, values may contain ':' themselves)
    headers_dict = {}
    for header in headers_str.split('\n'):
        key, separator, value = header.partition(':')
        if separator:
            headers_dict[key.strip()] = value.strip()
    # Define the body as a dict and let json.dumps do the quoting, so that
    # special characters in the arguments cannot produce invalid JSON.
    payload = {
        "origin": str(origin),
        "destination": str(destination),
        "originDate": str(date) + "T00:01:13+00:00",
        "intervalTime": "",
        "adulte": 1,
        "kids": 0,
        "comfort": 2,
        "reducedTariff": {"0": {"code": "", "priceCode": "", "birthday": "", "claimCode": ""}},
        "destinationDate": None,
        "intervalTime-originDate": {"end": "06:00", "start": "00:01", "title": "Nuit", "value": 0, "disabled": False},
        "roundtrip": False,
        "_csrf": None,
    }
    # Compact separators reproduce the exact byte layout of the browser request.
    body = json.dumps(payload, separators=(',', ':'), ensure_ascii=False).encode('utf-8')
    # Send the POST request with the headers and body
    response = requests.post(url, headers=headers_dict, data=body, timeout=timeout)
    return response.content
    

In [6]:
# Load the station names together with their numeric ids
stations = pd.read_csv('stations.csv')
stations

Unnamed: 0,id,station_name
0,796,ADDAKHLA
1,190,AEROPORT Med V
2,745,AGADIR (SUPRAT.)
3,832,AGDZ (SUPRAT.)
4,884,AIN DEFALI
...,...,...
141,413,TOUABAA
142,77,YOUSSOUFIA
143,762,ZAG
144,834,ZAGORA (SUPRAT.)


In [16]:
from itertools import combinations
#This takes a list of stations you want to scrape and creates every possible combination of pairs.

stations_list=['Casa Oasis','FES','RABAT VILLE','TANGER VILLE','MARRAKECH',
                                       'AGADIR  (SUPRAT.)','SALE','LAAYOUNE']

# Names are compared with runs of whitespace collapsed, so that a stray
# double space (as in 'AGADIR  (SUPRAT.)') does not silently drop a station.
normalized_names=stations['station_name'].astype(str).str.split().str.join(' ')
wanted_names={' '.join(name.split()) for name in stations_list}

# isin() ignores names it cannot find; report them instead of scraping
# fewer pairs than intended without any notice.
missing_names=sorted(wanted_names - set(normalized_names))
if missing_names:
    print('Warning: no station id found for {}; these stations are left out.'.format(missing_names))

ids_list=list(stations[normalized_names.isin(wanted_names)].id)
ids_combinations=list(combinations(ids_list, 2))


In [6]:
def generate_date_array(start_date_str, end_date_str):
    """
    Build the list of consecutive days between two dates (both included),
    to be used on the request for trips.

    Both arguments and every returned element are strings formatted as
    YYYY-MM-DD. The list is empty when the start date is after the end date.
    """
    day_format = "%Y-%m-%d"
    first_day = datetime.datetime.strptime(start_date_str, day_format).date()
    last_day = datetime.datetime.strptime(end_date_str, day_format).date()

    # Number of day steps between the two bounds; negative when reversed,
    # in which case range() below yields nothing.
    day_span = (last_day - first_day).days

    return [
        (first_day + datetime.timedelta(days=offset)).strftime(day_format)
        for offset in range(day_span + 1)
    ]

In [7]:
# Change the dates according to your needs (both bounds are included)
start_date, end_date = "2023-04-07", "2023-04-23"
dates = generate_date_array(start_date, end_date)
dates

['2023-04-07',
 '2023-04-08',
 '2023-04-09',
 '2023-04-10',
 '2023-04-11',
 '2023-04-12',
 '2023-04-13',
 '2023-04-14',
 '2023-04-15',
 '2023-04-16',
 '2023-04-17',
 '2023-04-18',
 '2023-04-19',
 '2023-04-20',
 '2023-04-21',
 '2023-04-22',
 '2023-04-23']

In [8]:
#This is the main scraping block: it uses the previous functions to scrape the data and saves it to responses.csv
#Note: responses.csv is recreated (emptied) every time this cell runs.

pd.DataFrame({"origin":[],"destination":[],"response":[]}).to_csv('responses.csv',index=False)
for combination in tqdm(ids_combinations):
    origin=str(combination[0])
    destination=str(combination[1])
    for date in tqdm(dates):
        # One failed request must not abort the whole scrape: report it and
        # move on to the next date.
        try:
            response=scrape_trips(origin,destination,date)
            response_text=response.decode('utf-8')
        except (requests.RequestException, UnicodeDecodeError) as error:
            tqdm.write('Skipped {} -> {} on {}: {}'.format(origin, destination, date, error))
            continue
        # Append after every request so that an interrupted run keeps what
        # was already downloaded.
        pd.DataFrame({"origin":[origin],"destination":[destination],"response":[response_text]}).to_csv('responses.csv', mode='a',index=False, header=False)

    

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]
  0%|                                                                                           | 0/17 [00:00<?, ?it/s][A
  6%|████▉                                                                              | 1/17 [00:03<00:48,  3.05s/it][A
 12%|█████████▊                                                                         | 2/17 [00:06<00:49,  3.33s/it][A
 18%|██████████████▋                                                                    | 3/17 [00:09<00:45,  3.28s/it][A
 24%|███████████████████▌                                                               | 4/17 [00:13<00:44,  3.40s/it][A
 29%|████████████████████████▍                                                          | 5/17 [00:16<00:39,  3.31s/it][A
 35%|█████████████████████████████▎                                                     | 6/17 [00:28<01:09,  6.33s/it][A
 41%|██████████████

In [7]:
# Read back the raw responses written by the scraping cell
outdf = pd.read_csv("responses.csv")

In [8]:
import json

# Raw responses are hard to work with; this function extracts the useful information from one response.
def process_response(response):
    """
    Flatten one raw availability response into a DataFrame.

    `response` is a JSON document (as accepted by json.loads) containing
    data['availability']['departurePath']. For each path, one row is
    produced per entry of its 'tripPrices' list, except the first entry,
    which is skipped. Each row repeats the path-level fields next to the
    type, price and priceSup of that entry.

    Returns a DataFrame with the columns dateTimeDeparture, dateTimeArrival,
    journeyDuration, express, type_trip, price_trip, price_sup_trip,
    departureStationId and arrivalStationId.
    """
    column_names = (
        'dateTimeDeparture', 'dateTimeArrival', 'journeyDuration',
        'express', 'type_trip', 'price_trip', 'price_sup_trip',
        'departureStationId', 'arrivalStationId',
    )

    rows = []
    for leg in json.loads(response)['availability']['departurePath']:
        # Fields shared by every fare of this path
        departure_time = leg['dateTimeDeparture']
        arrival_time = leg['dateTimeArrival']
        duration = leg['journeyDuration']
        origin_name = leg['departureStationId']['description']['default']
        destination_name = leg['arrivalStationId']['description']['default']
        is_express = leg['express']

        # The first price entry is left out on purpose (same as tripPrices[1:])
        for fare in leg['tripPrices'][1:]:
            rows.append((
                departure_time,
                arrival_time,
                duration,
                is_express,
                fare['type'],
                fare['data']['price'],
                fare['data']['priceSup'],
                origin_name,
                destination_name,
            ))

    # Turn the row tuples back into one list per column
    return pd.DataFrame({
        name: [row[position] for row in rows]
        for position, name in enumerate(column_names)
    })

In [9]:
#This function applies the processing to all responses and augments the data with distance between cities
#Be careful: the distances must follow the order in which the station pairs first appear in the data
#If you get an error relating to the distance property, pass your own list through the `distances` argument

def process_responses(responses, distances=None):
    """
    Process every raw response and build the table used for the analysis.

    Parameters
    ----------
    responses : iterable of raw JSON responses, each handed to
        process_response. Responses that cannot be processed are skipped.
    distances : optional list with one distance per distinct
        (departureStationId, arrivalStationId) pair, in order of first
        appearance of the pair in the processed data. When omitted, the
        hard-coded list for the original set of stations is used.

    Returns
    -------
    DataFrame restricted to 'Semi Flex' trips, with parsed departure and
    arrival datetimes and the added columns day_name, hour_departure,
    hour_arrival, days_from_earliest, min_price, max_price, diff_from_min
    and distance.

    Raises
    ------
    ValueError : when no response could be processed, or when the number of
        distances differs from the number of distinct station pairs.
    """
    pair_columns = ['departureStationId', 'arrivalStationId']
    if distances is None:
        distances = [460.2,754.8,547.4,555,798.6,299,926.3,241.6,342.8,1375,532,207.1,193,305,322.9,291,574,251,238.3,73]

    # Collect the frames and concatenate once: concatenating inside the loop
    # copies the accumulated data on every iteration.
    frames = []
    for response in responses:
        try:
            df = process_response(response)
        except Exception:
            # Best effort: unusable responses (error pages, missing keys, NaN
            # cells) are skipped. Unlike a bare `except`, this lets
            # KeyboardInterrupt stop the cell.
            continue
        if not df.empty:
            frames.append(df)
    if not frames:
        raise ValueError('None of the responses could be processed into trips.')

    result_df = pd.concat(frames, ignore_index=True).drop_duplicates()

    # One row per station pair, in order of first appearance; .copy() so the
    # new column is written to an independent frame.
    pair_distances = result_df[pair_columns].drop_duplicates().copy()
    if len(distances) != len(pair_distances):
        raise ValueError(
            'Got {} distances for {} distinct station pairs; pass a matching '
            'list through the `distances` argument.'.format(len(distances), len(pair_distances))
        )
    pair_distances['distance'] = list(distances)

    result_df['dateTimeDeparture'] = pd.to_datetime(result_df['dateTimeDeparture'])
    result_df['dateTimeArrival'] = pd.to_datetime(result_df['dateTimeArrival'])
    result_df['day_name'] = result_df['dateTimeDeparture'].dt.day_name()

    # Keep a single fare type; .copy() avoids assigning columns to a slice.
    result_df = result_df[result_df['type_trip']=='Semi Flex'].copy()
    result_df['hour_departure'] = result_df['dateTimeDeparture'].dt.hour
    result_df['hour_arrival'] = result_df['dateTimeArrival'].dt.hour

    # Whole days elapsed since a fixed reference instant (hard-coded)
    reference_time = pd.Timestamp('2023-04-06 16:44:00', tz='UTC')
    result_df['days_from_earliest'] = (result_df['dateTimeDeparture'] - reference_time).dt.days

    # Cheapest and most expensive price observed for each station pair
    prices_by_pair = result_df.groupby(pair_columns)['price_trip']
    result_df['min_price'] = prices_by_pair.transform('min')
    result_df['max_price'] = prices_by_pair.transform('max')

    # Price expressed as a percentage above the cheapest price of the pair
    result_df['diff_from_min'] = ((result_df['price_trip']-result_df['min_price'])/result_df['min_price'])*100
    return pd.merge(result_df, pair_distances, on=pair_columns)
        

        

In [14]:
# Turn the stored raw responses into the analysis table
responses_processed = process_responses(outdf['response'])