# Auswertung der Parquet-Dateien aus dem Echtzeitarchiv V14

## Import der Module und Setzen der Parameter

In [2]:
import duckdb
import pandas as pd
import seaborn as sns
import sys
import openpyxl

import datetime as dt
import importlib

from dotenv import load_dotenv, dotenv_values
import logging

In [3]:
# Send all records from DEBUG upwards to the log file. The record layout uses
# "{"-style placeholders; timestamps are written as DD.MM.YYYY HH:MM:SS.
log_file = "log/log_rt.txt"
logging.basicConfig(
    filename=log_file,
    format="{asctime} [{levelname:8}] {message}",
    datefmt="%d.%m.%Y %H:%M:%S",
    style="{",
    level=logging.DEBUG,
)

# Load the dotenv settings; the return value is shown as the cell output.
load_dotenv()

True

In [4]:
sys.path.append('/home/zvbn/python/rt2')

In [5]:
from class_rt_duck import rt_duck

In [6]:
logging.info("Auswertung RT aus parquet gestartet")

In [7]:
config = dotenv_values(".env")
#config

In [8]:
pd.options.display.max_columns = 100

In [9]:
# Timestamp of this run plus the reference dates 1, 7, 14 and 21 days back,
# all kept as ISO-formatted strings.
jetzt = f"{dt.datetime.now():%Y-%m-%d %H:%M:%S}"
gestern, letzte07tage, letzte14tage, letzte21tage = (
    (dt.date.today() - dt.timedelta(days=tage_zurueck)).strftime('%Y-%m-%d')
    for tage_zurueck in (1, 7, 14, 21)
)

print(jetzt, letzte21tage)

2025-01-10 19:29:26 2024-12-20


## Funktionen

In [10]:
def replace_german_special_characters(text):
    """Return *text* with German umlauts and 'ß' transliterated to ASCII.

    ä/ö/ü become ae/oe/ue, the capital forms become Ae/Oe/Ue and ß becomes
    ss. All other characters are passed through unchanged.
    """
    umlaut_table = str.maketrans({
        'ä': 'ae',
        'ö': 'oe',
        'ü': 'ue',
        'Ä': 'Ae',
        'Ö': 'Oe',
        'Ü': 'Ue',
        'ß': 'ss',
    })
    return text.translate(umlaut_table)

In [26]:
# Formatting helpers for the HTML output
def func_proz(s):
    """Format the complement of the share *s* as a percent string.

    Returns '-' when *s* truncates to -1. Otherwise returns (1 - s) * 100,
    cut off after one decimal place and followed by '%'.
    """
    if int(s) == -1:
        return '-'
    return f"{int((1 - s) * 1000) / 10}%"


def func_date(s):
    """Render the datetime values of *s* (through its ``.dt`` accessor) as MM/DD/YYYY strings."""
    return s.dt.strftime('%m/%d/%Y')

## CSS Styles

In [23]:
# Cell formatting (CSS) for the styled HTML tables
cell_hover = dict(  # for row hover use <tr> instead of <td>
    selector='td:hover',
    props=[('background-color', '#ffffb3')],
)
index_names = dict(
    selector='.index_name',
    props='font-style: italic; color: darkgrey; font-weight:normal; font-family: sans-serif;',
)
headers = dict(
    selector='th:not(.index_name)',
    props='background-color: #FFFFFF; color: #000000; font-family: sans-serif;',
)

td = dict(selector='td', props='text-align:right; font-family: sans-serif')

## Testen der class

In [11]:
rt = rt_duck()
rt

<class_rt_duck.rt_duck at 0x7f7b6985cd50>

In [12]:
#Schließen der Verbindung
#rt.verbindung_schließen()

In [13]:
rt.create_table_fahrten(server = 'prod')

Table 'fahrten' created.


In [14]:
rt.create_table_zusatz(server = 'prod')
rt.create_table_verlauf(server = 'prod')
rt.create_table_matrix(server = 'prod')

Table 'zusatz' created.
Table 'verlauf' created.
Table 'matrix' created.


In [15]:
rt.cursor.sql("select distinct lineid from fahrten")

┌───────────────┐
│    lineid     │
│    varchar    │
├───────────────┤
│ de:VBN:431:   │
│ de:VBN:440:   │
│ de:VBN:5:     │
│ de:VBN:380:   │
│ de:VBN:460:   │
│ de:VBN:745:1  │
│ de:VBN:314:   │
│ de:VBN:309:   │
│ de:VBN:308:   │
│ de:VBN:94:    │
│     ·         │
│     ·         │
│     ·         │
│ de:VBN:748:2  │
│ de:VBN:544:   │
│ de:VBN:540:   │
│ de:VBN:760:3  │
│ de:VBN:14:    │
│ de:VBN:553:2: │
│ de:VBN:1E:491 │
│ de:VBN:541:   │
│ de:VBN:378:   │
│ de:VBN:E:913  │
├───────────────┤
│   744 rows    │
│  (20 shown)   │
└───────────────┘

In [45]:
# Share (in %) of trips per day and bundle for which
# realtimeHasEverBeenReported is true, restricted to the last 60 days and the
# levels '1+', '1', '2' and 'Stadt'. The PIVOT turns the bundles into columns.
sql_pivot = """ pivot(
              select datum::date as datum, buendel, 
                round((count (*) filter (realtimeHasEverBeenReported = true) / count(*) ) * 100, 1) as anteil_ez  
              from fahrten f 
                join linien l on f.lineid_short = l.dlid 
              where 
                f.datum > (current_date - interval 60 day)
              and ebene in ('1+', '1', '2', 'Stadt')
              group by all)
              on buendel
              using sum(anteil_ez)
              order by datum desc"""

# Same aggregation in long format (one row per day and bundle, datum cast to
# text); a later cell reads this string.
sql = """ 
              select datum::date::text as datum, buendel, 
                round((count (*) filter (realtimeHasEverBeenReported = true) / count(*) ) * 100, 1) as anteil_ez  
              from fahrten f 
                join linien l on f.lineid_short = l.dlid 
              where 
                f.datum > (current_date - interval 60 day)
              and ebene in ('1+', '1', '2', 'Stadt')
              group by all"""

# Output: https://daten.zvbn.de/rt_archiv/log_12_pivot_neu.html
# Here the pivot is only displayed inline, coloured with a diverging palette.
rt.cursor.sql(sql_pivot).df().style.background_gradient(cmap=sns.diverging_palette(0, 120, as_cmap=True))

Unnamed: 0,datum,AM Ost,AM Süd,AM West,BHV,DEL,DH Nordost,DH Nordwest,DH Südost,DH Südwest,HB Stadt,HB Straßenbahn,OHZ Mitte,OHZ Ost,OHZ West,OL Nord,OL Stadt,OL Südost,OL West,VER Nord,VER Ost,VER Südwest,WM Nord,WM Süd
0,2025-01-09 00:00:00,72.5,21.0,97.9,67.5,93.6,98.3,74.3,76.5,71.6,97.8,99.6,94.3,86.1,100.0,100.0,99.6,98.5,80.7,100.0,87.2,98.8,77.8,99.1
1,2025-01-08 00:00:00,85.2,17.7,100.0,67.3,95.8,100.0,95.0,84.3,100.0,97.3,99.6,90.8,82.6,100.0,100.0,99.7,98.5,98.2,100.0,86.7,95.2,87.5,99.1
2,2025-01-07 00:00:00,84.7,46.8,97.9,66.8,87.9,100.0,96.4,98.3,100.0,98.2,99.7,92.0,82.6,98.5,100.0,98.7,98.5,96.3,100.0,88.6,98.4,87.5,100.0
3,2025-01-06 00:00:00,87.3,19.4,96.9,67.5,88.7,100.0,96.4,98.2,99.3,97.3,99.3,93.1,93.0,98.5,100.0,98.1,95.4,97.2,100.0,87.7,97.2,85.4,91.9
4,2025-01-05 00:00:00,100.0,13.3,100.0,60.5,98.6,100.0,100.0,95.0,100.0,96.7,99.9,90.6,75.8,100.0,,100.0,0.0,100.0,100.0,100.0,100.0,40.0,100.0
5,2025-01-04 00:00:00,95.3,6.9,100.0,60.1,94.5,100.0,100.0,98.8,100.0,96.1,100.0,86.7,68.4,96.8,100.0,99.9,100.0,100.0,100.0,87.1,81.8,80.8,100.0
6,2025-01-03 00:00:00,92.9,19.6,98.9,60.8,95.7,100.0,100.0,98.0,100.0,98.0,100.0,80.2,91.4,100.0,100.0,99.5,95.7,100.0,100.0,84.8,90.2,93.0,99.0
7,2025-01-02 00:00:00,92.9,6.5,100.0,61.1,95.6,100.0,100.0,99.0,100.0,98.1,99.5,91.9,97.1,98.5,100.0,99.3,95.8,100.0,100.0,84.8,93.7,82.0,95.8
8,2025-01-01 00:00:00,92.9,13.3,100.0,60.8,95.7,100.0,100.0,100.0,100.0,96.6,99.6,100.0,66.7,100.0,,99.8,0.0,100.0,100.0,100.0,100.0,54.3,100.0
9,2024-12-31 00:00:00,96.0,0.0,100.0,56.8,93.9,100.0,100.0,100.0,100.0,96.3,99.9,86.7,73.4,95.2,0.0,100.0,100.0,100.0,100.0,87.1,81.5,0.0,96.6


In [47]:
# Long-format result -> wide table (one column per bundle), newest day first,
# written to the web directory as a colour-coded HTML table.
df = rt.cursor.sql(sql).df()
df_pivot = df.pivot(index='datum', columns='buendel', values='anteil_ez')
(
    df_pivot.sort_values('datum', ascending=False)
    .style
    .background_gradient(cmap="RdYlGn", axis=None, vmin=0.0, vmax=95)
    .highlight_null(color='white')
    .format(formatter='{:.1f}%', precision=1, na_rep='-', thousands=" ", decimal=',')
    .set_table_styles([index_names, headers, td])
    .to_html('/var/www/rt_archiv/log_12_pivot_neu.html', encoding='LATIN1')
)

In [17]:
rt.anzahl_fahrten_betreiber()

Unnamed: 0,vu,count_ges,count_rt,heute_minus_1_ges,heute_minus_1_rt,anteil_heute_minus_1,heute_minus_2_ges,heute_minus_2_rt,anteil_heute_minus_2,heute_minus_3_ges,heute_minus_3_rt,anteil_heute_minus_3
0,Bremer Straßenbahn AG,657404,286667,5813,5729,98.5%,5832,5729,98.2%,5800,5725,98.7%
1,Verkehr und Wasser GmbH (VWG),237364,95193,2124,2115,99.6%,2122,2113,99.6%,2120,2086,98.4%
2,BREMERHAVEN BUS,156174,41884,1336,857,64.2%,1336,855,64.0%,1336,847,63.4%
3,KVG Stade GmbH & Co. KG,133433,4881,1074,134,12.5%,1074,139,12.9%,1074,141,13.1%
4,Verkehrsbetriebe Oldenburger Land,125606,46361,1457,885,60.7%,1456,1204,82.7%,1464,1228,83.9%
5,Eisenbahnen und Verkehrsbetriebe Elbe-Weser GmbH,68358,10771,781,275,35.2%,782,282,36.1%,783,282,36.0%
6,NordWestBahn,55552,23880,437,433,99.1%,437,437,100.0%,503,499,99.2%
7,Delbus GmbH & Co. KG,53264,21674,453,424,93.6%,453,434,95.8%,453,398,87.9%
8,Reisedienst von Rahden GmbH & Co. KG,46358,18228,578,529,91.5%,581,520,89.5%,580,525,90.5%
9,Weser-Ems-Bus Betrieb Bremen,44637,11402,550,330,60.0%,549,321,58.5%,549,329,59.9%


In [20]:
rt.anzahl_fahrten_betreiber().to_html('/var/www/rt_archiv/anzahl_fahrten_betreiber.html', encoding='LATIN1')

## Ermitteln der Fahrten, die nur 0 Min senden

In [15]:
# Trips that only ever report a delay of 0 minutes:
# inner query  - average departure delay per operating day, line and trip
#                number over all rows that have realtime and a departure delay;
# middle query - keep the days whose average is exactly 0, limited to
#                ex_lineid values matching 'de:VBN:6__:%';
# outer query  - first/last such day and their number per line and trip number.
# The result is written to an Excel file.
rt.cursor.sql("""select ex_lineid, fnr,min(operday) as start, max(operday) as ende ,count(*) as count from (
                select * from 
                    (select operday, ex_lineid, fnr, avg(dep_del) as avg_del
                    from verlauf 
                    where dep_del is not null
                    and has_rt = true
                    group by all)
                where avg_del = 0 and ex_lineid like 'de:VBN:6__:%'
                order by ex_lineid) 
              
              group by all
              order by count desc
              """).df().to_excel('out/verlauf_0min.xlsx', index=False)

In [16]:
rt.cursor.sql("select * from verlauf where fnr = '1630018'").df().to_excel('out/verlauf_1630018.xlsx', index=False)

In [None]:
suffix = 'mitte'
# Alternative selection: auswahl_linien = '680|660|N68'
auswahl_linien = '630|670|N63|N67'

# Per line and trip number over the last 28 days: number of trips, number of
# trips without realtime and their share. Only trips that lacked realtime
# more than once are kept, worst share first.
df_auswahl_ohne_rt = rt.cursor.sql(f"""select * from
              (select lineshort, min(datum)::date as min_datum, max(datum)::date as max_datum, fnr, 
              count(* ) as anzahl, 
              count(* ) filter (hasRealtime  = false ) as anzahl_ohne_rt, 
              anzahl_ohne_rt / count(* ) as proz_ohne_rt
              from fahrten  
              where lineid  SIMILAR TO '.*({auswahl_linien}).*' 
              -- and hasRealtime  = false 
              and datum  >= (current_date() - interval 28 days)
              group by all
              )
              where anzahl_ohne_rt > 1
              order by proz_ohne_rt desc

              -- limit 10""").df()

# All additional trips ("zusatz") of the selected lines.
df_zusatz = rt.cursor.sql(f"""select * from zusatz where lineid  SIMILAR TO 'de:VBN:.*({auswahl_linien}).*' """).df()

ohne_rt_xl = f"out/rt_ohne_realtime_{suffix}.xlsx"
sn01 = 'ohne_rt'
with pd.ExcelWriter(ohne_rt_xl, engine='openpyxl') as writer:
    df_auswahl_ohne_rt.to_excel(writer, sheet_name=sn01, index=False)
    worksheet = writer.sheets[sn01]
    # Upper-case cell reference, consistent with every other sheet written
    # in this notebook.
    worksheet.freeze_panes = 'A2'

    worksheet.column_dimensions['B'].width = 15
    worksheet.column_dimensions['C'].width = 15
    worksheet.auto_filter.ref = worksheet.dimensions

    # Column letters follow the SELECT list above:
    # B = min_datum and C = max_datum (dates), G = proz_ohne_rt (fraction
    # displayed as a percentage).
    column_formats = {'B': 'YYYY-MM-DD', 'C': 'YYYY-MM-DD', 'G': '0.0%'}
    for column_letter, number_format in column_formats.items():
        for cell in worksheet[column_letter]:
            if cell.row == 1:  # skip the header row
                continue
            cell.number_format = number_format

df_zusatz

In [None]:
# Per line (lineshort) over the last 28 days: first/last day, number of trips,
# number of trips without realtime and their share. Only lines with more than
# one trip without realtime are kept, worst share first.
df_linien_quote_rt = rt.cursor.sql(f"""select * from
              (select lineshort, min(datum)::date as min_datum, max(datum)::date as max_datum,  
              count(* ) as anzahl, 
              count(* ) filter (hasRealtime  = false ) as anzahl_ohne_rt, 
              anzahl_ohne_rt / count(* ) as proz_ohne_rt
              from fahrten  
                                   
                                   
             where 
              -- and hasRealtime  = false 
              datum  >= (current_date() - interval 28 days)
              group by all
              )
              where anzahl_ohne_rt > 1
              order by proz_ohne_rt desc

              -- limit 10""").df()

df_linien_quote_rt

In [None]:
df_zusatz

### Auswertung Matrix nach Verlauf Zeitpunkt der Meldung

In [20]:
# Messages from the matrix table for line 'RS3' (cancelled stops excluded),
# each left-joined to the verlauf row with the same operating day, line,
# trip number and station name. The reported delays (matrix) and the
# arr_del/dep_del values (verlauf) therefore end up side by side, ordered by
# day, line, trip, stop index and message timestamp.
df_matrix = rt.cursor.sql("""select m.operatingDay::date, m.lineShortName, m.journeyId, v.index, 
                          m.stationName, m.scheduleDeparture,m.delay_minutes_arrival, m.delay_minutes_departure, m.timestamp, v.arr_del, v.dep_del
                from matrix m
                left join verlauf v on 
                          m.operatingDay = v.operday and 
                          m.lineShortName = v.lineshortname and 
                          m.journeyId = v.fnr and 
                          m.stationName = v.station_name
                where stop_cancelled = false
              and m.lineShortName = 'RS3'
              order by  m.operatingDay, m.externalLineId, m.journeyId, v.index,  m.timestamp 
              
              -- limit 20""").df()

In [None]:
# Redefines auswahl_linien and df_zusatz: distinct additional trips (day,
# line, trip number, operator) of the selected lines within the last 30 days.
# Later cells merge this frame on ['datum', 'fnr'].
auswahl_linien = '630|670|N68|N63|N67'
df_zusatz = rt.cursor.sql(f"""
                select datum::date as datum, lineshort,lineid ,fnr,  vu 
                from zusatz 
                where                       

                    lineid SIMILAR TO 'de:VBN:.*({auswahl_linien}).*' and 
                    -- and vu like 'Reisedienst von Rahden%' 
                    datum::date >= (current_date - interval 30 day)
                group by all 
                order by lineshort, fnr """).df()

df_zusatz

# Unaggregated variant of the query above, kept for reference:
#rt.cursor.sql(f"""select * from zusatz where lineid  SIMILAR TO 'de:VBN:.*({auswahl_linien}).*' and datum::date >= (current_date - interval 30 day)""").df()

In [None]:
# Covered date range and total number of rows in the fahrten table.
rt.cursor.sql("select min(datum )::date as min_date, max(datum)::date as max_date, count(*) as anzahl from fahrten")

In [24]:
rt.create_vw_buendel('TN 5 CUX')

In [None]:
rt.cursor.sql("select * from vw_buendel").df()

### Häufung von Fahrten ohne Echtzeit

In [26]:
# Trips of the current vw_buendel view that had no realtime during the last
# 30 days: one row per day, level, line and trip number.
df_fahrten_ohne_ez = rt.cursor.sql("""
              
                select datum::date as datum, ebene, lineshort , fnr, hasrealtime
               
                from vw_buendel 
                where datum >= (current_date - interval 30 day) and hasrealtime = false
                group by all
                order by ebene, lineshort, fnr
    
              """).df()

# Attach additional trips with the same day and trip number. Rows without a
# match keep missing values in the columns coming from df_zusatz (which must
# have been built by an earlier cell).
df_fahrten_ohne_ez_zusatz = df_fahrten_ohne_ez.merge(df_zusatz, on=['datum', 'fnr'], how='left')

# Per line and trip number: first/last day without realtime and the number of
# such days, most frequent first.
(
    df_fahrten_ohne_ez_zusatz[['lineshort_x', 'datum', 'fnr']]
    .groupby(['lineshort_x', 'fnr'], as_index=False)
    .agg(datum_min=('datum', 'min'), datum_max=('datum', 'max'), count=('datum', 'count'))
    .sort_values('count', ascending=False)
    .to_excel('out/rt_fahrten_ohne_ez_zusatz.xlsx', index=False)
)

In [None]:
df_fahrten_ohne_ez_zusatz.query("~vu.isnull()")

In [None]:
interval_auswertung = 21
df_fahrten_mit_nicht_vollstaendiger_echtzeit = rt.cursor.sql(f"""
              select * from 
                (select ebene, lineshort , fnr, count(*) as anz, count(*) filter (hasRealtime) as anz_rt, 
                (anz - anz_rt) as f_ohne_rt ,round(anz_rt/anz,2) as quote,
                max(datum::date) filter (hasRealtime) as letzte_lieferung_echtzeit
                from vw_buendel 
                where datum >= (current_date - interval {interval_auswertung} day)
                group by all
                order by ebene, lineshort, fnr)
              where f_ohne_rt > 1 and ebene in ('1+','1', '2') 
                order by f_ohne_rt desc                                                             
            
              """).df()

df_fahrten_mit_nicht_vollstaendiger_echtzeit

In [29]:
xl = 'out/nicht_vollstaendig.xlsx'
sn01 = '01 fahrten_rt_kl_100_roz'
sn02 = '02 zusatzfahrten'
sn03 = '03 ohne ez merge zusatz'

# One sheet per data frame, each with a frozen header row and an auto filter
# spanning exactly the cells that were written. The frames differ in width,
# so a fixed column range would not fit all of them.
with pd.ExcelWriter(xl, engine='openpyxl') as writer:
    for sheet_name, frame in (
        (sn01, df_fahrten_mit_nicht_vollstaendiger_echtzeit),
        (sn02, df_zusatz),
        (sn03, df_fahrten_ohne_ez_zusatz),
    ):
        frame.to_excel(writer, index=False, sheet_name=sheet_name)
        ws = writer.book[sheet_name]
        ws.freeze_panes = 'A2'
        ws.auto_filter.ref = ws.dimensions


In [None]:
# Previous calendar month (first to last day), per day and line of
# vw_buendel: number of trips, trips with realtime, their share and the
# latest datum that had realtime.
q = rt.cursor.sql("""
                   (select 
                    datum::date as datum, ebene, lineshort, lineid_short, count(*) anz,
                    count(*) filter (hasRealtime) anz_rt, round(anz_rt/ anz,2) anteil_rt, 
                    max(datum) filter (hasRealtime) letzte_lieferung
                    from vw_buendel 
                    where datum >= date_trunc('month', (date_trunc('month',current_date) - interval 1 day)::date)
                    and datum <= (date_trunc('month',current_date) - interval 1 day)::date
                  
                    group by all

                    order by datum::date)
                  """)
#q.filter("lineshort in ('S35', '350')")  # simple follow-up queries via filter

q

In [None]:
# Query for the previous calendar month: realtime share per line and level,
# with one column per day (PIVOT on datum).
q_pivot_lm = rt.cursor.sql("""
                    pivot (select 
                            datum::date as datum, ebene, lineshort, lineid_short, count(*) anz,
                            count(*) filter (hasRealtime) anz_rt, round(anz_rt/ anz,2) anteil_rt
                        from vw_buendel 
                        where datum >= date_trunc('month', (date_trunc('month',current_date) - interval 1 day)::date)
                            and datum <= (date_trunc('month',current_date) - interval 1 day)::date
                        group by all
                        )
                    on datum
                    using sum(anteil_rt)
                    group by lineshort, ebene
                    order by ebene, lineshort""")

# Days without any value are shown as '-'.
q_pivot_lm.df().fillna('-')

## Ausgabe je Bündel als html

In [32]:
list_buendel = rt.cursor.sql("select distinct buendel from linien where buendel not in ('nahsh')").df()['buendel'].to_list()

In [33]:
# Cell formatting (CSS) for the per-bundle pages; same selectors as the
# earlier style cell, here with explicit font sizes.
cell_hover = dict(  # for row hover use <tr> instead of <td>
    selector='td:hover',
    props=[('background-color', '#ffffb3')],
)
index_names = dict(
    selector='.index_name',
    props='font-style: italic; color: darkgrey; font-weight:normal; font-family: sans-serif; font-size: 15px;',
)
headers = dict(
    selector='th:not(.index_name)',
    props='background-color: #FFFFFF; color: #000000; font-family: sans-serif; font-size: 15px;text-orientation: upright;',
)

td = dict(selector='td', props='text-align:right; font-family: sans-serif; font-size: 14px;')

In [None]:
rt.cursor.sql(f"""describe fahrten""")

In [None]:
# Formatting helpers (same definitions as near the top of the notebook; they
# are not used inside this cell).
func_proz = lambda s: str(int((1-s) * 1000)/10) + '%' if str(int(s)) != '-1' else '-'
func_date = lambda s: s.dt.strftime('%m/%d/%Y')

# Length of the evaluation window in days.
interval_auswertung = 21

# The page template is identical for every bundle, so it is read only once.
# NOTE(review): template and combined page use the platform default
# encoding, as before — confirm this matches what the web server expects.
with open('html/template.html', 'r') as file:
    html_template = file.read()

for b in list_buendel:
    # File-name friendly form of the bundle name: transliterated umlauts,
    # underscores instead of blanks, lower case.
    b_slug = replace_german_special_characters(b).replace(' ', '_').lower()
    print(b, b.replace(' ', '_').lower(), b_slug)

    rt.create_vw_buendel(b)

    # Realtime share per line and level with one column per day of the
    # evaluation window.
    q_pivot_lm = rt.cursor.sql(f"""
                        pivot (select 
                                datum::date as datum, ebene, lineshort, lineid_short, count(*) anz,
                                count(*) filter (hasRealtime) anz_rt, round(anz_rt/ anz,2) anteil_rt
                            from vw_buendel 
                            where datum >= (current_date - interval {interval_auswertung} day)
                            group by all
                            )
                        on datum
                        using sum(anteil_rt)
                        group by lineshort, ebene
                        order by ebene, lineshort""")
    # Run the pivot query once and reuse the frame for HTML and Excel.
    df_pivot_lm = q_pivot_lm.df()

    # Trips (level, line, trip number) that ran without realtime more than
    # once within the evaluation window.
    df_fahrten_mit_nicht_vollstaendiger_echtzeit = rt.cursor.sql(f"""
                select * from 
                    (select ebene, lineshort , fnr, count(*) as anz, count(*) filter (hasRealtime) as anz_ez, 
                    (anz - anz_ez) as fahrten_ohne_ez ,round(anz_ez/anz,2) as quote,
                    max(datum::date) filter (hasRealtime) as letzte_lieferung_echtzeit
                    from vw_buendel 
                    where datum >= (current_date - interval {interval_auswertung} day)
                    group by all
                    order by ebene, lineshort, fnr)
                where fahrten_ohne_ez > 1 and ebene in ('1+','1', '2','Nacht') 
                    order by fahrten_ohne_ez desc                                                             
                
                """).df()

    # Every day/trip combination without realtime in the evaluation window.
    df_fahrten_ohne_ez = rt.cursor.sql(f"""              
                select datum::date as datum, ebene, lineshort , fnr, hasrealtime               
                from vw_buendel 
                where datum >= (current_date - interval {interval_auswertung} day) and hasrealtime = false
                group by all
                order by ebene, lineshort, fnr
    
              """).df()

    # All rows of the bundle view in the evaluation window (Excel export).
    df_fahrten_gesamt = rt.cursor.sql(f"""              
                select *               
                from vw_buendel 
                where datum >= (current_date - interval {interval_auswertung} day) 
                order by ebene, lineshort, fnr
    
              """).df()

    # Trips without realtime that share day and trip number with an
    # additional trip.
    # NOTE(review): df_zusatz is taken from an earlier cell, where it is
    # restricted to a hand-picked set of lines — confirm that this selection
    # is intended for every bundle.
    html_zusatz_table = 'html/pre_zusatz.html'
    df_fahrten_ohne_ez_zusatz = df_fahrten_ohne_ez.merge(df_zusatz, on=['datum', 'fnr'], how='left')
    df_zusatz_treffer = df_fahrten_ohne_ez_zusatz.query("~vu.isnull()")
    df_zusatz_treffer.to_html(html_zusatz_table, index=False)

    html_pre_table = 'html/pre_table.html'
    df_fahrten_mit_nicht_vollstaendiger_echtzeit.to_html(html_pre_table, index=False)

    html_pre_pivot = 'html/pre_pivot.html'
    df_pivot_lm.style.background_gradient(cmap="RdYlGn", axis=None, vmin=0.5, vmax=1)\
        .format(precision=2, na_rep='-', thousands=" ")\
        .highlight_null(color='white')\
        .set_table_styles([index_names, headers, td])\
        .to_html(html_pre_pivot)

    # Read the rendered pivot back in. Styler.to_html writes UTF-8 unless
    # told otherwise, so the file is decoded with the same encoding.
    with open(html_pre_pivot, 'r', encoding='utf-8') as file:
        html_pivot = file.read()

    # Insert the HTML fragments into the template
    title = f"Echtzeitquote Bündel {b} je Linie erstellt: {dt.datetime.now().strftime('%d.%m.%Y %H:%M')}"
    html_page = html_template.replace('{{ html_pivot }}', html_pivot).replace('{{ html_title }}', title)

    if df_fahrten_mit_nicht_vollstaendiger_echtzeit.shape[0] > 0:
        html_page = html_page.replace('{{ html_table }}', df_fahrten_mit_nicht_vollstaendiger_echtzeit.to_html(index=False))
    else:
        html_page = html_page.replace('{{ html_table }}', "Keine Häufung Fahrten ohne Echtzeit")

    if df_zusatz_treffer.shape[0] > 0:
        html_page = html_page.replace('{{ html_table_zusatz }}', df_zusatz_treffer.to_html(index=False))
    else:
        html_page = html_page.replace('{{ html_table_zusatz }}', "Keine Zusatzfahrten mit gleicher Fahrtnummer")

    # Save the combined HTML page
    html_combined = f"/var/www/rt_archiv/buendel/rt_{b_slug}.html"
    with open(html_combined, 'w') as file:
        file.write(html_page)

    # Export the key results as an Excel workbook
    xl = f"buendel_stat/{b_slug}_stat.xlsx"

    sn00 = '01 hilfe'
    sn02 = '02 statistik'
    sn03 = '03 fahrten gesamt'

    with pd.ExcelWriter(xl, engine='openpyxl') as writer:
        for sheet_name, frame, with_index in (
            (sn02, df_pivot_lm, True),
            (sn03, df_fahrten_gesamt, False),
        ):
            frame.to_excel(writer, index=with_index, sheet_name=sheet_name)
            ws = writer.book[sheet_name]
            ws.freeze_panes = 'A2'
            # Filter over exactly the written cells; the pivot has one
            # column per day, so a fixed range would not fit.
            ws.auto_filter.ref = ws.dimensions

    # Re-open the workbook and add the help sheet at the first position
    wb = openpyxl.load_workbook(xl)
    wb.create_sheet(sn00, index=0)
    sheet = wb[sn00]
    sheet['A1'] = f"Erstellt: {dt.datetime.now().strftime('%Y-%m-%d %H:%M')}"
    sheet['A2'] = "Erläuterung der Werte in der Tabelle"
    sheet['A3'] = f"Blatt {sn02} enthält die Echtzeitquote der Linien des Bündels {b} für die letzten {interval_auswertung} Tage"

    wb.save(xl)

In [None]:
q_pivot_lm.df()

In [None]:
rt.cursor.sql(f"""
                select * from 
                    (select ebene, lineshort , fnr, count(*) as anz, count(*) filter (hasRealtime) as anz_ez, 
                    (anz - anz_ez) as fahrten_ohne_ez ,round(anz_ez/anz,2) as quote,
                    max(datum::date) filter (hasRealtime) as letzte_lieferung_echtzeit
                    from vw_buendel 
                    where datum >= (current_date - interval {interval_auswertung} day)
                    group by all
                    order by ebene, lineshort, fnr)
                where fahrten_ohne_ez > 1 and ebene in ('1+','1', '2','Nacht') 
                    order by fahrten_ohne_ez desc                                                             
                
                """).df().shape[0]

In [None]:
html_template

In [None]:
# NOTE(review): other cells call .to_html() directly on the return value of
# rt.anzahl_fahrten_betreiber(). Whether that object also offers .df() cannot
# be told from this notebook — verify against class_rt_duck.
rt.anzahl_fahrten_betreiber().df()

In [34]:
# Record the total number of trips in the log file.
logging.info("Anzahl Fahrten gesamt %s", rt.anzahl_fahrten())

## Ohne class

In [35]:
con = duckdb.connect()

In [36]:
# Install and load DuckDB's postgres extension, then attach the PostgreSQL
# database zvbn_postgis read-only under the alias db_dm.
# NOTE(review): user and password from the dotenv config are interpolated
# unquoted into the connection string; values containing blanks or quotes
# would presumably break it — confirm.
con.sql(f"""INSTALL postgres;
LOAD postgres;
ATTACH 'dbname=zvbn_postgis user={config['POSTGRES_USER']} host=127.0.0.1 password={config['POSTGRES_PW']}' AS db_dm (TYPE POSTGRES, READ_ONLY);""")

In [None]:
con.sql("create or replace table lin_buendel as select * from db_dm.basis.lin_buendel")
con.sql("select * from lin_buendel")

In [None]:
sql_lin = """
        Create or replace table linien as 
        SELECT nummer AS linie, buendel, ebene, dlid, id 
        FROM db_dm.basis.linien 
        WHERE buendel IS NOT NULL AND aktiv IS TRUE 
        ORDER BY buendel, ebene, nummer """
con.sql(sql_lin)
con.sql("select * from linien")

### Abruf der Parquet Files (Tagespakete)

In [None]:
# (Re)build one DuckDB table per kind of daily parquet package; the source
# file name is kept as an extra column and differing schemas are united by
# column name.
server = 'prod'
for tabelle in ('fahrten', 'verlauf', 'zusatz'):
    con.sql(
        f"create or replace table {tabelle} as select * from "
        f"read_parquet('out/parquet/{server}/{tabelle}*.parquet',  union_by_name = true, filename = true)"
    )

### Ermitteln und Löschen von nicht gewollten Betreibern

In [40]:
#con.sql("select distinct vu from fahrten where vu like '%Weser%'")

In [None]:
# Show the column names and types of the freshly loaded `fahrten` table.
con.sql("describe fahrten")

In [None]:
# Number of `fahrten` rows per `datum`, in date order (shows which days are present and how many rows each has).
con.sql("select count(*), datum from fahrten group by datum order by datum")

In [43]:
# Manual switch: set to True to keep only the two listed Weser-Ems-Bus operators
# in `fahrten`. Left at False so that running all cells skips the delete.
betreiber_loeschen = False

if betreiber_loeschen:
    print('Löschen von Betreibern')
    con.sql("delete from fahrten where vu not in ('Weser-Ems-Bus Betrieb Bremen', 'Weser-Ems-Bus Auftragnehmerleistungen')")
    # The corresponding deletes on `verlauf` and `zusatz` are currently commented out:
    #con.sql("delete from verlauf where vu not in ('Weser-Ems-Bus Betrieb Bremen', 'Weser-Ems-Bus Auftragnehmerleistungen')")
    #con.sql("delete from zusatz where vu not in ('Weser-Ems-Bus Betrieb Bremen', 'Weser-Ems-Bus Auftragnehmerleistungen')")

In [None]:
# Number of trips dated within the last 100 days; the single result row is
# fetched as a tuple and its only field is shown.
con.sql("select count(*) from fahrten where datum >= (current_date - interval 100 days)").fetchone()[0]

In [None]:
# Trips since the start of the 14-day window, and the total number of rows in `verlauf`.
anzahl_fahrten = con.sql(f"select count(*) from fahrten where datum >= '{letzte14tage}'").fetchone()[0]
laenge_verlauf = con.sql("select count(*) from verlauf").fetchone()[0]
print(f"""Anzahl Fahrten: {anzahl_fahrten},  Länge Verlauf: {laenge_verlauf}    """)

In [None]:
# Ad-hoc check for one day (2024-10-29): trips whose deviceid matches '%680%DBRB%',
# listed with start/end station and HH:MM start/end times.
# Two values are parsed out of the '-'-separated deviceid:
#   fnr = second part
#   m2  = third part up to '#', as int64, minus 8000000000000, divided by 1000, cast to int64
con.sql("""select 
            datum, 
            fahrtstartstationname, 
           strftime( cast(fahrtstarttime as TIMESTAMPTZ), '%H:%M') as fahrtstart,
           fahrtendstationname,
           strftime( cast(fahrtendtime as TIMESTAMPTZ), '%H:%M') as fahrtende,
            
            deviceid, 
            split_part(deviceid, '-', 2) as fnr, 
            cast(((cast(split_part(split_part(deviceid, '-', 3), '#', 1) as int64) - 8000000000000) / 1000) as int64) as m2, 
        from fahrten 
        where deviceid like '%680%DBRB%' and datum = '2024-10-29'
        order by datum, fahrtstarttime
        
        """).df()
# Optional Excel export of the result above (currently disabled):
#.to_excel('out/web.xlsx', index=False)

### Anzahl der Fahrten je Betreiber

In [None]:
# Rows of `verlauf` per journeyOperator, ascending by count.
# count(journeyOperator) ignores NULLs, so a NULL operator group shows 0.
con.sql("select journeyOperator, count(journeyOperator) as count from verlauf group by journeyOperator order by count")

### Fahrten mit hohen Verspätungen

In [None]:
# Distinct deviceids that have at least one `verlauf` row with dep_del above 100.
# NOTE(review): the unit of dep_del is not visible here - confirm before interpreting the threshold.
con.sql("select distinct deviceid from verlauf where dep_del > 100").df()

In [None]:
# Show the schema of `fahrten` again before the lineid_short column is added below.
con.sql("describe fahrten")

### Verkürzung der DLID
- Zum Teil werden bei mehreren Betreibern einer Linie TLID mit vierteiliger DLID geliefert
- Verkürzung ermöglicht die Verknüpfung mit Liste aus DM

In [None]:
# SQL expression that keeps only the first three ':'-separated parts of lineid.
dlid_kurz = "concat_ws(':', split_part(lineid,':', 1), split_part(lineid,':', 2), split_part(lineid,':', 3))"

# Add the column if it does not exist yet, then fill it for every row.
con.sql("alter table fahrten add column if not exists lineid_short VARCHAR")
con.sql(f"""update fahrten 
        set lineid_short = {dlid_kurz}""")

# Show each original lineid next to its shortened form as the cell output.
con.sql(f"""select distinct lineid, 
        {dlid_kurz} 
        from fahrten""")

### Über HIM gemeldete Ausfälle (ts_reported_cancelled gefüllt)

In [51]:
# Trips of the last 14 days with a non-empty ts_reported_cancelled.
# Result columns: vu, fnr, ts_reported_cancelled, journey_cancelled.
df_fahrten_ausfall_him = con.sql(f"""
                              select vu, fnr, ts_reported_cancelled, journey_cancelled 
                              from fahrten f 
                              where ts_reported_cancelled != '' and f.datum >= '{letzte14tage}'""").df()

### Echtzeitquote

#### nach Linie und Betreiber

In [None]:
# Realtime share (percent, one decimal) per day, operator and line for the last
# 14 days; bundle and level come from the left join onto `linien`.
sql_ez_quote_betreiber = f"""
        select l.buendel, l.ebene,f.datum, f.vu, f.lineshort,f.lineid_short, count(f.hasRealtime) filter (f.hasRealtime = True) ez_true, count(f.*) count, 
        round(ez_true / count * 100, 1) anteil_ez
        from fahrten f
        left outer join linien l on f.lineid_short = l.dlid
        where f.datum >= '{letzte14tage}'              
        group by f.lineid_short, f.vu, f.datum, f.lineshort, f.lineid_short, l.buendel, l.ebene
        order by f.vu, f.lineid_short
        """
df_ez_quote_betreiber = con.sql(sql_ez_quote_betreiber).df()

# Lines without a match in `linien` get '-' as bundle/level
# (presumably so they are not dropped as missing index keys in the pivot).
for spalte in ('buendel', 'ebene'):
    df_ez_quote_betreiber[spalte] = df_ez_quote_betreiber[spalte].fillna('-')

# One row per bundle/level/operator/line, one column per day.
anteil_ez_pivot_betreiber = df_ez_quote_betreiber.pivot_table(
    index=['buendel', 'ebene', 'vu', 'lineshort'],
    columns='datum',
    values='anteil_ez',
).reset_index()
anteil_ez_pivot_betreiber

#### nach Linie (ohne Betreiber)

In [None]:
# Realtime share (percent, one decimal) per day and line for the last 14 days,
# without splitting by operator; bundle and level come from the left join onto `linien`.
sql_ez_quote_o_betreiber = f"""
        select l.buendel, l.ebene,f.datum, f.lineshort,f.lineid_short, count(f.hasRealtime) filter (f.hasRealtime = True) ez_true, count(f.*) count, 
        round(ez_true / count * 100, 1) anteil_ez
        from fahrten f        
        left outer join linien l on f.lineid_short = l.dlid      
        where f.datum >= '{letzte14tage}'        
        group by f.lineid_short, f.datum, f.lineshort, f.lineid_short, l.buendel, l.ebene
        order by f.lineid_short
        """
df_ez_quote_o_betreiber = con.sql(sql_ez_quote_o_betreiber).df()

# Lines without a match in `linien` get '-' as bundle/level
# (presumably so they are not dropped as missing index keys in the pivot).
for spalte in ('buendel', 'ebene'):
    df_ez_quote_o_betreiber[spalte] = df_ez_quote_o_betreiber[spalte].fillna('-')

# One row per bundle/level/line, one column per day.
anteil_ez_pivot_o_betreiber = df_ez_quote_o_betreiber.pivot_table(
    index=['buendel', 'ebene', 'lineshort'],
    columns='datum',
    values='anteil_ez',
).reset_index()
anteil_ez_pivot_o_betreiber

### Fahrten ohne Echtzeit Ebene 1/1+ und 2

In [54]:
# Trips of the last 14 days without realtime data on lines of level 1, 1+ and 2.
# The level filter includes '2' so that the result matches its own name, the
# section heading, the Excel sheet title and the cancellation query, which all
# cover level 2; previously only '1' and '1+' were selected.
# The filter on l.ebene also removes trips whose line has no match in `linien`.
df_fahrten_ohne_ez_ebenen_1_1p_2 = con.sql(f"""
        select f.datum, l.buendel, l.ebene, f.vu, f.fnr, f.lineshort,f.lineid_short, f.hasrealtime, f.journey_cancelled, f.reported_cancelled, f.ts_reported_cancelled
        from fahrten f
        left outer join linien l on f.lineid_short = l.dlid
        where l.ebene in ('1', '1+', '2') and f.hasrealtime = False and f.datum >= '{letzte14tage}'
        order by f.datum, f.lineid_short
        """).df()

In [55]:
# Trips of the last 14 days on lines of level 1, 1+ or 2 that are flagged as
# cancelled (journey_cancelled or reported_cancelled is True).
# The filter on l.ebene also removes trips whose line has no match in `linien`.
df_fahrten_ausfall_1_1p_2 = con.sql(f"""
        select f.datum, l.buendel, l.ebene, f.vu, f.fnr, f.lineshort,f.lineid_short, f.hasrealtime, f.journey_cancelled, f.reported_cancelled, f.ts_reported_cancelled
        
        from fahrten f
                                    
        left outer join linien l on f.lineid_short = l.dlid              
        where l.ebene in ('1', '1+', '2') and (journey_cancelled = True or f.reported_cancelled = True) and 
        f.datum >= '{letzte14tage}'                            
        order by f.datum, f.lineid_short
        """).df()

# Ausgabe xlsx EZ Statistiken

In [56]:
# Output workbook and sheet titles. The variable numbering skips sn05:
# sn06 and sn07 hold the sheets titled '05 ...' and '06 ...'.
xlsx = "/var/www/rt_archiv/anteil_echtzeit_linien_vbn.xlsx"
sn00 = '00 Hilfe'
sn01 = '01 pivot alle Linien betreiber'
sn02 = '02 pivot alle Linien'
sn03 = '03 fahrten ohne EZ 1 1+ 2'
sn04 = '04 fahrten ohne EZ 1 1+ 3 grup'
sn06 = '05 fahrten ausfall'
sn07 = '06 fahrten ausfall über HIM'


def _format_pivot_sheet(ws, pivot, freeze_at):
    """Apply the layout shared by the two pivot sheets.

    ws is the worksheet already filled from ``pivot``; freeze_at is the cell
    reference handed to ``freeze_panes``.
    """
    ws.freeze_panes = freeze_at
    ws.auto_filter.ref = 'A:H'
    # Date number format on every cell of the header row.
    for header_cell in ws["1:1"]:
        header_cell.number_format = 'YYYY-MM-DD'
    ws.column_dimensions['c'].width = 22
    for letter in ('D', 'E', 'F', 'G', 'H'):
        ws.column_dimensions[letter].width = 22
    # Columns from D up to four past the frame's column count get width 16.
    # This runs last, so it overrides the 22 set above wherever both apply.
    n_rows, n_cols = pivot.shape
    for column_cells in ws.iter_cols(min_col=4, max_col=n_cols + 4):
        # The cell at index n_rows is only used to read the column letter.
        letter = column_cells[n_rows].column_letter
        ws.column_dimensions[letter].width = 16


def _format_detail_sheet(ws):
    """Apply the layout shared by the trip list sheets (frozen header, filter, column A)."""
    ws.freeze_panes = 'a2'
    ws.auto_filter.ref = 'A:M'
    for first_col_cell in ws["A"]:
        first_col_cell.number_format = 'YYYY-MM-DD'
    ws.column_dimensions['A'].width = 18


with pd.ExcelWriter(xlsx, engine="openpyxl") as writer:
    wb = writer.book

    # Help sheet: creation time, reporting period and a table of contents.
    sheet = wb.create_sheet(sn00)
    help_cells = {
        'A1': f"Erstellt: {dt.datetime.now().strftime('%Y-%m-%d %H:%M')} Zeitraum: {letzte14tage} bis {gestern}",
        'A3': "Inhalt",
        'B4': f"Blatt {sn01}: Pivot Echtzeitquote inkl. Betreiberkennung",
        'B5': f"Blatt {sn02}: Pivot Echtzeitquote ohne Betreiberkennung",
        'B6': f"Blatt {sn03}: Fahrten ohne Echtzeit",
        'B7': f"Blatt {sn04}: Fahrten ohne Echtzeit mit Anzahl",
        'B8': f"Blatt {sn06}: Fahrten Ausfall",
        'B9': f"Blatt {sn07}: Fahrten Ausfall über HIM",
    }
    for cell_ref, text in help_cells.items():
        sheet[cell_ref] = text

    # Realtime share pivots: first with operator column, then without.
    for pivot, title, freeze_at in (
        (anteil_ez_pivot_betreiber, sn01, 'e2'),
        (anteil_ez_pivot_o_betreiber, sn02, 'd2'),
    ):
        pivot.to_excel(writer, sheet_name=title, index=False)
        _format_pivot_sheet(wb[title], pivot, freeze_at)

    # Trips without realtime data, one row per trip.
    df_fahrten_ohne_ez_ebenen_1_1p_2.to_excel(writer, sheet_name=sn03, index=False)
    _format_detail_sheet(wb[sn03])

    # Same trips counted per operator and trip number, largest counts first.
    fahrten_je_vu_fnr = (
        df_fahrten_ohne_ez_ebenen_1_1p_2[['vu', 'fnr']]
        .value_counts()
        .reset_index()
        .sort_values(['count', 'vu'], ascending=False)
    )
    fahrten_je_vu_fnr.to_excel(writer, sheet_name=sn04, index=False)
    ws_counts = wb[sn04]
    ws_counts.freeze_panes = 'a2'
    ws_counts.auto_filter.ref = 'A:H'
    ws_counts.column_dimensions['A'].width = 22

    # Cancelled trips, followed by the trips from df_fahrten_ausfall_him.
    for frame, title in (
        (df_fahrten_ausfall_1_1p_2, sn06),
        (df_fahrten_ausfall_him, sn07),
    ):
        frame.to_excel(writer, sheet_name=title, index=False)
        _format_detail_sheet(wb[title])

In [None]:
# Per-operator (vu) statistics over trips dated within the last 3 days:
# total trips plus the number of trips for which each flag is true
# (hasRealtime, realtimeHasEverBeenReported, either of the two, journey_cancelled,
# reported_cancelled). `group by all` groups by the only non-aggregated column, vu.
df_stat_rt_canc = con.sql("""
        select 
            vu, 
            count(*) as anzahl, 
            count(*) filter (hasRealtime) as hasRealtime, 
            count(*) filter (realtimeHasEverBeenReported) as realtimeHasEverBeenReported,
            count(*) filter (realtimehaseverbeenreported or hasrealtime) as rt_combined,
            count(*) filter (journey_cancelled) as journey_cancelled,
            count(*) filter (reported_cancelled) as reported_cancelled
        from fahrten
        where datum >= (current_date - interval 3 days)
        group by all
        order by vu""").df()
# Show the statistics as the cell output.
df_stat_rt_canc

In [58]:
# Export the per-operator realtime/cancellation statistics to an Excel sheet
# with a filter, a frozen header row and a subtotal under every count column.
output_file = 'reports/df_stat_rt_canc.xlsx'
sheet_name = 'Stat RT Canc'
df_stat_rt_canc = df_stat_rt_canc.sort_values(by='vu')
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    df_stat_rt_canc.to_excel(writer, index=False, sheet_name=sheet_name)
    worksheet = writer.book[sheet_name]
    worksheet.auto_filter.ref = worksheet.dimensions  # filter spans exactly the written data
    worksheet.column_dimensions['A'].width = 30  # operator names are long
    worksheet.freeze_panes = 'A2'  # keep the header row visible

    # Do not call this variable `len`: that would shadow the builtin for
    # every cell executed afterwards in the same notebook session.
    n_rows = df_stat_rt_canc.shape[0]
    last_data_row = n_rows + 1  # row 1 is the header
    total_row = n_rows + 3      # leave one empty row below the data

    # One SUBTOTAL(9, ...) (sum of the visible rows, so it follows the filter)
    # for every column to the right of 'vu'. The columns are taken from the
    # header row instead of a fixed B..F list, so the last count column is
    # no longer left without a total.
    for header_cell in worksheet[1][1:]:
        col = header_cell.column_letter
        worksheet[f'{col}{total_row}'] = f'=subtotal(9,{col}2:{col}{last_data_row})'
