In [17]:
import os
import time
from datetime import datetime as dt
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import pymysql
import seaborn as sns
import sqlalchemy
from dotenv import load_dotenv
from matplotlib import pyplot as plt
from sqlalchemy import create_engine

import prediction

%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


#### functions we cannot yet put in a module (they have to be copied into every notebook)

In [18]:
def current_milli_time():
    """Return the current Unix time as a whole number of milliseconds (rounded)."""
    seconds_since_epoch = time.time()
    return round(1000 * seconds_since_epoch)

In [19]:
def create_collection_point_dataframes(data, unique_names=True):
    """Split ``data`` into one dataframe per collection point and publish each globally.

    For every distinct value of the ``übergabestellennummer`` (collection point
    number) column, taken in sorted order, the matching rows are stored in this
    module's global namespace under a generated name. Use the returned names to
    fetch a frame again, e.g. ``df = globals()[collection_point_df_name_list[x]]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split; must contain an ``übergabestellennummer`` column.
    unique_names : bool, default True
        If True, every name is prefixed with the current time in milliseconds
        (``"<millis>_orders_collection_point_<id>"``). If False, the stable name
        ``"orders_collection_point_<id>"`` is used, so a later call overwrites
        the frames of an earlier one.

    Returns
    -------
    list of str
        Names of the global variables that were created, in sorted
        collection-point order.
    """
    cp_column = data["übergabestellennummer"]
    collection_points = sorted(cp_column.unique())
    # create a list for the variable names
    collection_point_df_name_list = []

    for cp in collection_points:
        # str() works for numpy scalars and plain Python values alike, whereas
        # ``cp.astype(str)`` exists only on numpy scalars and raised
        # AttributeError for object/string id columns.
        dataframe_name = "orders_collection_point_" + str(cp)
        if unique_names:
            dataframe_name = str(current_milli_time()) + "_" + dataframe_name
        collection_point_df_name_list.append(dataframe_name)
        # The generated name may start with a digit, so the frame is reachable
        # only through globals()[name], not as a normal identifier.
        globals()[dataframe_name] = data[cp_column == cp]
    return collection_point_df_name_list

In [20]:
def create_db_table(cp_list, table, debug=False, debug_sample_size=1):
    """Build the status table with one row per collection point.

    Each name in ``cp_list`` is looked up in the global namespace, filtered with
    ``prediction.drop_single_pick_ups_and_single_initial_deliveries`` and, if any
    rows remain, passed to
    ``prediction.predict_capacity_of_collection_point_full_date``. Every result
    becomes one row that is added after the rows already in ``table``.

    Parameters
    ----------
    cp_list : list of str
        Names of global dataframes, e.g. the list returned by
        ``create_collection_point_dataframes``.
    table : pandas.DataFrame
        Existing rows to extend. It is not modified; a new frame is returned
        (``table`` itself is returned when no row was produced).
    debug : bool, default False
        Print diagnostics and stop once ``debug_sample_size`` collection points
        have produced a row. The flag is also forwarded to the prediction call.
    debug_sample_size : int, default 1
        Number of produced rows after which a debug run stops.

    Returns
    -------
    pandas.DataFrame
        ``table`` followed by the new rows, with a fresh 0..n-1 index.
    """
    # Collect the rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    stats_rows = []
    for df_name in cp_list:
        if debug and len(stats_rows) >= debug_sample_size:
            break
        df = globals()[df_name]
        if debug:
            print(f"df_name: {df_name} has {df.shape[0]} rows")
        df = prediction.drop_single_pick_ups_and_single_initial_deliveries(df)
        if df.shape[0] > 0:
            if debug:
                print(f"df_name: {df_name} has {df.shape[0]} rows")
            # NOTE(review): stats_dict is assumed to be a dict-like mapping of
            # column name to value (one table row) -- confirm in prediction.
            stats_dict = prediction.predict_capacity_of_collection_point_full_date(df, debug=debug)
            if debug:
                print(stats_dict)
            stats_rows.append(stats_dict)

    if not stats_rows:
        return table
    new_rows = pd.DataFrame(stats_rows)
    if table.shape == (0, 0):
        # Nothing to prepend; also avoids concatenating with an empty frame.
        return new_rows
    return pd.concat([table, new_rows], ignore_index=True)

#### notebook code

In [None]:
# Load the cleaned order data from the project's data folder.
# NOTE(review): unpickling executes arbitrary code -- only load pickles produced by this project.
orders = pd.read_pickle("../data/battery_cleaned_with_geo.pkl")
# NOTE(review): the prediction.* helpers live outside this notebook; what exactly
# each one filters is not visible here -- check the prediction module.
orders = prediction.filter_for_report(orders)
# Returns a pair of frames; presumably completed vs. open orders -- TODO confirm.
orders_comp, orders_open = prediction.filter_dataframe_for_prediction(orders)
orders_comp = prediction.remove_orders_with_unknown_weights(orders_comp)
# Ascending sort by group number, collection point number, then pick-up date.
orders_comp = orders_comp.sort_values(by=['konzernnummer','übergabestellennummer', 'abholdatum'], ascending=True)
# unique_names=False gives stable names ("orders_collection_point_<id>") so single
# frames can be looked up by collection point id in later cells.
cp_list = create_collection_point_dataframes(orders_comp, unique_names=False)

# redacted

In [None]:
# debug single collection point
# The id is hard-coded; its frame must have been created earlier with
# create_collection_point_dataframes(..., unique_names=False).
cp = 109991
df = globals()["orders_collection_point_" + str(cp)]

# The return value is discarded here (not the last expression of the cell);
# presumably debug=True prints the intermediate steps -- confirm in prediction.
prediction.predict_capacity_of_collection_point_full_date(df, debug=True)
# Displayed as the cell output: a column subset of this collection point's orders.
df[["übergabestellennummer", "konzernnummer", "vertragsnummer", "auftragsnummer", "auftragsstatus", "auftragstyp", "abholdatum", "nettogewicht_in_kg", "angemeldete_containeranzahl", "angeforderter_behältertyp", "gelieferter_behältertyp", "gelieferte_behälteranzahl"]]

# redacted

In [23]:
# Start from an empty frame and let create_db_table add one row per collection
# point in cp_list (full run: debug=False, so no sample limit applies).
table = pd.DataFrame()
db_data = create_db_table(cp_list, table, debug=False)

In [None]:
# Display the resulting status table (bare expression -> rich notebook output).
db_data
# redacted

In [None]:
# Column dtypes and non-null counts of the status table.
db_data.info()
# redacted

In [None]:
# check for inf values
# Restrict to the listed signed-integer and float dtypes, because np.isinf in the
# next cell only works on numeric data. Unsigned and other numeric dtypes are not
# in this list and are therefore not checked.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

newdf = db_data.select_dtypes(include=numerics)
newdf
# redacted

In [27]:
# check for inf values
# Compute the infinity mask once; it is reused for the column and the row report.
inf_mask = np.isinf(newdf)

# printing column name where infinity is present
print()
print("printing column name where infinity is present")
col_name = newdf.columns.to_series()[inf_mask.any()]
print(col_name)

# printing row index with infinity
print()
print("printing row index with infinity ")

# axis has to be passed by keyword: DataFrame.any() no longer accepts it
# positionally (keyword-only since pandas 2.0).
r = newdf.index[inf_mask.any(axis=1)]
print(r)


printing column name where infinity is present
Series([], dtype: object)

printing row index with infinity 
Int64Index([], dtype='int64')


In [28]:
# show rows with inf values
# r holds index *labels* taken from newdf, which shares db_data's index
# (select_dtypes keeps the rows), so the rows are selected by label with .loc.
# Positional .iloc only gave the same rows while the index was the default 0..n-1.
db_data_checked = db_data[["übergabestellennummer", "tägl_smenge_kg", "erreicht_kg", "erreicht_prozent"]].loc[r]
db_data_checked.shape[0]

0

### Store results into DB

In [29]:
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False
load_dotenv()
DATABASE = os.getenv('DATABASE')
USER_DB = os.getenv('USER_DB')
PASSWORD = os.getenv('PASSWORD')
HOST = os.getenv('HOST')
PORT = os.getenv('PORT')

# connect to existing db
con_string = "mysql+pymysql://" + USER_DB + ":" + PASSWORD + "@" + HOST + "/" + DATABASE + "?charset=utf8mb4"

In [30]:
# Create the SQLAlchemy engine from the connection string built above.
db = create_engine(con_string)
# test connection
# Listing the databases is a cheap round trip that fails if the connection does not work.
pd.read_sql("show databases", db)

Unnamed: 0,Database
0,battery
1,information_schema
2,mysql
3,performance_schema
4,sys


In [31]:
TABLE_NAME = "STATUS_COLLECTION_POINT_2"
# Safety switch: nothing is written to the database unless this is set to True.
OVERWRITE_TABLE = False
if OVERWRITE_TABLE:
    # if_exists='replace' drops an existing table and recreates it from db_data.
    # The previous pd.read_sql("drop table ...") is a statement without a result
    # set (read_sql expects rows) and also failed when the table did not exist yet.
    db_data.to_sql(TABLE_NAME, db, if_exists='replace', index=False)


In [33]:
# Export the status table to an Excel file with a single sheet named 'status'.
# index is left at its default, so the row index is written as the first column.
db_data.to_excel("../data/status_sammelstellen.xlsx", sheet_name='status')  