# Mein Test-Notebook

In [17]:
import os

# TensorFlow is the only backend that supports string inputs.
os.environ["KERAS_BACKEND"] = "tensorflow"

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import altair as alt
from IPython import display
import fiona
import geopandas as gpd
from shapely.geometry import Point
import pyogrio
from geopy.distance import distance
from sklearn.inspection import DecisionBoundaryDisplay
pd.options.display.max_columns = 50
pd.options.display.max_rows = 200
plt.rcParams['figure.figsize'] = [16, 9]
%matplotlib inline
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')
import logging
logging.getLogger('matplotlib.font_manager').setLevel(level=logging.CRITICAL)

In [2]:
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [None]:
dataframe = pd.read_csv('./atp_player_comp_list.csv', low_memory=False)
# Wir entfernen einige Features um das Problem übersichtlicher zu machen
#dataframe = dataframe.drop(['tourney_date', 'id'], axis=1)
# Wir entfernen diejenigen Passagiere, für die das Alter nicht angegeben ist
#dataframe = dataframe.dropna()
#dataframe = dataframe[dataframe["Age"].isna() == False]
# Features inkl. one-hot encoding der kategorischen
#dataframe = pd.get_dummies(dataframe)
val_dataframe = dataframe.sample(frac=0.2, random_state=1337)
train_dataframe = dataframe.drop(val_dataframe.index)

print(
    "Using %d samples for training and %d for validation"
    % (len(train_dataframe), len(val_dataframe))
)

# Name of the label column.
label = ''

classes = dataframe[label].unique().tolist()
print(f"Label classes: {classes}")

In [3]:
df_stores = pd.read_csv('./data/dmml1_stores.csv')
df_train = pd.read_csv('./data/dmml1_train.csv')
df_test = pd.read_csv('./data/dmml1_test.csv')
df_train['Date'] = pd.to_datetime(df_train['Date'], format='%Y-%m-%d')
df_test['Date'] = pd.to_datetime(df_test['Date'], format='%Y-%m-%d')

In [14]:
df_train

Unnamed: 0,Store ID,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,4,2015-04-30,6884,716,1,1,0,0
1,1,3,2015-04-29,6764,756,1,1,0,0
2,1,2,2015-04-28,6861,678,1,1,0,0
3,1,1,2015-04-27,6523,647,1,1,0,0
4,1,7,2015-04-26,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
246898,300,6,2013-01-05,5194,569,1,0,0,0
246899,300,5,2013-01-04,5524,646,1,0,0,1
246900,300,4,2013-01-03,5563,718,1,0,0,1
246901,300,3,2013-01-02,6218,753,1,0,0,1


In [14]:
df_train.dtypes

Store ID                  int64
DayOfWeek                 int64
Date             datetime64[ns]
Sales                     int64
Customers                 int64
Open                      int64
Promo                     int64
StateHoliday             object
SchoolHoliday             int64
dtype: object

In [141]:
df_train = pd.read_csv('./data/dmml1_train.csv')
df_train['Date'] = pd.to_datetime(df_train['Date'], format='%Y-%m-%d')
df_train['SalesPerCustomer'] = df_train['Sales'] / df_train['Customers']
#df_train = df_train[df_train['DayOfWeek'] != 7]
df_train = df_train[df_train['StateHoliday'] == 'a']
#df_test = df_test[df_test['DayOfWeek'] != 7]
#df_test = df_test[df_test['StateHoliday'] == '0']
#df_test = df_test[df_test['Open'] == 0]
df_train_temp = df_train
#df_train_temp = df_train_temp[df_train_temp['Sales'] == 0]
df_train_temp = df_train_temp[df_train_temp['Open'] == 1]

In [142]:
alt.Chart(df_train_temp).mark_circle().encode(
    x='Date:T',
    y="Store ID:Q",
    tooltip=['Store ID', 'Date', 'DayOfWeek', 'StateHoliday']

).properties(
    width=1200,
    height=600
)

In [132]:
df_train_temp.groupby('Store ID').count()

Unnamed: 0_level_0,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,SalesPerCustomer
Store ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
19,6,6,6,6,6,6,6,6,6
30,3,3,3,3,3,3,3,3,3
73,3,3,3,3,3,3,3,3,3
168,6,6,6,6,6,6,6,6,6
218,3,3,3,3,3,3,3,3,3
238,3,3,3,3,3,3,3,3,3
297,6,6,6,6,6,6,6,6,6


In [135]:
df_train_temp.groupby('Store ID').count()

Unnamed: 0_level_0,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,SalesPerCustomer
Store ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
19,4,4,4,4,4,4,4,4,4
73,1,1,1,1,1,1,1,1,1
168,4,4,4,4,4,4,4,4,4
297,4,4,4,4,4,4,4,4,4


In [138]:
df_train_temp.groupby('Store ID').count()

Unnamed: 0_level_0,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,SalesPerCustomer
Store ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,2,2,2,2,2,2,2,2,2
14,2,2,2,2,2,2,2,2,2
19,11,11,11,11,11,11,11,11,11
24,2,2,2,2,2,2,2,2,2
29,2,2,2,2,2,2,2,2,2
30,6,6,6,6,6,6,6,6,6
37,1,1,1,1,1,1,1,1,1
38,1,1,1,1,1,1,1,1,1
44,2,2,2,2,2,2,2,2,2
66,1,1,1,1,1,1,1,1,1


In [105]:
df_train_temp

Unnamed: 0,Store ID,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,SalesPerCustomer
75178,92,1,2014-09-22,0,0,1,0,0,0,
80484,98,3,2013-08-28,0,0,1,1,0,1,
121151,147,1,2013-04-29,0,0,1,1,0,0,
135007,165,5,2014-04-04,0,0,1,1,0,0,
159363,194,4,2013-01-17,0,0,1,0,0,0,
182357,223,3,2014-07-23,0,0,1,0,0,1,
194606,237,4,2013-08-08,0,0,1,0,0,1,
209743,255,6,2014-01-18,0,0,1,0,0,0,
209744,255,5,2014-01-17,0,0,1,0,0,0,
226382,275,3,2013-07-10,0,0,1,0,0,0,


In [145]:
clf = Pipeline(
    steps=[("scaler", StandardScaler()), ("knn", KNeighborsClassifier(n_neighbors=16))]
)

In [None]:
_, axs = plt.subplots(ncols=2, figsize=(12, 5))

for ax, weights in zip(axs, ("uniform", "distance")):
    clf.set_params(knn__weights=weights).fit(X_train, y_train)
    disp = DecisionBoundaryDisplay.from_estimator(
        clf,
        X_test,
        response_method="predict",
        plot_method="pcolormesh",
        xlabel=iris.feature_names[0],
        ylabel=iris.feature_names[1],
        shading="auto",
        alpha=0.5,
        ax=ax,
    )
    scatter = disp.ax_.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, edgecolors="k")
    disp.ax_.legend(
        scatter.legend_elements()[0],
        iris.target_names,
        loc="lower left",
        title="Classes",
    )
    _ = disp.ax_.set_title(
        f"3-Class classification\n(k={clf[-1].n_neighbors}, weights={weights!r})"
    )

plt.show()


In [54]:
filename = "./data/dm_filialen.geojson"
file = open(filename)
dm_filialen = gpd.read_file(file)
dm_filialen = dm_filialen[['addr:city', 'brand', 'geometry']]
dm_filialen['geometry'] = dm_filialen['geometry'].to_crs("EPSG:32632")
filename = "./data/rossmann_filialen.geojson"
file = open(filename)
rossmann_filialen = gpd.read_file(file)
rossmann_filialen = rossmann_filialen[['addr:city', 'brand', 'geometry']]
rossmann_filialen['geometry'] = rossmann_filialen['geometry'].to_crs("EPSG:32632")
drogerie_filialen = pd.concat([rossmann_filialen, dm_filialen])

In [55]:
dm_filialen['geometry'] = dm_filialen.geometry.representative_point()
rossmann_filialen['geometry'] = rossmann_filialen.geometry.representative_point()
drogerie_filialen['geometry'] = drogerie_filialen.geometry.representative_point()

In [68]:
results = rossmann_filialen.geometry.distance(dm_filialen['geometry'][1])

In [75]:
# Liste für Treffer mit mindestens 50 km Entfernung
result_list = []

# Schleife, um für jede Rossmann-Filiale den am nächsten gelegenen Konkurrenten zu finden
for rossmann_index, rossmann_row in rossmann_filialen.iterrows():
    result_list_temp=[]
    rossmann_name = rossmann_row["addr:city"]

    #for dm_index, dm_row in dm_filialen.iterrows():
        # Berechne die Entfernungen zu allen Konkurrenten
    distances = dm_filialen.geometry.distance(rossmann_row["geometry"])
        # Finde den am nächsten gelegenen Konkurrenten
    nearest_competitor_index = distances.idxmin()
    nearest_competitor_distance = distances.loc[nearest_competitor_index]

    # Füge Treffer zur Ergebnisliste hinzu, wenn die Entfernung mindestens 50 km beträgt
    #if nearest_competitor_distance >= 50:
    result_list.append((rossmann_name, nearest_competitor_index, nearest_competitor_distance))

# Gib die Treffer aus
for result in result_list:
    rossmann_name, competitor_index, distance, = result
    competitor_name = dm_filialen.loc[competitor_index, "brand"]
    competitor_ort = dm_filialen.loc[competitor_index, "addr:city"]
    print(f"{rossmann_name} ist mindestens 50 km entfernt von {competitor_name} in {competitor_ort} (Entfernung: {distance} km)")

Blankenfelde-Mahlow ist mindestens 50 km entfernt von dm in Berlin (Entfernung: 8093.282181238841 km)
Berlin ist mindestens 50 km entfernt von dm in Berlin (Entfernung: 3758.3740709217664 km)
Kiel ist mindestens 50 km entfernt von dm in Schwentinental (Entfernung: 2398.4653191184143 km)
Oberursel (Taunus) ist mindestens 50 km entfernt von dm in Oberursel (Taunus) (Entfernung: 1023.1938131989363 km)
Bodenheim ist mindestens 50 km entfernt von dm in None (Entfernung: 6214.1935139813695 km)
Erbach ist mindestens 50 km entfernt von dm in Erbach (Entfernung: 1337.367034014194 km)
Flensburg ist mindestens 50 km entfernt von dm in Flensburg (Entfernung: 1862.659436446645 km)
Nabburg ist mindestens 50 km entfernt von dm in Wernberg-Köblitz (Entfernung: 9680.310534452063 km)
Bad Frankenhausen/Kyffhäuser ist mindestens 50 km entfernt von dm in Sangerhausen (Entfernung: 17468.5234591758 km)
Leck ist mindestens 50 km entfernt von dm in None (Entfernung: 12064.062940556207 km)
Parsberg ist mindeste

In [74]:
results.idxmin()

1759

In [83]:
dm_filialen[:1761]

Unnamed: 0,addr:city,brand,geometry
0,Florstadt,dm,POINT (495566.400 5575487.682)
1,Rheinberg,dm,POINT (333341.683 5713998.922)
2,Mainburg,dm,POINT (705243.931 5392055.704)
3,Vaihingen an der Enz,dm,POINT (497579.896 5419304.244)
4,Hamburg,dm,POINT (569668.473 5937843.336)
...,...,...,...
1756,,dm,POINT (439011.928 5466978.789)
1757,Leipzig,dm,POINT (729145.090 5690435.083)
1758,München,dm,POINT (689565.611 5334610.785)
1759,,dm,POINT (573572.792 6019465.262)


In [52]:
result = pd.DataFrame(rossmann_filialen.sindex.nearest(dm_filialen.geometry))

In [53]:
result.head

<bound method NDFrame.head of    0     1     2     3     4     5     6     7     8     9     10    11    \
0     0     1     2     3     4     5     6     7     8     9    10    11   
1  2144  1759  1767  1707  1977   419  1927  1644   114   636   541  1397   

   12    13    14    15    16    17    18    19    20    21    22    23    \
0    12    13    14    15    16    17    18    19    20    21    22    23   
1  1397   692   776   353  1982   362   765  1399  1409  1363   480   934   

   24    ...  2003  2004  2005  2006  2007  2008  2009  2010  2011  2012  \
0    24  ...  2003  2004  2005  2006  2007  2008  2009  2010  2011  2012   
1  1637  ...  1669   638   234   974  1389  1112   785  1721  1880  1154   

   2013  2014  2015  2016  2017  2018  2019  2020  2021  2022  2023  2024  \
0  2013  2014  2015  2016  2017  2018  2019  2020  2021  2022  2023  2024   
1  1282   264  1785   506  1762   248   139   525  1527  1828  1081  1362   

   2025  2026  2027  
0  2025  2026  2027  
1