In [13]:
import pandas as pd
import warnings
from geocode import concat_full_address, fetch_geodata

Load the data from `battery.csv` into a pandas data frame. 

In [14]:
# Read the raw data set: the original Excel sheet provided by grs, exported as a
# semicolon-delimited CSV file.
raw_df = pd.read_csv('../data/battery.csv', delimiter=';')

In [None]:
# Keep `raw_df` untouched and do all further processing on an independent copy.
df = raw_df.copy(deep=True)
df.head(5) # cell outputs cleared

In [None]:
# Dimensions of the working copy as a (rows, columns) tuple.
df.shape # shape redacted

In [None]:
# Show the dtype and non-null count of each column. `DataFrame.info()` writes its
# report directly to stdout and returns None, so it must not be wrapped in
# print() (that would add a stray "None" line after the report).
df.info() # description redacted

## Fix column name formats

We want to replace the spaces in the column names with underscores and convert the names to lowercase.

In [None]:
# Normalise the column labels: every space becomes an underscore.
underscored_labels = df.columns.str.replace(' ', '_')
df.columns = underscored_labels
df.columns # description redacted

In [None]:
# Convert every column label to lower case.
lowercased_labels = df.columns.str.lower()
df.columns = lowercased_labels
df.columns # description redacted

## Imputing Latitude and Longitude
Test run on the first 10 rows :)

In [20]:
# Silence warnings for the rest of the kernel session (this filter is global).
warnings.simplefilter('ignore')

# Work on an independent copy of the first 10 rows. A bare `df.iloc[:10]`
# slice stays linked to `df`, so writing columns into it is ambiguous and
# triggers pandas' SettingWithCopyWarning.
df_test = df.iloc[:10].copy()

# NOTE(review): both helpers appear to modify the frame in place (their
# return values are not assigned) — confirm against the `geocode` module.
concat_full_address(df_test, address_col='volle_Addresse')
fetch_geodata(df_test, address_col='volle_Addresse')

In [None]:
# Inspect the first three rows of the test slice after the geocoding calls.
df_test.head(3) # output redacted

In [22]:
# Count missing values per address component in the test slice.
address_parts = ['stasse', 'plz', 'ort', 'bundesland']
df_test[address_parts].isna().sum()

stasse        0
plz           0
ort           0
bundesland    0
dtype: int64

Fetch lat/long geocodes for all addresses. This takes 2-3 hours, so it only runs when `FETCH_ALL` is set to `True`.

In [23]:
# Switch for the full geocoding run over every row of `df`; leave it False to
# skip the run when re-executing the notebook.
FETCH_ALL = False

if FETCH_ALL:
    # Same two steps as in the test run, applied to the complete frame.
    for geocode_step in (concat_full_address, fetch_geodata):
        geocode_step(df, address_col='volle_Addresse')

In [24]:
# Count missing values per address component across the full data set.
address_parts = ['stasse', 'plz', 'ort', 'bundesland']
df[address_parts].isna().sum()

stasse        3
plz           0
ort           0
bundesland    0
dtype: int64

In [25]:
# Persist the geocoded frame only when the full fetch was enabled. Pickle is used
# because it keeps the complete DataFrame state, including the column dtypes.
if FETCH_ALL:
    df.to_pickle(path="../data/battery_with_geo.pkl")