## Import Modules

In [1]:
# Standard library
import os
import random
# `imp` was deprecated in Python 3.4 and removed in Python 3.12;
# `importlib.reload` is the supported replacement and binds the same name.
from importlib import reload

# Data manipulation
import numpy as np
import pandas as pd

# Geolocation
import geonamescache

# Custom package for data preprocessing
import preprocessing as pp

# %load_ext autoreload
# %autoreload 2

# Notebook-wide pandas display settings:
# show up to 150 columns and 10000 rows, and render floats with two decimals.
pd.options.display.max_columns = 150
pd.options.display.max_rows = 10000
pd.options.display.float_format = lambda value: '%.2f' % value

# Display the result of every top-level expression in a cell, not only the
# last one (IPython's default for `ast_node_interactivity` is "last_expr").
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### File Location

In [2]:
# Data paths
import filepaths

## Customer Data

In [3]:
# Load the raw customer table; the file is a semicolon-delimited CSV.
df_customers = pd.read_csv(
    filepaths.raw_customers_data,
    sep=';',
)

## Employee Data

In [4]:
# Column names handed to the preprocessing helper as its date list.
date_list = ['HireDate', 'BirthDate']

# Read the raw, semicolon-delimited employee CSV, then replace the frame with
# the output of the project's employee preprocessing helper.
df_employees = pd.read_csv(
    filepaths.raw_employees_data,
    sep=';',
)
df_employees = pp.preprocess_employee_data(
    df_employees,
    date_list,
    # NOTE(review): presumably the location the helper saves its result to;
    # confirm in `preprocessing`.
    filepaths.interim_employees_data,
)

## Country Data

In [5]:
# Arguments for the country preprocessing helper: the columns to treat and
# the replacement value passed along with them.
feature_list = ['CountryCode']
nan_replacements = 'AV'

# Read the raw, semicolon-delimited country CSV, then replace the frame with
# the output of the project's country preprocessing helper.
df_countries = pd.read_csv(
    filepaths.raw_countries_data,
    sep=';',
)
df_countries = pp.preprocess_country_data(
    df_countries,
    feature_list,
    nan_replacements,
    # NOTE(review): presumably the location the helper saves its result to;
    # confirm in `preprocessing`.
    filepaths.interim_countries_data,
)

## City Data

In [6]:
# Fixed arguments for the city preprocessing helper.
col_name = 'State'
feature_list = ['State']
nan_replacements = '0'

# Inputs: the raw city table (semicolon-delimited) and the external US
# regions table (comma-delimited).
df_us_regions = pd.read_csv(
    filepaths.external_us_regions_data,
    sep=',',
)
df_cities = pd.read_csv(
    filepaths.raw_cities_data,
    sep=';',
)

# Plain Python list of the values in the `CityName` column.
city_list = df_cities['CityName'].to_list()

# Replace the frame with the output of the project's city preprocessing helper.
df_cities = pp.preprocess_city_data(
    df_cities,
    df_us_regions,
    city_list,
    col_name,
    feature_list,
    nan_replacements,
    # NOTE(review): presumably the location the helper saves its result to;
    # confirm in `preprocessing`.
    filepaths.interim_cities_data,
)

## Product Data

In [7]:
# Arguments for the product preprocessing helper.
date_list = ['ModifyDate']
feature_list = ['Resistant', 'IsAllergic']
nan_replacements = '_blank'

# This cell starts from the v1 interim product file (semicolon-delimited),
# not from a raw file, and passes the v2 interim path to the helper.
df_products = pd.read_csv(
    filepaths.interim_products_data_v1,
    sep=';',
)
df_products = pp.preprocess_product_data(
    df_products,
    date_list,
    feature_list,
    nan_replacements,
    # NOTE(review): presumably the location the helper saves its result to;
    # confirm in `preprocessing`.
    filepaths.interim_products_data_v2,
)

## Sales Data

In [8]:
# Column names handed to the preprocessing helper as its date list.
date_list = ['SalesDate']

# Read the raw, semicolon-delimited sales CSV, then replace the frame with
# the output of the project's sales preprocessing helper.
df_sales = pd.read_csv(
    filepaths.raw_sales_data,
    sep=';',
)
df_sales = pp.preprocess_sales_data(
    df_sales,
    date_list,
    # NOTE(review): presumably the location the helper saves its result to;
    # confirm in `preprocessing`.
    filepaths.interim_sales_data,
)