In [None]:
# !which python
# !pip install pandas
# !pip install datetime
# !pip install scipy
# !pip install NumPy
# !pip install Matplotlib
# !pip install scikit-learn

## Import and loading in data

In [247]:
import json
import time
import copy
import math
import numpy
import pandas as pd
from datetime import datetime

# importing the csv data about the store sales, 906 stores with 11936 timestamps.
raw_csv = pd.read_csv('sales_granular.csv', index_col=0)

# importing the surroundings information about some 546 stores, each with 89 possible types of surroundings.
raw = json.load(open('Surroundings.json'))

# manual inspection of JSON file by writing a single item in the list to disk.
with open('extract.json', 'w') as outfile:
    json.dump(raw[0], outfile)


## Helper functions

In [248]:
def group_by_month(timestamp_str):
    
    space_idx = timestamp_str.find(' ')
    parse_date_string = timestamp_str[:space_idx]
    
    date_datetime = datetime.strptime(parse_date_string, '%m/%d/%y')

    if len(str(date_datetime.month)) == 1:
        mm = '0' + str(date_datetime.month)
    else:
        mm = str(date_datetime.month)
        
    yyyy = str(date_datetime.year)
    
    month_key = '{0}-{1}'.format(yyyy, mm)
    return month_key

In [511]:
def get_series_stats(series, store_id):
    
    sales_points_sum = 0
    sales_points_valid = []

    for _, val in enumerate(series):
        if math.isnan(val):
            continue
        else:
            sales_points_sum += val
            sales_points_valid.append(val)

    series_mean = float(sales_points_sum) / max(len(sales_points_valid), 1)
    series_stdev = max(round(math.sqrt(float(reduce(lambda x, y: x + y, map(lambda x: (x - series_mean) ** 2, sales_points_valid))) / len(sales_points_valid)), 2), 1)
    
    return dict({
        'mean': round(series_mean, 2),
        'months_of_data_count': len(sales_points_valid),
        'store_id': store_id,
        'total_products_sold': sales_points_sum,
        'stdev': series_stdev
        })
    

In [527]:
from sklearn.preprocessing import normalize
    
def normalize_df(df):

    normalized_df = copy.deepcopy(df)
    norm1 = normalize(normalized_df, axis=0, norm='max')
    
    return(norm1)import numpy as np

# borrowed from https://plot.ly/python/polygon-area/

def PolygonSort(corners):
    n = len(corners)
    cx = float(sum(x for x, y in corners)) / n
    cy = float(sum(y for x, y in corners)) / n
    cornersWithAngles = []
    for x, y in corners:
        an = (np.arctan2(y - cy, x - cx) + 2.0 * np.pi) % (2.0 * np.pi)
        cornersWithAngles.append((x, y, an))
    cornersWithAngles.sort(key = lambda tup: tup[2])
    return map(lambda (x, y, an): (x, y), cornersWithAngles)

def PolygonArea(corners):
    n = len(corners)
    area = 0.0
    for i in range(n):
        j = (i + 1) % n
        area += corners[i][0] * corners[j][1]
        area -= corners[j][0] * corners[i][1]
    area = abs(area) / 2.0
    return area

# corners = [(13.54, 11.66), (9.05, 14.11), (4.65, 12.44), (7.17, 7.91)]
corners = [(2.2, 5.4), (5.4, 4.4), (4.7, 1.7), (1.4, 2.5)]
corners_sorted = PolygonSort(corners)
area = PolygonArea(corners_sorted)

area

## Data Transformation

In [514]:
# aggregated all the sales data per month based on the column names.
# sorted key list now contains indices per corresponsing month, used later for plotting and regressions.
atomic_timestamps = list(raw_csv)

month_dict = {}

for idx, col_timestamp in enumerate(atomic_timestamps):
    current_month = group_by_month(col_timestamp)
    
    if month_dict.has_key(current_month):
        month_dict[current_month].append(idx)
    else:
        month_dict[current_month] = []
        month_dict[current_month].append(idx)

sorted_key_list = sorted(month_dict.keys())

store_id_list = list(raw_csv.index)


# generate a dictionary of store_id with their appropriate series values (interpretable by month)
store_dict = {}

for i in range(0, len(raw_csv.index)):
    monthly_series = []
    for _, key in enumerate(sorted_key_list):
        monthly_series.append(raw_csv.iloc[i][month_dict[key][0]:month_dict[key][-1]].sum(skipna=True))
        
    store_dict[store_id_list[i]] = monthly_series


## Create table of relevant statistics in a time-series manner

In [525]:
stats_df = pd.DataFrame(columns= ['store_id', 'total_products_sold', 'mean', 'stdev', 'months_of_data_count'])

for key in store_dict.keys():
    row_dict = get_series_stats(store_dict[key], key) 
    row_df = pd.DataFrame.from_records(row_dict, index=[0])[['store_id', 'total_products_sold', 'mean', 'stdev', 'months_of_data_count']]
    stats_df = pd.concat([stats_df, row_df])

stats_df.reset_index(drop=True, inplace=True)

result = stats_df.sort_values(['months_of_data_count', 'total_products_sold', 'stdev'], ascending=[0, 0, 1])
# result = stats_df.sort_values(['months_of_data_count', 'mean', 'stdev'], ascending=[0, 0, 1])


# preparing the surroundings data for analysis
amenities_array = raw[0]['surroundings'].keys()

column_names_amenities = copy.deepcopy(amenities_array)
column_names_amenities = ['store_id'] + column_names_amenities

full_feature_amenities_df = pd.DataFrame(columns = column_names_amenities, index=[0])


for _, surroundings_obj in enumerate(raw):
    
    amenities_feature_dict = {}
    store_id = surroundings_obj['store_code']
    amenities_feature_dict['store_id'] = store_id
    
    for _, key in enumerate(amenities_array):
        amenities_feature_dict[key] = len(surroundings_obj['surroundings'][key])  
        
    feature_amenities_row = pd.DataFrame(data = amenities_feature_dict, columns = column_names_amenities, index=[0])
    
    full_feature_amenities_df = pd.concat([full_feature_amenities_df, feature_amenities_row])

full_feature_amenities_df = full_feature_amenities_df[1:]
full_feature_amenities_df.reset_index(drop=True, inplace=True)

# calculate the mean of months which contribute towards the total sales
# average is 9...
print(reduce(lambda x, y: x + y, result['months_of_data_count']) / len(result['months_of_data_count']))

9


## Preparing fitting and validation data sets

In [548]:
# select the store ids that are only in the surrounding dataset
ops_df = result.loc[result['store_id'].isin(full_feature_amenities_df['store_id'])]

# join on store_id key to append the total_products_sold
merged_df = pd.merge(full_feature_amenities_df, ops_df[['store_id', 'total_products_sold']], on='store_id')
# merged_df = pd.merge(full_feature_amenities_df, ops_df[['store_id', 'mean']], on='store_id')

# make sure no NA rows are present (same dataset with or without the drop, but it's good practice)
merged_df.dropna()

# sample random index numbers for splitting the dataset. Will go for 85% training and 15% testing.
msk = np.random.rand(len(merged_df)) <= 0.85

train_df = merged_df[msk]

test_df = merged_df[~msk]

In [551]:
training_fit_set_columns = [col for col in train_df.columns if col not in ['store_id', 'total_products_sold']]
training_fit_df = normalize_df(train_df[training_fit_set_columns])
training_target_df = train_df['total_products_sold']

test_fit_set_columns = [col for col in test_df.columns if col not in ['store_id', 'total_products_sold']]
test_fit_df = normalize_df(test_df[test_fit_set_columns])
test_target_df = test_df['total_products_sold']

from sklearn.neighbors import KNeighborsRegressor

neigh = KNeighborsRegressor(n_neighbors=5, weights='distance')
neigh.fit(training_fit_df, training_target_df) 
neigh.predict(test_fit_df)

# current score is relatively poor: -0.407. This is probably caused by the data about the stores. 
neigh.score(test_fit_df, test_target_df)

0.28003663695980519

In [447]:
# From here on we are trimming the dataset to create a more reasonable model.
trimmed_result_df = result.loc[result['months_of_data_count'] > 9]
len(trimmed_result_df)

270

## Impriving data for modeling

In [509]:
amenities_array = raw[0]['surroundings'].keys()

column_names_amenities = copy.deepcopy(amenities_array)
column_names_amenities = ['store_id'] + column_names_amenities

full_feature_amenities_df = pd.DataFrame(columns = column_names_amenities, index=[0])

for _, surroundings_obj in enumerate(raw):
    
    amenities_feature_dict = {}
    store_id = surroundings_obj['store_code']
    amenities_feature_dict['store_id'] = store_id
    
    for _, key in enumerate(amenities_array):
        amenities_feature_dict[key] = len(surroundings_obj['surroundings'][key])  
        
    feature_amenities_row = pd.DataFrame(data = amenities_feature_dict, columns = column_names_amenities, index=[0])
    
    full_feature_amenities_df = pd.concat([full_feature_amenities_df, feature_amenities_row])

full_feature_amenities_df = full_feature_amenities_df[1:]
full_feature_amenities_df.reset_index(drop=True, inplace=True)

# select the store ids that are only in the surrounding dataset
ops_df = trimmed_result_df.loc[trimmed_result_df['store_id'].isin(full_feature_amenities_df['store_id'])]

# join on store_id key to append the total_products_sold
# merged_df = pd.merge(full_feature_amenities_df, ops_df[['store_id', 'total_products_sold']], on='store_id')
merged_df = pd.merge(full_feature_amenities_df, ops_df[['store_id', 'mean']], on='store_id')

# make sure no NA rows are present (same dataset with or without the drop, but it's good practice)
merged_df.dropna()

normalized_df = copy.deepcopy(merged_df)

# sample random index numbers for splitting the dataset. Will go for 85% training and 15% testing.
msk = np.random.rand(len(merged_df)) <= 0.85

train_df = merged_df[msk]

test_df = merged_df[~msk]


## Fitting the model and checking results

In [510]:
training_fit_set_columns = [col for col in train_df.columns if col not in ['store_id', 'mean']]
training_fit_df = train_df[training_fit_set_columns]
training_target_df = train_df['mean']

test_fit_set_columns = [col for col in test_df.columns if col not in ['store_id', 'mean']]
test_fit_df = test_df[test_fit_set_columns]
test_target_df = test_df['mean']

from sklearn.neighbors import KNeighborsRegressor

neigh = KNeighborsRegressor(n_neighbors=5, weights='distance')
neigh.fit(training_fit_df, training_target_df) 
neigh.predict(test_fit_df)

# current score is very poor: +0.4507. This is probably caused by the data about the stores. 
neigh.score(test_fit_df, test_target_df)

-0.2824298250959405

In [450]:
## next steps:
# Inspect the current dataset to see if there are any missing values in the total Numbers <> CHECK!
# Create test and training set + validation <> CHECK!
# Normalize the values per column (variable) between (0; 1). <> CHECK!
# Scale the Total sales by number of months taken into account and observe the difference to the model <> trimmed data with insufficient number of months

# if time allows (it's before 10:30) then inspect the other feature vectors. Include other feature vectors & vars (see how) and include map if possible.
## talk about it in the presentation only..

## extra option: make regression based on differences to cosine similarity to a benchmark, count of months and map area.



      store_id subway_station department_store    embassy beauty_salon  \
0     0.044068              0                0          0    0.0305068   
1    0.0441732              0         0.078811          0    0.0305068   
2    0.0454792              0         0.157622  0.0601929     0.106774   
3    0.0457597              0         0.078811          0     0.122027   
4    0.0462287              0                0          0    0.0533869   
5    0.0476443       0.316228         0.078811          0    0.0152534   
6    0.0478941              0                0          0            0   
7    0.0483324              0         0.078811          0     0.175414   
8    0.0483324              0         0.078811          0     0.190668   
9    0.0506815              0         0.315244   0.180579     0.259308   
10   0.0507078       0.316228         0.315244   0.180579     0.251681   
11   0.0508524       0.632456                0  0.0601929    0.0457602   
12   0.0508699       0.632456         

In [None]:
## Ploting code for validation / exploration.

In [None]:
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6

# plotting commands

plt.plot(monthly_series)
plt.show()

print(raw_csv.iloc[i][month_dict[key][0]:month_dict[key][-1]].describe())
print('\n\n')
break