In [None]:
# !which python
# !pip install pandas
# !pip install datetime
# !pip install scipy
# !pip install NumPy
# !pip install Matplotlib
# !pip install scikit-learn

In [247]:
import json
import time
import copy
import math
import numpy
import pandas as pd
from datetime import datetime

# importing the csv data about the store sales, 906 stores with 11936 timestamps.
raw_csv = pd.read_csv('sales_granular.csv', index_col=0)

# importing the surroundings information about some 546 stores, each with 89 possible types of surroundings.
raw = json.load(open('Surroundings.json'))

# manual inspection of JSON file by writing a single item in the list to disk.
with open('extract.json', 'w') as outfile:
    json.dump(raw[0], outfile)


In [248]:
def group_by_month(timestamp_str):
    
    space_idx = timestamp_str.find(' ')
    parse_date_string = timestamp_str[:space_idx]
    
    date_datetime = datetime.strptime(parse_date_string, '%m/%d/%y')

    if len(str(date_datetime.month)) == 1:
        mm = '0' + str(date_datetime.month)
    else:
        mm = str(date_datetime.month)
        
    yyyy = str(date_datetime.year)
    
    month_key = '{0}-{1}'.format(yyyy, mm)
    return month_key

In [249]:
def get_series_stats(series, store_id):
    
    sales_points_sum = 0
    sales_points_valid = []

    for _, val in enumerate(series):
        if math.isnan(val):
            continue
        else:
            sales_points_sum += val
            sales_points_valid.append(val)

    series_mean = float(sales_points_sum) / max(len(sales_points_valid), 1)
    series_stdev = max(round(math.sqrt(float(reduce(lambda x, y: x + y, map(lambda x: (x - series_mean) ** 2, sales_points_valid))) / len(sales_points_valid)), 2), 1)
    
    return dict({
        'mean': round(series_mean, 2),
        'months_of_data_count': len(sales_points_valid),
        'store_id': store_id,
        'total_products_sold': sales_points_sum,
#         'stdev_perc': int(series_stdev / (max(series_mean, 1))),
        'stdev': series_stdev
        })
    

In [250]:
# aggregated all the sales data per month based on the column names.
# sorted key list now contains indices per corresponsing month, used later for plotting and regressions.
atomic_timestamps = list(raw_csv)

month_dict = {}

for idx, col_timestamp in enumerate(atomic_timestamps):
    current_month = group_by_month(col_timestamp)
    
    if month_dict.has_key(current_month):
        month_dict[current_month].append(idx)
    else:
        month_dict[current_month] = []
        month_dict[current_month].append(idx)

sorted_key_list = sorted(month_dict.keys())

store_id_list = list(raw_csv.index)


# generate a dictionary of store_id with their appropriate series values (interpretable by month)
store_dict = {}

for i in range(0, len(raw_csv.index)):
    monthly_series = []
    for _, key in enumerate(sorted_key_list):
        monthly_series.append(raw_csv.iloc[i][month_dict[key][0]:month_dict[key][-1]].sum(skipna=True))
        
    store_dict[store_id_list[i]] = monthly_series


In [251]:
### break point ###

In [252]:
stats_df = pd.DataFrame(columns= ['store_id', 'total_products_sold', 'mean', 'stdev', 'months_of_data_count'])

for key in store_dict.keys():
    row_dict = get_series_stats(store_dict[key], key) 
    row_df = pd.DataFrame.from_records(row_dict, index=[0])[['store_id', 'total_products_sold', 'mean', 'stdev', 'months_of_data_count']]
    stats_df = pd.concat([stats_df, row_df])

stats_df.reset_index(drop=True, inplace=True)
# stats_df
result = stats_df.sort_values(['months_of_data_count', 'total_products_sold', 'stdev'], ascending=[0, 0, 1])

In [253]:
result

Unnamed: 0,store_id,total_products_sold,mean,stdev,months_of_data_count
353,10998,4209750.0,183032.61,47500.03,23
685,67077,2818860.0,122559.13,33078.54,23
310,80556,2719680.0,118246.96,23583.90,23
743,63144,2675580.0,116329.57,31746.00,23
86,84146,2143710.0,93204.78,24950.38,23
601,46416,1285470.0,55890.00,13361.42,23
651,69193,1078500.0,46891.30,9451.32,23
205,27059,1010670.0,43942.17,18001.20,23
303,81690,748950.0,32563.04,5830.63,23
407,10391,737280.0,32055.65,10556.23,23


In [254]:
amenities_array = raw[0]['surroundings'].keys()

column_names_amenities = copy.deepcopy(amenities_array)
column_names_amenities = ['store_id'] + column_names_amenities

full_feature_amenities_df = pd.DataFrame(columns = column_names_amenities, index=[0])


for _, surroundings_obj in enumerate(raw):

#     print(len(surroundings_obj['surroundings']['subway_station']))
    
    amenities_feature_dict = {}
    store_id = surroundings_obj['store_code']
    amenities_feature_dict['store_id'] = store_id
    
    for _, key in enumerate(amenities_array):
        amenities_feature_dict[key] = len(surroundings_obj['surroundings'][key])  
        
    feature_amenities_row = pd.DataFrame(data = amenities_feature_dict, columns = column_names_amenities, index=[0])
    
    full_feature_amenities_df = pd.concat([full_feature_amenities_df, feature_amenities_row])


In [259]:
full_feature_amenities_df = full_feature_amenities_df[1:]
full_feature_amenities_df.reset_index(drop=True, inplace=True)
# full_feature_amenities_df

In [270]:
ops_df = result.loc[result['store_id'].isin(full_feature_amenities_df['store_id'])]

merged_df = pd.merge(full_feature_amenities_df, ops_df[['store_id', 'total_products_sold']], on='store_id')

merged_df.dropna()

test_cols = [col for col in merged_df.columns if col not in ['store_id', 'total_products_sold']]
merged_df2 = merged_df[test_cols]

In [275]:
merged_df2

Unnamed: 0,subway_station,department_store,embassy,beauty_salon,police,courthouse,cemetery,pharmacy,local_government_office,shopping_mall,...,storage,zoo,train_station,jewelry_store,laundry,insurance_agency,plumber,pet_store,bakery,travel_agency
0,0,1,0,4,0,1,0,3,1,1,...,0,0,0,4,2,2,0,0,3,2
1,0,0,0,2,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,0,2,1,14,0,0,1,5,3,3,...,0,0,0,50,0,0,0,0,12,14
4,0,1,0,16,0,0,0,5,6,3,...,0,0,1,6,2,3,0,1,6,9
5,0,0,0,7,1,0,0,4,1,0,...,0,0,1,2,0,2,0,0,4,3
6,0,0,0,7,0,0,0,3,1,0,...,0,0,0,3,0,3,0,0,5,3
7,0,0,0,0,0,0,0,1,2,0,...,0,0,1,0,0,1,0,0,4,0
8,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,1
9,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [326]:
from sklearn.neighbors import KNeighborsRegressor

neigh = KNeighborsRegressor(n_neighbors=5, weights='distance')
neigh.fit(merged_df2, merged_df['total_products_sold']) 


KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='distance')

In [327]:
print(neigh.predict([merged_df2.iloc[505]]))
merged_df.iloc[505]['total_products_sold']

[ 72630.]


72630.0

In [None]:
## Ploting code for validation / exploration.

In [None]:
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6

# plotting commands

plt.plot(monthly_series)
plt.show()

print(raw_csv.iloc[i][month_dict[key][0]:month_dict[key][-1]].describe())
print('\n\n')
break

In [137]:
import numpy as np

# borrowed from https://plot.ly/python/polygon-area/

def PolygonSort(corners):
    n = len(corners)
    cx = float(sum(x for x, y in corners)) / n
    cy = float(sum(y for x, y in corners)) / n
    cornersWithAngles = []
    for x, y in corners:
        an = (np.arctan2(y - cy, x - cx) + 2.0 * np.pi) % (2.0 * np.pi)
        cornersWithAngles.append((x, y, an))
    cornersWithAngles.sort(key = lambda tup: tup[2])
    return map(lambda (x, y, an): (x, y), cornersWithAngles)

def PolygonArea(corners):
    n = len(corners)
    area = 0.0
    for i in range(n):
        j = (i + 1) % n
        area += corners[i][0] * corners[j][1]
        area -= corners[j][0] * corners[i][1]
    area = abs(area) / 2.0
    return area

# corners = [(13.54, 11.66), (9.05, 14.11), (4.65, 12.44), (7.17, 7.91)]
corners = [(2.2, 5.4), (5.4, 4.4), (4.7, 1.7), (1.4, 2.5)]
corners_sorted = PolygonSort(corners)
area = PolygonArea(corners_sorted)

area

9.775000000000004

In [None]:
### Code which didn't make it into the analysis

class StoreSalesMonth():
    
    def __init__(self):
        
        # timestamp property
        self.timestamp_components_number = 0        
        self.timestamp_component_sales_array = []
        
        # date property
        self.current_month = 0
        self.current_date_string = ''
        self.current_date_datetime = None
        
    def TotalMonthlySales(self):
        return sum(self.timestamp_component_sales_array)
        

store_sales_array = []
        

# first get the column names
atomic_timestamps = list(raw_csv)

current_month = None
timestamp_count

# instanciate StoreSalesMonth object
store_monthly_sales_obj = StoreSalesMonth()

# currently working with a single store (the first one!)
for idx, raw_date in enumerate(atomic_timestamps[0]):

    space_idx = raw_date.find(' ')
        
    store_monthly_sales_obj.current_date_string = raw_date[:space_idx]
    store_monthly_sales_obj.current_date_datetime = datetime.strptime(store_monthly_sales_obj.current_date, '%m/%d/%y')
    store_monthly_sales_obj.current_month = store_monthly_sales_obj.current_date_datetime.month
    
    if current_month == None:
        current_month = store_monthly_sales_obj.current_month
        store_monthly_sales_obj.timestamp_components_number += 1
        store_monthly_sales_obj.timestamp_component_sales_array.append(raw_csv[raw_date][idx])
        continue
        
    else:
        current_date_string = raw_date[:space_idx]
        current_month = datetime.strptime(current_date_string, '%m/%d/%y').month
        
        if current_month
        
        