In [3]:
import json
import math
import numpy
import pandas as pd
from datetime import datetime


# importing the csv data about the store sales, 906 stores with 11936 timestamps.
raw_csv = pd.read_csv('sales_granular.csv', index_col=0)

# importing the surroundings information about some 546 stores, each with 89 possible types of surroundings.
raw = json.load(open('Surroundings.json'))


# manual inspection of JSON file by writing a single item in the list to disk.
with open('extract.json', 'w') as outfile:
    json.dump(raw[0], outfile)


In [4]:
def group_by_month(timestamp_str):
    
    space_idx = timestamp_str.find(' ')
    parse_date_string = timestamp_str[:space_idx]
    
    date_datetime = datetime.strptime(parse_date_string, '%m/%d/%y')

    if len(str(date_datetime.month)) == 1:
        mm = '0' + str(date_datetime.month)
    else:
        mm = str(date_datetime.month)
        
    yyyy = str(date_datetime.year)
    
    month_key = '{0}-{1}'.format(yyyy, mm)
    return month_key

In [25]:
def get_series_stats(series, store_id):
    
    sales_points_sum = 0
    sales_points_valid = []

    for _, val in enumerate(series):
        if math.isnan(val):
            continue
        else:
            sales_points_sum += val
            sales_points_valid.append(val)

    series_mean = float(sales_points_sum) / max(len(sales_points_valid), 1)
    
            
    return dict({
        'mean': round(series_mean, 2),
        'count': len(sales_points_valid),
        'store_id': store_id,
        'total_products_sold': sales_points_sum,
        'stdev': round(math.sqrt(float(reduce(lambda x, y: x + y, map(lambda x: (x - series_mean) ** 2, sales_points_valid))) / len(sales_points_valid)), 2)
        })
    

In [6]:
# aggregated all the sales data per month based on the column names.
# sorted key list now contains indices per corresponsing month, used later for plotting and regressions.
atomic_timestamps = list(raw_csv)

month_dict = {}

for idx, col_timestamp in enumerate(atomic_timestamps):
    current_month = group_by_month(col_timestamp)
    
    if month_dict.has_key(current_month):
        month_dict[current_month].append(idx)
    else:
        month_dict[current_month] = []
        month_dict[current_month].append(idx)

sorted_key_list = sorted(month_dict.keys())

store_id_list = list(raw_csv.index)


# generate a dictionary of store_id with their appropriate series values (interpretable by month)
store_dict = {}

for i in range(0, len(raw_csv.index)):
    monthly_series = []
    for _, key in enumerate(sorted_key_list):
        monthly_series.append(raw_csv.iloc[i][month_dict[key][0]:month_dict[key][-1]].sum(skipna=True))
        
    store_dict[store_id_list[i]] = monthly_series


In [26]:
get_series_stats(store_dict[33111], 33111)

{'count': 10,
 'mean': 849.0,
 'stdev': 621.96,
 'store_id': 33111,
 'total_products_sold': 8490.0}

In [None]:
### break point ###

In [29]:
ex = get_series_stats(store_dict[33111], 33111)

df_row = pd.DataFrame.from_records(ex, index=[0])[['store_id', 'total_products_sold', 'mean', 'stdev', 'count']]
df_row

Unnamed: 0,store_id,total_products_sold,mean,stdev,count
0,33111,8490.0,849.0,621.96,10


In [43]:
stats_df = pd.DataFrame(columns= ['store_id', 'total_products_sold', 'mean', 'stdev', 'count'])

print(stats_df)

for key in store_dict.keys():
    row_dict = get_series_stats(store_dict[key], key) 
    row_df = pd.DataFrame.from_records(row_dict, index=[0])[['store_id', 'total_products_sold', 'mean', 'stdev', 'count']]
    stats_df = pd.concat([stats_df, row_df])

stats_df

Empty DataFrame
Columns: [store_id, total_products_sold, mean, stdev, count]
Index: []


Unnamed: 0,store_id,total_products_sold,mean,stdev,count
0,24576,1650.0,412.50,446.17,4
0,24577,390.0,97.50,85.84,4
0,26405,8700.0,1087.50,582.06,8
0,30724,3090.0,441.43,209.52,7
0,28673,11970.0,1710.00,253.55,7
0,33111,8490.0,849.00,621.96,10
0,30732,3000.0,500.00,357.35,6
0,30734,3810.0,544.29,453.40,7
0,50861,31950.0,2282.14,938.77,14
0,18448,3810.0,762.00,283.72,5


In [None]:
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6

# plotting commands

plt.plot(monthly_series)
plt.show()

print(raw_csv.iloc[i][month_dict[key][0]:month_dict[key][-1]].describe())
print('\n\n')
break

In [None]:
## important step for analysis is to group the sales per office in a monthly group

class StoreSalesMonth():
    
    def __init__(self):
        
        # timestamp property
        self.timestamp_components_number = 0        
        self.timestamp_component_sales_array = []
        
        # date property
        self.current_month = 0
        self.current_date_string = ''
        self.current_date_datetime = None
        
    def TotalMonthlySales(self):
        return sum(self.timestamp_component_sales_array)
        

store_sales_array = []
        

# first get the column names
atomic_timestamps = list(raw_csv)

current_month = None
timestamp_count

# instanciate StoreSalesMonth object
store_monthly_sales_obj = StoreSalesMonth()

# currently working with a single store (the first one!)
for idx, raw_date in enumerate(atomic_timestamps[0]):

    space_idx = raw_date.find(' ')
        
    store_monthly_sales_obj.current_date_string = raw_date[:space_idx]
    store_monthly_sales_obj.current_date_datetime = datetime.strptime(store_monthly_sales_obj.current_date, '%m/%d/%y')
    store_monthly_sales_obj.current_month = store_monthly_sales_obj.current_date_datetime.month
    
    if current_month == None:
        current_month = store_monthly_sales_obj.current_month
        store_monthly_sales_obj.timestamp_components_number += 1
        store_monthly_sales_obj.timestamp_component_sales_array.append(raw_csv[raw_date][idx])
        continue
        
    else:
        current_date_string = raw_date[:space_idx]
        current_month = datetime.strptime(current_date_string, '%m/%d/%y').month
        
        if current_month
        
        

In [None]:
# !which python
# !pip install pandas
# !pip install scipy
# !pip install NumPy
# !pip install Matplotlib
# !pip install scikit-learn
# !pip install statsmodels

In [None]:
import pandas as pd
raw_csv = pd.read_csv('sales_granular.csv', index_col=0)

In [None]:
raw_csv[:25]
raw_csv.count()
raw_csv.index

In [None]:
# 11936 columns 
# 906 rows (stores)

In [None]:
# has 546 elements in json
len(raw)

In [None]:
raw[1].keys()

In [None]:
print(len(list(raw_csv.index)))
print(len(tuple(raw_csv.index)))

In [None]:
raw[0].keys()

In [None]:
raw[2]['surroundings']['beauty_salon'][3].keys()[0].encode('utf-8')

In [None]:
with open('extract.json', 'w') as outfile:
    json.dump(raw[0], outfile)

In [None]:
for surr_type in raw[2]['surroundings'].keys():
    print(surr_type.encode('utf8'))

In [None]:
train = [1,2,3,4,5]
history = [x for x in train]
history[-1]


In [None]:
len(list(raw_csv))

In [None]:
[x[0] for x in list(raw_csv)]
obs_counter = 0
for _, entries_per_month in enumerate(list(raw_csv)):
    if entries_per_month[0] == '3' and entries_per_month[5] == '6' :
        obs_counter += 1
        print(entries_per_month)

obs_counter



In [None]:
space_idx = list(raw_csv)[100].find(' ')
date = list(raw_csv)[100][:space_idx]
date

In [None]:
!pip install timestring
import timestring

In [None]:
timestring.Date('25/12/2015')
import time


In [None]:
!pip install datetime
import datetime

In [None]:
from datetime import datetime
datetime.strptime('12/25/15', '%m/%d/%y')