In [None]:
from google.colab import files
import pandas as pd
import io

# Colab upload helper; the result is indexed by file name in the next cell and
# each entry is wrapped in io.BytesIO there, so it is presumably a mapping of
# file name -> raw bytes (confirm against the google.colab docs).
uploaded = files.upload()

In [None]:
import matplotlib.pyplot as plt

def _read_uploaded_csv(file_name):
  """Parse one of the uploaded files (looked up by name in `uploaded`) as CSV."""
  return pd.read_csv(io.BytesIO(uploaded[file_name]))


metadata = _read_uploaded_csv('metadata_1.11.23.csv')
time_series = _read_uploaded_csv('time_series_AB 20230111.csv')

# Discard weeks flagged as out-of-stock / low inventory.
in_stock = time_series['Inventory Status'].ne('OOS or Low Inventory')
time_series = time_series[in_stock]

# Keep only the columns the rest of the notebook works with.
time_data = time_series.loc[:, ["week_start", "unit_sold", "asin", "sales"]]
time_data.head()

In [None]:
# ID and Year Enumeration
asin_list = time_series['asin'].tolist()
# De-duplicate while keeping first-appearance order. Iterating a bare set of
# strings yields a different order on every kernel start (hash randomization),
# which made positional picks such as data_avg[11] non-reproducible.
unique_ids = list(dict.fromkeys(asin_list))

# year_index <-> four-digit year label
year_enum = {0: '2014', 1: '2015', 2: '2016', 3: '2017', 4: '2018',
             5: '2019', 6: '2020', 7: '2021', 8: '2022', 9: '2023'}
year_enumR = {label: index for index, label in year_enum.items()}

# The year is read from the last four characters of week_start; a year outside
# year_enum raises KeyError rather than being silently mislabelled.
year_list = [year_enumR[week_start[-4:]] for week_start in time_data['week_start']]

time_data = time_data.assign(year_index=year_list)
time_data.head()

In [None]:
# year_index 9 is 2023; exclude it because it would skew the yearly percentages
not_2023 = time_data['year_index'].ne(9)
time_data = time_data[not_2023]

In [None]:
# For each product, total the sales recorded within each calendar year.
# NOTE: despite its name, max_prices holds yearly sales *totals* (sums), not maxima.
# asin -> (year string -> summed sales for that year)
SAMPLE_ASIN = 'B07NW4Y5BW'

# Year label = last four characters of week_start.
sales_year = time_data['week_start'].str[-4:]

# One grouped pass instead of re-filtering and iterrows-ing the frame per ASIN.
# skipna=False keeps the running-sum behaviour of propagating a missing sales
# value into that year's total.
yearly_totals = (
    time_data['sales']
    .groupby([time_data['asin'], sales_year], sort=False)
    .agg(lambda weekly_sales: weekly_sales.sum(skipna=False))
)

# Every known ASIN gets an entry, even when it has no rows left in time_data.
max_prices = {asin: {} for asin in unique_ids}
for (asin, year), total in yearly_totals.items():
  max_prices.setdefault(asin, {})[year] = total

# Spot-check one product; .get avoids a KeyError if that ASIN is not in this upload.
max_prices.get(SAMPLE_ASIN, {})

In [None]:
# Add percentage column: each row's sales as a fraction of that product's
# total for the same year (looked up in max_prices); zero sales map to 0.
year_keys = time_data['week_start'].str[-4:]
percent_list = [
    0 if weekly_sales == 0 else weekly_sales / max_prices[asin][year]
    for weekly_sales, asin, year in zip(time_data['sales'], time_data['asin'], year_keys)
]

df2 = time_data.assign(year_percent=percent_list)
df2.head()

In [None]:
# Strip the trailing year so the same week_start from different years shares one key.
# Only a trailing "<separator>YYYY" is removed, which makes this cell safe to
# re-run: a plain [:-5] slice would chop five more characters on every execution.
df2['week_start'] = df2['week_start'].str.replace(r'\D\d{4}$', '', regex=True)
df2['week_start'].head()

In [None]:
# Create data_avg: one frame per product holding, for every (year-less)
# week_start, the mean of the numeric columns across years.
data_avg = []

for asin in unique_ids:
  product_rows = df2.loc[df2['asin'] == asin]
  # numeric_only=True keeps the string 'asin' column out of the mean; without it
  # pandas >= 2.0 raises TypeError instead of silently dropping the column.
  cur_avg = product_rows.groupby('week_start').mean(numeric_only=True)
  cur_avg['asin'] = asin
  # Products with no usable rows are skipped.
  if len(cur_avg) != 0:
    data_avg.append(cur_avg)

data_avg[0].head()

In [None]:
# ReIndex and Convert to Date
# Each frame gets '_avg'-suffixed columns and a DatetimeIndex built by attaching
# one common year to the year-less week_start labels.
REFERENCE_YEAR = 2020  # 2020 is a leap year, so a 2/29 label still parses

for i, elem in enumerate(data_avg):
  if 'asin_avg' in elem.columns:
    # Already converted by an earlier run of this cell; converting again would
    # double-suffix the columns and break the date parsing.
    continue
  elem = elem.add_suffix('_avg').reset_index()
  elem['week_start'] = pd.to_datetime(elem['week_start'].astype(str) + f'/{REFERENCE_YEAR}')
  data_avg[i] = elem.sort_values('week_start').set_index('week_start')


In [None]:
# Product at position 11 of data_avg is the example used for the decorative spike
example_position = 11

cur = data_avg[example_position]
cur.plot(subplots=True, legend=True)
cur.head()

In [None]:
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() only appended a stray "None" to the output.
cur.info()

In [None]:
from scipy.signal import find_peaks, peak_widths

# Plot the example product's average share of yearly sales against its own
# mean (red) and median (green).
plotable = cur['year_percent_avg']
avg = plotable.mean()
median = plotable.median()

fig, ax = plt.subplots()
# A Series is always plotted against its index, so no x= argument is needed.
plotable.plot(ax=ax)
ax.axhline(y=avg, color='r', label='mean')
ax.axhline(y=median, color='g', label='median')
ax.legend()  # without a legend the line labels are never displayed
plt.show()
avg

In [None]:

# NOTE: this cell reads the 'Seasonality' column, which is only created by the
# "Seasonality Index" cell further down; run that cell first and make sure
# `cur` is bound to a product frame that carries the column.
plotable = cur['Seasonality']
avg = 1  # a Seasonality of 1 corresponds to the product's median share

fig, ax = plt.subplots()
plotable.plot(ax=ax)
ax.axhline(y=avg, color='r', label='median baseline (1.0)')
# The label now matches the value actually drawn (1.25, the in-season cut-off).
ax.axhline(y=1.25, color='g', label='in-season threshold (1.25)')
ax.legend()
plt.show()

In [None]:
# Visualization
# first_product = time_data.loc[time_data['asin'] == 'B01LXOVLEQ']

# for i in range(3,10):
#   # print(first_product['year_index'])
#   year_data = first_product.loc[first_product['year_index'] == i]
#   print(year_enum[i])
#   # print(year_data)
#   x = year_data['week_start'].to_numpy()
#   y = year_data['unit_sold'].to_numpy()
#   plt.plot(x,y)
#   plt.show()
#   plt.clf()
# first_product.head()

In [None]:
# Obtain distribution of good data points: number of rows left in time_data
# for every ASIN, in unique_ids order (0 when an ASIN has none).
rows_per_asin = time_data['asin'].value_counts().to_dict()
distribution = [int(rows_per_asin.get(asin, 0)) for asin in unique_ids]

distribution

In [None]:
# Plot distribution
import numpy as np

# Histogram of how many rows each ASIN has (numpy's default binning).
counts, bins = np.histogram(distribution)

fig, ax = plt.subplots()
ax.hist(bins[:-1], bins, weights=counts)
ax.set(title='Data points per product', xlabel='Rows per ASIN', ylabel='Number of ASINs')
plt.show()

In [None]:
# Confidence Score
#
# 150 data points are treated as enough to be "confident"; each product gets
# c = min(data_points / 150, 1), where data_points is its row count in time_data.
points_per_asin = time_data['asin'].value_counts().to_dict()

for product in data_avg:
  asin = product['asin_avg'].iloc[0]
  data_points = int(points_per_asin.get(asin, 0))
  # Column assignment updates the frame stored in data_avg in place.
  product['Confidence'] = min(1, data_points / 150)

In [None]:
# Seasonality Index
#
# Each value is rescaled around the product's median by the larger of the two
# distances median->min and median->max:
#   1 => at the median, below 1 => under the median, above 1 => over the median;
#   the extreme farthest from the median lands exactly on 0 or 2.
# Anything above 1.25 is considered in season.

for i, product in enumerate(data_avg):
  # Use a loop-local name: rebinding the notebook-level `cur` here would leave
  # it pointing at a Series and break the cells that index cur[...] as a frame.
  share = product['year_percent_avg']
  median = share.median()
  largest_difference = max(abs(median - share.min()), abs(median - share.max()))
  if largest_difference == 0:
    # Flat series: every point sits on the median, so report 1 instead of 0/0 = NaN.
    product['Seasonality'] = 1.0
  else:
    product['Seasonality'] = (share - median) / largest_difference + 1
  data_avg[i] = product

In [None]:
# Weekly Indexing
#
# Re-index every product frame by the ISO week number of its week_start date;
# week_start itself is kept as a regular column.

for i, product in enumerate(data_avg):
  temp = product.reset_index()
  # Series.dt.week was removed in pandas 2.0; isocalendar().week is its
  # replacement. Cast back to int64, the dtype .dt.week used to return.
  temp['Week'] = temp['week_start'].dt.isocalendar().week.astype('int64')
  data_avg[i] = temp.set_index('Week')

data_avg[0]

In [None]:
# Re-select the example product from data_avg and plot every column in its own panel
cur = data_avg[11]
cur.plot(subplots=True, legend=True)
cur.head()