In [39]:
import numpy as np
import pandas as pd
import xgboost
from statsmodels.tsa.ar_model import AR
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from datetime import datetime
%matplotlib inline
from sklearn.neighbors import NearestNeighbors

In [4]:
def preprocess_data(df):
    """Add numeric agency/SKU ids and calendar columns derived from ``YearMonth``.

    The frame is modified in place and is also returned, so both
    ``preprocess_data(df)`` and ``df = preprocess_data(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Agency`` and ``SKU`` string columns whose second
        ``'_'``-separated field is an integer (e.g. ``'Agency_01'``), and a
        ``YearMonth`` column holding ``YYYYMM`` numbers (e.g. ``201601``).

    Returns
    -------
    pandas.DataFrame
        The same object with ``Agency_no``, ``SKU_no``, ``Year``, ``Month``
        and ``Datetime`` (first day of the month) columns added.
    """
    # Vectorised string handling instead of a Python-level lambda per row.
    df['Agency_no'] = df['Agency'].str.split('_').str[1].astype(int)
    df['SKU_no'] = df['SKU'].str.split('_').str[1].astype(int)

    # Integer arithmetic avoids the float round-trip of int(x / 100).
    year_month = df['YearMonth'].astype(int)
    df['Year'] = year_month // 100
    df['Month'] = year_month % 100

    # to_datetime assembles timestamps from year/month/day columns directly.
    df['Datetime'] = pd.to_datetime(
        pd.DataFrame({'year': df['Year'], 'month': df['Month'], 'day': 1})
    )
    return df

In [8]:
# Load the training data, derive the helper columns, then order the rows by
# month, SKU and agency.
df = (
    pd.read_csv('integrated_train.csv')
    .pipe(preprocess_data)
    .sort_values(['YearMonth', 'SKU', 'Agency'])
)

In [40]:
# Year on year, for the same month (January 2016 vs January 2017), do an
# agency's two best products change? Roughly 3 times out of 4 they don't.
def _top_two_skus(frame, agency, yearmonth, sku_order):
    """Return up to two SKU numbers with the highest Volume * Sales.

    Only the first record per (yearmonth, agency, sku) is used. SKUs are
    visited in ``sku_order`` and sorted with a stable sort, so ties keep
    that order. Fewer than two SKUs are returned when the agency has fewer
    than two SKUs with a record in that month.
    """
    # Narrow to the agency/month once instead of re-filtering per SKU.
    agency_month = frame[(frame.YearMonth == yearmonth) & (frame.Agency_no == agency)]
    profits = []
    for sku in sku_order:
        rec = agency_month[agency_month.SKU_no == sku]
        if len(rec) == 0:
            continue
        profit = rec.Volume.values[0] * rec.Sales.values[0]
        profits.append({'sku': sku, 'profit': profit})
    profits.sort(key=lambda x: x['profit'], reverse=True)
    return [entry['sku'] for entry in profits[:2]]


same = 0
different = 0
# Agencies that cannot be compared because one of the two months has fewer
# than two ranked SKUs. They are skipped explicitly instead of relying on a
# bare ``except`` that would also hide genuine errors.
skipped_agencies = []
sku_order = df.SKU_no.unique()
for agency in df.Agency_no.unique():
    best_16 = _top_two_skus(df, agency, 201601, sku_order)
    best_17 = _top_two_skus(df, agency, 201701, sku_order)
    if len(best_16) < 2 or len(best_17) < 2:
        skipped_agencies.append(agency)
        continue
    # Compare as sets: the order of the two best SKUs does not matter.
    if set(best_16) != set(best_17):
        different += 1
    else:
        same += 1
print("Same",same,"Different",different)

Same 41 Different 13


In [48]:
# Demographics per agency, ordered by agency name with a fresh 0..n-1 index so
# that row positions can be used as agency positions further down.
dem = (
    pd.read_csv('demographics.csv')
    .sort_values('Agency')
    .reset_index(drop=True)
)
dem.head()

Unnamed: 0,Agency,Avg_Population_2017,Avg_Yearly_Household_Income_2017
0,Agency_01,153733,120207
1,Agency_02,3137874,240809
2,Agency_03,1538040,217280
3,Agency_04,50753,117957
4,Agency_05,3044268,182944


In [50]:
# Feature matrix for the neighbour search: one row per agency, columns are
# population and household income.
# NOTE(review): the two features are used unscaled, so the column with the
# larger numeric range dominates the Euclidean distance - confirm this is intended.
demographic_columns = ['Avg_Population_2017', 'Avg_Yearly_Household_Income_2017']
X = dem.loc[:, demographic_columns].values

In [64]:
# Fit a 5-nearest-neighbour index on the demographics and query it with the
# same points, so each row's first neighbour is the row itself (distance 0).
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(X)
distances, indices = nbrs.kneighbors(X)

In [65]:
# Neighbour row positions for rows 5 and 13 of `dem`
# (presumably Agency_06 and Agency_14, since `dem` is sorted by Agency - confirm).
row_a, row_b = 5, 13
print(indices[row_a], indices[row_b])

[ 5 13 59 54 49] [13  5 54 49 59]


In [66]:
# Distances to those neighbours for rows 5 and 13 of `dem`
# (presumably Agency_06 and Agency_14, since `dem` is sorted by Agency - confirm).
row_a, row_b = 5, 13
print(distances[row_a], distances[row_b])

[     0.          39066.31184281  77968.9600418   79256.0938288
  95904.0198584 ] [      0.           39066.31184281   58829.97008328   61785.76692087
  107268.9173293 ]


The agencies closest to both Agency_06 and Agency_14 (by population and household income) are Agency_55, Agency_60, and Agency_50.

In [89]:
# Top three SKUs by Volume * Sales (called 'profit' here) in January 2017 for
# agencies 55, 60 and 50.
yearmonth = 201701
for agency in [55, 60, 50]:
    # Narrow to this agency and month once, rather than re-filtering the
    # whole frame for every SKU.
    agency_month = df[(df.YearMonth == yearmonth) & (df.Agency_no == agency)]
    profits = []
    for sku in df.SKU_no.unique():
        rec = agency_month[agency_month.SKU_no == sku]
        if len(rec) == 0:
            continue
        # Only the first matching record is used.
        profit = rec.Volume.values[0] * rec.Sales.values[0]
        profits.append({'sku': sku, 'profit': profit})
    profits.sort(key=lambda x: x['profit'], reverse=True)
    # Slice instead of indexing so an agency with fewer than three SKUs
    # prints what it has rather than raising IndexError.
    top = profits[:3]
    print("Agency ", agency, *[entry['sku'] for entry in top])
    print("Agency ", agency, *[entry['profit'] for entry in top])

Agency  55 3 4 1
Agency  55 7734846.72332 3708007.95671 3398311.91474
Agency  60 1 2 5
Agency  60 10799274.3156 5205433.17372 2340306.77764
Agency  50 2 5 4
Agency  50 10634622.5648 4255073.05027 3605110.7062


In [87]:
# Top three SKUs by Volume * Sales (called 'profit' here) in January 2016 for
# agencies 55, 60 and 50.
yearmonth = 201601
for agency in [55, 60, 50]:
    # Narrow to this agency and month once, rather than re-filtering the
    # whole frame for every SKU.
    agency_month = df[(df.YearMonth == yearmonth) & (df.Agency_no == agency)]
    profits = []
    for sku in df.SKU_no.unique():
        rec = agency_month[agency_month.SKU_no == sku]
        if len(rec) == 0:
            continue
        # Only the first matching record is used.
        profit = rec.Volume.values[0] * rec.Sales.values[0]
        profits.append({'sku': sku, 'profit': profit})
    profits.sort(key=lambda x: x['profit'], reverse=True)
    # Slice instead of indexing so an agency with fewer than three SKUs
    # prints what it has rather than raising IndexError.
    top = profits[:3]
    print("Agency ", agency, *[entry['sku'] for entry in top])
    print("Agency ", agency, *[entry['profit'] for entry in top])

Agency  55 3 1 2
Agency  55 5685646.98469 4390884.9037 2473310.99321
Agency  60 1 2 4
Agency  60 10839939.0775 4904319.33475 3083362.09603
Agency  50 2 5 4
Agency  50 10220413.5336 4209387.90349 3658375.14033


In [82]:
# Distinct SKU numbers, in order of first appearance in the sorted frame.
df['SKU_no'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8, 11, 12, 18, 22, 31, 32, 34, 14, 15,
       20, 23, 17, 24, 27, 26, 21, 28])

In [91]:
# Repeat the neighbour search with 3 neighbours and an explicit KD-tree, then
# show the neighbour positions and distances for rows 5 and 13.
nbrs = NearestNeighbors(n_neighbors=3, algorithm='kd_tree')
nbrs.fit(X)
distances, indices = nbrs.kneighbors(X)
for neighbour_info in (indices, distances):
    print(neighbour_info[5], neighbour_info[13])

[ 5 13 59] [13  5 54]
[     0.          39066.31184281  77968.9600418 ] [     0.          39066.31184281  58829.97008328]


In [92]:
# Top three SKUs by Volume * Sales (called 'profit' here) in January 2015 for
# agencies 55, 60 and 50.
yearmonth = 201501
for agency in [55, 60, 50]:
    # Narrow to this agency and month once, rather than re-filtering the
    # whole frame for every SKU.
    agency_month = df[(df.YearMonth == yearmonth) & (df.Agency_no == agency)]
    profits = []
    for sku in df.SKU_no.unique():
        rec = agency_month[agency_month.SKU_no == sku]
        if len(rec) == 0:
            continue
        # Only the first matching record is used.
        profit = rec.Volume.values[0] * rec.Sales.values[0]
        profits.append({'sku': sku, 'profit': profit})
    profits.sort(key=lambda x: x['profit'], reverse=True)
    # Slice instead of indexing so an agency with fewer than three SKUs
    # prints what it has rather than raising IndexError.
    top = profits[:3]
    print("Agency ", agency, *[entry['sku'] for entry in top])
    print("Agency ", agency, *[entry['profit'] for entry in top])

Agency  55 3 1 4
Agency  55 4579098.92409 3204261.21747 2202242.93078
Agency  60 1 2 4
Agency  60 12722755.6557 4442398.22346 3261442.52947
Agency  50 2 5 4
Agency  50 10521094.2489 4765179.39834 3598858.0518


In [93]:
# Top three SKUs by Volume * Sales (called 'profit' here) in January 2014 for
# agencies 55, 60 and 50.
yearmonth = 201401
for agency in [55, 60, 50]:
    # Narrow to this agency and month once, rather than re-filtering the
    # whole frame for every SKU.
    agency_month = df[(df.YearMonth == yearmonth) & (df.Agency_no == agency)]
    profits = []
    for sku in df.SKU_no.unique():
        rec = agency_month[agency_month.SKU_no == sku]
        if len(rec) == 0:
            continue
        # Only the first matching record is used.
        profit = rec.Volume.values[0] * rec.Sales.values[0]
        profits.append({'sku': sku, 'profit': profit})
    profits.sort(key=lambda x: x['profit'], reverse=True)
    # Slice instead of indexing so an agency with fewer than three SKUs
    # prints what it has rather than raising IndexError.
    top = profits[:3]
    print("Agency ", agency, *[entry['sku'] for entry in top])
    print("Agency ", agency, *[entry['profit'] for entry in top])

Agency  55 1 3 4
Agency  55 5731518.04224 5232174.12337 5122473.76959
Agency  60 1 4 2
Agency  60 10794965.9834 3338277.07793 3229782.29273
Agency  50 2 5 4
Agency  50 11011542.9323 5238773.97497 3649089.08549


In [94]:
# Top three SKUs by Volume * Sales (called 'profit' here) in January 2015 for
# agencies 55, 60 and 50.
# NOTE(review): yearmonth 201501 is already analysed in an earlier cell of
# this notebook - confirm whether a different month was intended here.
yearmonth = 201501
for agency in [55, 60, 50]:
    # Narrow to this agency and month once, rather than re-filtering the
    # whole frame for every SKU.
    agency_month = df[(df.YearMonth == yearmonth) & (df.Agency_no == agency)]
    profits = []
    for sku in df.SKU_no.unique():
        rec = agency_month[agency_month.SKU_no == sku]
        if len(rec) == 0:
            continue
        # Only the first matching record is used.
        profit = rec.Volume.values[0] * rec.Sales.values[0]
        profits.append({'sku': sku, 'profit': profit})
    profits.sort(key=lambda x: x['profit'], reverse=True)
    # Slice instead of indexing so an agency with fewer than three SKUs
    # prints what it has rather than raising IndexError.
    top = profits[:3]
    print("Agency ", agency, *[entry['sku'] for entry in top])
    print("Agency ", agency, *[entry['profit'] for entry in top])

Agency  55 3 1 4
Agency  55 4579098.92409 3204261.21747 2202242.93078
Agency  60 1 2 4
Agency  60 12722755.6557 4442398.22346 3261442.52947
Agency  50 2 5 4
Agency  50 10521094.2489 4765179.39834 3598858.0518
