<a href="https://colab.research.google.com/github/tnfru/colab_research/blob/master/PricePred_vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
from tensorflow.nn import softmax
from copy import deepcopy, copy
import time

In [0]:
# Load the Superstore example workbook into a DataFrame.
# NOTE(review): the URL is a filebin.net share link with a query token; such
# links are presumably temporary - confirm it is still reachable or point this
# at a permanent copy of the file.
data = pd.read_excel('https://filebin.net/d71dl6ly0ucp7zwg/Beispiel_-_Superstore.xls?t=bj5559pa')

In [0]:
# Working copy without the customer-name column ("Kundenname"); this is the
# frame handed to get_separated_entities() further down. `data` keeps all
# original columns and is passed to create_CPM() later.
df = data.drop(['Kundenname'], axis=1)

In [0]:
def encode(df, column):
  """Label-encode one column of ``df`` in place.

  The column is converted to pandas' ``category`` dtype and its integer
  category codes are written to a new column named ``column + "_cat"``.

  Args:
    df: DataFrame to modify (mutated, nothing is returned).
    column: name of the column to encode.
  """
  categorical = df[column].astype('category')
  df[column] = categorical
  df[column + "_cat"] = categorical.cat.codes

In [0]:
def encode_labels(df):
  """Replace the ID, date and location columns of ``df`` by integer codes.

  Each of the columns listed below gets a ``<name>_cat`` companion column
  (via ``encode``); afterwards the original columns are removed together
  with "Versanddatum" (ship date) and "Zeilen-ID" (row id).

  The frame is first sorted by order date and then by the location columns,
  so the returned rows are ordered by country/region, state and city.

  Args:
    df: order-level DataFrame; the caller's frame is not modified because
      ``sort_values`` returns a new frame before anything is encoded.

  Returns:
    A new DataFrame with the encoded columns appended and the raw ones dropped.
  """
  id_and_date_columns = ["Auftrags-ID", "Kunden-ID", "Bestelldatum", "Produkt-ID"]
  location_columns = ["Land/Region", "Bundesland", "Stadt"]

  ordered = df.sort_values(by=["Bestelldatum"])
  for name in id_and_date_columns:
    encode(ordered, name)

  ordered = ordered.sort_values(by=location_columns)
  for name in location_columns:
    encode(ordered, name)

  # Everything that now has a *_cat twin, plus two columns that are not used.
  obsolete = id_and_date_columns + location_columns + ["Versanddatum", "Zeilen-ID"]
  return ordered.drop(obsolete, axis=1)

In [0]:
def get_separated_entities(df):
  """Split the order data into item, customer and transaction feature tables.

  ``df`` is first passed through ``encode_labels``; each table is then a
  different column subset of that encoded frame with selected categorical
  columns one-hot encoded via ``pd.get_dummies``.

  Args:
    df: order-level DataFrame accepted by ``encode_labels``.

  Returns:
    Tuple ``(items, customers, transactions)`` of DataFrames.
  """
  encoded = encode_labels(df)

  # Items: product attributes only, category and sub-category one-hot encoded.
  not_item_related = ["Kunden-ID_cat", "Bestelldatum_cat", "Auftrags-ID_cat",
                      "Land/Region_cat", "Bundesland_cat", "Stadt_cat", "Region",
                      "Segment", "Versandmodus"]
  items = pd.get_dummies(encoded.drop(columns=not_item_related),
                         columns=["Kategorie", "Unterkategorie"])

  # Transactions: every encoded column except the free-text product name.
  transactions = pd.get_dummies(
      encoded, columns=["Segment", "Versandmodus", "Region", "Kategorie"])
  transactions = transactions.drop("Produktname", axis=1)

  # Customers: product, shipping and date information removed.
  not_customer_related = ["Versandmodus", "Produktname", "Kategorie", "Gewinn",
                          "Bestelldatum_cat", "Produkt-ID_cat", "Kunden-ID_cat",
                          "Unterkategorie"]
  customers = pd.get_dummies(encoded.drop(columns=not_customer_related),
                             columns=["Segment", "Region"])

  return items, customers, transactions
  

In [0]:
def cosine_sim(a, b):
  """Cosine similarity of two vectors.

  Args:
    a, b: 1-D numeric arrays of equal length.

  Returns:
    ``a @ b / (|a| * |b|)``. If either vector has zero norm the similarity
    is undefined; 0.0 is returned in that case instead of dividing by zero
    (which would yield NaN and a RuntimeWarning).
  """
  denominator = np.linalg.norm(a) * np.linalg.norm(b)
  if denominator == 0:
    return 0.0
  return a @ b / denominator

In [0]:
def create_CPM(data):
  """Build the binary customer-product matrix (CPM).

  Only the "Kunden-ID" (customer id) and "Produkt-ID" (product id) columns
  of ``data`` are read; any other columns are ignored, so the function no
  longer depends on the exact set of remaining columns.

  Row and column indices are the pandas category codes of the two ID
  columns, i.e. the same integer encoding that ``encode`` produces.

  Args:
    data: DataFrame with one row per order line.

  Returns:
    Float array of shape (number of distinct customers, number of distinct
    products) holding 1 where the customer bought the product at least
    once and 0 elsewhere.
  """
  customer_codes = data["Kunden-ID"].astype("category").cat.codes.values
  product_codes = data["Produkt-ID"].astype("category").cat.codes.values

  CPM = np.zeros((data["Kunden-ID"].nunique(), data["Produkt-ID"].nunique()))

  # Missing IDs are encoded as -1; used as an index that would silently mark
  # the last customer/product, so such rows are skipped.
  known = (customer_codes >= 0) & (product_codes >= 0)

  # One vectorised assignment instead of a Python loop over all order lines.
  CPM[customer_codes[known], product_codes[known]] = 1

  return CPM

In [0]:
def create_PPM(items, print_progress=False):
  """Build the product-product cosine-similarity matrix (PPM).

  Every product is described by the mean of its rows in ``items`` after
    * dropping "Produktname", "Gewinn" and "Rabatt",
    * replacing "Umsatz"/"Menge" by a standardised unit price "Preis"
      (z-score over all rows, population standard deviation).
  All remaining columns (e.g. the one-hot category columns) are used as
  features, whatever their names are.

  Args:
    items: item table containing at least "Produkt-ID_cat", "Produktname",
      "Gewinn", "Rabatt", "Umsatz" and "Menge"; all other columns must be
      numeric.
    print_progress: kept for backward compatibility. The matrix is now
      computed in a single vectorised step, so only one completion message
      is printed when this is True.

  Returns:
    Symmetric (n_products, n_products) array of cosine similarities, rows
    and columns ordered by ascending "Produkt-ID_cat". The diagonal is 0 so
    that a product is never its own nearest neighbour. Products whose
    feature vector is all zeros get similarity 0 to everything.
  """
  unit_price = items["Umsatz"].values / items["Menge"].values

  features = items.drop(columns=["Produktname", "Gewinn", "Rabatt", "Umsatz", "Menge"])
  features["Preis"] = (unit_price - unit_price.mean()) / unit_price.std()

  # Mean feature vector per product. This replaces "sum, then divide by the
  # row count derived from three hard-coded category columns".
  per_product = features.groupby("Produkt-ID_cat").mean().to_numpy(dtype=float)

  norms = np.linalg.norm(per_product, axis=1)
  norms[norms == 0] = 1.0  # avoid 0/0; such rows stay all-zero
  unit_vectors = per_product / norms[:, np.newaxis]

  # Full symmetric matrix: previously only the upper triangle was filled, so
  # row-wise lookups ignored every product with a smaller index.
  PPM = unit_vectors @ unit_vectors.T
  np.fill_diagonal(PPM, 0.0)

  if print_progress:
    print(100.0, "% done")

  return PPM

In [0]:
def ppm_recommendations(PPM):
  """Return, for every product, the index of its most similar product.

  Args:
    PPM: square product-product similarity matrix.

  Returns:
    1-D integer array; entry ``k`` is the column index of the largest value
    in row ``k`` of ``PPM``.
  """
  # The bare name `argmax` is not defined anywhere; numpy's is meant.
  return np.argmax(PPM, axis=1)

In [0]:
def get_item_prior(epsilon=1e-3):
  """Smoothed probability that an item is bought by an arbitrary user.

  Reads the notebook globals ``CPM`` and ``num_users``.

  Args:
    epsilon: additive smoothing term.

  Returns:
    1-D array with one value per column of ``CPM``:
    ``(column sum + epsilon) / (num_users + 2 * epsilon)``.
  """
  buyers_per_item = CPM.sum(axis=0)
  return (buyers_per_item + epsilon) / (num_users + 2 * epsilon)

def get_user_prior(epsilon=1e-3):
  """Smoothed probability that a user buys an arbitrary item.

  Reads the notebook globals ``CPM`` and ``num_items``.

  Args:
    epsilon: additive smoothing term.

  Returns:
    1-D array with one value per row of ``CPM``:
    ``(row sum + epsilon) / (num_items + 2 * epsilon)``.
  """
  purchases_per_user = CPM.sum(axis=1)
  return (purchases_per_user + epsilon) / (num_items + 2 * epsilon)

## Create CPM and set constants

In [0]:
# Derive the three feature tables (items, customers, transactions) from df.
items, customers, transactions = get_separated_entities(df)

In [0]:
# Customer-product matrix built from the raw order data.
CPM = create_CPM(data)
# PPM = create_PPM(items)

# Matrix dimensions: rows are users (customers), columns are items (products).
num_users, num_items = CPM.shape

# Smoothed priors; the helpers read the globals defined just above.
item_prior = get_item_prior()
user_prior = get_user_prior()

In [0]:
def user_based(u, i, epsilon=1e-3):
  """User-based score for user ``u`` and item ``i``.

  For every item ``j`` the smoothed fraction ``c[j]`` of buyers of item
  ``i`` whose CPM entry for ``j`` equals that of user ``u`` is computed.
  The score is ``(item_prior[i] * prod(c)) ** (1 / (1 + num_items))``.

  The product is evaluated in log space: multiplying one factor per item
  directly underflows to 0 once enough factors are small, which would
  collapse the scores of heavy buyers to exactly 0.

  Reads the notebook globals ``CPM``, ``item_prior`` and ``num_items``.

  Args:
    u: row index of the user in ``CPM``.
    i: column index of the item in ``CPM``.
    epsilon: additive smoothing term.

  Returns:
    The score as a float.
  """
  bought_i = CPM[:, i]

  # agrees[v, j] == 1 where user v has the same CPM entry as user u for item j
  # (broadcast comparison, no tiled copies of the matrix needed).
  agrees = (CPM == CPM[u]).astype(CPM.dtype)

  # matches[j] = sum over users v of CPM[v, i] * agrees[v, j]
  matches = bought_i @ agrees
  c = (matches + epsilon) / (bought_i.sum() + 2 * epsilon)

  # log(0) = -inf is fine here (only possible with epsilon=0): exp() maps it
  # back to a score of 0, matching the direct product.
  with np.errstate(divide="ignore"):
    log_score = np.log(item_prior[i]) + np.log(c).sum()

  return np.exp(log_score / (1 + num_items))

In [0]:
def item_based(u, i, epsilon=1e-3):
  """Item-based score for user ``u`` and item ``i``.

  For every user ``v`` the smoothed fraction ``c[v]`` of the items bought
  by ``u`` for which ``v``'s CPM entry equals ``v``'s entry for item ``i``
  is computed. The score is
  ``(user_prior[u] * prod(c)) ** (1 / (1 + num_users))``.

  The product is evaluated in log space: multiplying one factor per user
  directly underflows to 0 once enough factors are small.

  Reads the notebook globals ``CPM``, ``user_prior`` and ``num_users``.

  Args:
    u: row index of the user in ``CPM``.
    i: column index of the item in ``CPM``.
    epsilon: additive smoothing term.

  Returns:
    The score as a float.
  """
  bought_by_u = CPM[u]

  # agrees[v, j] == 1 where user v has the same CPM entry for item j as for
  # item i (column i is broadcast across all columns, no tiled copies).
  agrees = (CPM == CPM[:, [i]]).astype(CPM.dtype)

  # matches[v] = sum over items j of CPM[u, j] * agrees[v, j]
  matches = agrees @ bought_by_u
  c = (matches + epsilon) / (bought_by_u.sum() + 2 * epsilon)

  # log(0) = -inf is fine here (only possible with epsilon=0): exp() maps it
  # back to a score of 0, matching the direct product.
  with np.errstate(divide="ignore"):
    log_score = np.log(user_prior[u]) + np.log(c).sum()

  return np.exp(log_score / (1 + num_users))

In [0]:
def hybrid_score(u, i):
  """Hybrid score for user ``u`` and item ``i``.

  Returns the product of ``user_based(u, i)`` and ``item_based(u, i)``.
  """
  from_users = user_based(u, i)
  from_items = item_based(u, i)
  return from_users * from_items

In [0]:
def recommend(u=None, i=None, print_progress=False):
  """Compute hybrid scores for one user over all items, or one item over all users.

  Reads the notebook globals ``num_items`` / ``num_users`` and calls
  ``hybrid_score`` once per candidate.

  Args:
    u: user index. If given, every item is scored for this user (``i`` is
      then ignored).
    i: item index. Used only when ``u`` is None; every user is scored for
      this item.
    print_progress: if True, print the percentage done every 100 candidates.

  Returns:
    List of scores, indexed by item (when ``u`` is given) or by user (when
    only ``i`` is given). Empty list if neither is given.
  """
  def report(done, total):
    # Progress line after every 100th candidate, starting with the first.
    if done % 100 == 0 and print_progress:
      print(round(done / total * 100, 2), "%")

  scores = []

  if u is not None:
    print("Iterating Items")
    for item in range(num_items):
      scores.append(hybrid_score(u, item))
      report(item, num_items)
    return scores

  if i is not None:
    print("Iterating Users")
    for user in range(num_users):
      scores.append(hybrid_score(user, i))
      report(user, num_users)

  return scores

In [153]:
# Time one full recommendation pass: score every item for user 137.
start = time.time()
foo = recommend(u=137)
end = time.time()
print(end - start)  # elapsed wall-clock time in seconds

Iterating Items
48.64574384689331


In [154]:
# Highest hybrid score among all items scored for user 137 above.
max(foo)

0.9544013739567365