In [None]:
""" 
1- Read data into a dataframe (mma)
2- Separate data into train/test (80/20) (mma_train , mma_test)
3- Calculate frequency in train data | test data of each product over all orders (mma_train_freq | mma_test_freq)
4- Create a solution table with columns (ProductID|SubstituteID|PFrequency|DepartmentID|AisleID|Selected?)
5- Populate the solution table ProductID column with a {SELECT DISTINCT ProductID} over original data
6- Add information of product frequency on orders per ProductID to solution table (PFrequency), by joining with mma_train_freq
7- Add DepartmentID|AisleID to solution table by joining with main table
8- Create SubstituteIDs for each ProductID
    >> For each AisleID|DepartmentID group:
        1)Select all ProductIDs | ProductNames under Aisle|Department
        2)Do a similarity analysis, and try to cluster similar product names
        3)On each cluster, compare frequency of product in orders, and select the most frequent product as SubstituteID
            Left Join with frequency table, and compare
        4)For outliers (products that are not similar to any other product), copy ProductID as SubstituteID
        5)Append ProductIDs|SubstituteIDs to Substitute table
9- Add SubstituteIDs to solution table by joining with Substitute table
10- Prepare objective function as a method
11- Prepare constraints as a method
12- Configure optimization with Objective, Constraints and Solution Table
13- Run optimization
14- Export solution table as csv
15- Test solution table with test data
16- Calculate score against metrics
"""

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import sklearn as sk
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string
import math
import random
import itertools
import time
import pulp
import os
import warnings
warnings.filterwarnings('ignore')


### 1- Read data into a dataframe (mma)

In [6]:
# Read the raw data into the main dataframe (path is relative to the
# notebook's working directory) and preview the first five rows.
mma = pd.read_csv(filepath_or_buffer='data/mma_mart.csv')
mma.head(n=5)

Unnamed: 0,order_id,product_id,product_name,aisle_id,aisle,department_id,department
0,1,49302,Bulgarian Yogurt,120,yogurt,16,dairy eggs
1,1,11109,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,other creams cheeses,16,dairy eggs
2,1,10246,Organic Celery Hearts,83,fresh vegetables,4,produce
3,1,49683,Cucumber Kirby,83,fresh vegetables,4,produce
4,1,43633,Lightly Smoked Sardines in Olive Oil,95,canned meat seafood,15,canned goods


In [8]:
# Count the distinct non-null values in every column of the dataframe
# (the order_id row gives the number of distinct orders).
mma.nunique(axis=0, dropna=True)

order_id         97833
product_id       35070
product_name     35070
aisle_id           134
aisle              134
department_id       21
department          21
dtype: int64

### 2- Separate data into Train/Test

In [10]:
# Split on order_id so that every row of a given order lands in the same
# partition: ids up to and including 80000 form the training set, the rest
# are held out for testing.
# NOTE(review): the plan calls for an 80/20 split — confirm this hard-coded
# cutoff actually yields that ratio for the current data.
mma_train = mma.loc[mma['order_id'].le(80000)]
mma_test = mma.loc[mma['order_id'].gt(80000)]

### 3- Calculate frequency in train data | test data of each product over all orders (mma_train_freq | mma_test_freq)

In [12]:
# Count frequency of products in train and test data
# Number of rows per product_id in each partition, most frequent first.
# NOTE(review): this counts rows, so a product listed twice in one order
# would be counted twice — confirm that is the intended "frequency".
f_counts_train = mma_train.loc[:, 'product_id'].value_counts(sort=True, ascending=False)
f_counts_test = mma_test.loc[:, 'product_id'].value_counts(sort=True, ascending=False)

In [13]:
# Turn each frequency Series into a two-column dataframe: product_id | frequency

## Train
f_counts_train_df = f_counts_train.rename_axis('product_id').reset_index(name='frequency')
## Test
f_counts_test_df = f_counts_test.rename_axis('product_id').reset_index(name='frequency')

In [None]:
# Final per-product frequency tables, ordered from most to least frequent.
mma_train_freq = f_counts_train_df.sort_values('frequency', ascending=False)
mma_test_freq = f_counts_test_df.sort_values('frequency', ascending=False)