# Feature Engineering

## Import Libraries and Files

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Folder holding the raw Instacart CSVs, resolved relative to the kernel's working
# directory instead of a user-specific absolute path.
# Assumes the notebook is run from the project's `notebooks/` folder, so the data
# sits one level up in `data/` -- change this single constant if that is not the case.
DATA_DIR = Path("..") / "data"

# Lookup tables
aisle = pd.read_csv(DATA_DIR / "aisles.csv")
department = pd.read_csv(DATA_DIR / "departments.csv")
product = pd.read_csv(DATA_DIR / "products.csv")

# Order headers (one row per order) and order lines for the prior / train splits
order = pd.read_csv(DATA_DIR / "orders.csv")
prior_order = pd.read_csv(DATA_DIR / "order_products__prior.csv")
train_order = pd.read_csv(DATA_DIR / "order_products__train.csv")

## Problem Definition

Build a model to predict a customer's willingness to buy a certain product
- Unit: each customer x product history (e.g. whether user 1 buys product A again)
- Target: `reordered == 1` means the user bought the product again; `reordered == 0` means the user did not buy the product again

## Transaction History for Each Product

In [3]:
# Per-product purchase statistics over the prior order lines:
#   count        -> number of (non-null) `reordered` entries for the product
#   sum          -> total of the `reordered` flag for the product
#   reorder_rate -> sum / count
prod_orders = (
    prior_order
    .groupby('product_id')['reordered']
    .agg(['count', 'sum'])
    .reset_index()
    .assign(reorder_rate=lambda stats: stats['sum'] / stats['count'])
)

## Customer Behavioural Pattern

In [6]:
# One row per user (indexed by user_id): the highest order_number seen for that
# user, stored as `total_orders`.
user_features = (
    order
    .groupby('user_id')['order_number']
    .max()
    .to_frame(name='total_orders')
)

## Customer x Product Preparation

In [8]:
# Order-header columns carried onto every prior order line.
order_columns = ['order_id', 'user_id', 'order_number', 'order_dow', 'order_hour_of_day']

# Product catalogue enriched with its aisle and department rows.
products_merged = pd.merge(
    pd.merge(product, aisle, on='aisle_id'),
    department,
    on='department_id',
)

# Prior order lines joined with the product catalogue ...
order_products = pd.merge(prior_order, products_merged, on='product_id')

# ... and then with the order header (user and timing information).
# All joins use pandas' default inner join, so unmatched rows are dropped.
order_data = pd.merge(order_products, order[order_columns], on='order_id')

In [9]:
# Purchase history of every (user, product) pair in the prior orders:
#   up_order_count   -> number of order lines in which the user bought the product
#   up_reorder_count -> total of the `reordered` flag over those lines
#   reorder_rate     -> up_reorder_count / up_order_count
user_product = (
    order_data
    .groupby(['user_id', 'product_id'])
    .agg(
        up_order_count=('order_id', 'count'),
        up_reorder_count=('reordered', 'sum'),
    )
    .reset_index()
    .assign(reorder_rate=lambda pairs: pairs['up_reorder_count'] / pairs['up_order_count'])
)

## Dataset for Training

In [12]:
# Orders flagged eval_set == 'train' are used as each customer's last order,
# i.e. the basket whose contents the model should predict.
last_orders = order.loc[order['eval_set'] == 'train', ['user_id', 'order_id']]

# Pair every train user with their prior-order history. Both frames carry an
# `order_id` column, so the suffixes are spelled out (same names pandas would
# pick by default):
#   order_id_x -> the user's train order
#   order_id_y -> the prior order the row comes from
train_data = pd.merge(last_orders, order_data, on='user_id', suffixes=('_x', '_y'))

# `order_data` is built from prior orders only, so the `reordered` column carried
# into train_data describes history, not the outcome of the train order.
# Attach the actual label: 1 when the product appears in the user's train order,
# 0 otherwise. Existing rows and columns are left untouched; only
# `train_reordered` is added.
# Assumes train_order has `order_id` and `product_id` columns like prior_order
# -- TODO confirm.
# NOTE(review): rows are still one per prior order line, so a (user, product) pair
# can repeat; de-duplicate before modelling if one row per pair is required.
train_labels = (
    train_order[['order_id', 'product_id']]
    .drop_duplicates()  # unique keys -> the left merge below cannot add rows
    .rename(columns={'order_id': 'order_id_x'})
    .assign(train_reordered=1)
)
train_data = train_data.merge(train_labels, on=['order_id_x', 'product_id'], how='left')
train_data['train_reordered'] = train_data['train_reordered'].fillna(0).astype('int8')

## Combine the features

In [13]:
# Enrich the training rows with the three feature tables. Left joins keep every
# row of train_data even when a feature table has no match.
# Both prod_orders and user_product carry a `reorder_rate` column, so the last
# merge emits them with pandas' default suffixes:
#   reorder_rate_x -> product-level rate (from prod_orders)
#   reorder_rate_y -> user x product rate (from user_product)
train_df = (
    train_data
    .merge(prod_orders, how='left', on='product_id')
    .merge(user_features, how='left', on='user_id')
    .merge(user_product, how='left', on=['user_id', 'product_id'])
)

In [15]:
# Persist the assembled training table without the DataFrame index.
# The path is relative to the kernel's working directory rather than a
# user-specific absolute path; assumes the notebook is run from the project's
# `notebooks/` folder, which is where the file was written before.
TRAIN_DF_PATH = "train_df.csv"
train_df.to_csv(TRAIN_DF_PATH, index=False)