In [None]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split

## Load datasets

In [5]:
# Order-level tables: the listed timestamp columns are parsed as datetimes on
# load so later cells can do date arithmetic on them directly.
orders = pd.read_csv(
    '../data/olist_orders_dataset.csv',
    parse_dates=[
        'order_purchase_timestamp',
        'order_approved_at',
        'order_delivered_carrier_date',
        'order_delivered_customer_date',
        'order_estimated_delivery_date',
    ],
)
reviews = pd.read_csv(
    '../data/olist_order_reviews_dataset.csv',
    parse_dates=['review_creation_date', 'review_answer_timestamp'],
)
items = pd.read_csv('../data/olist_order_items_dataset.csv')
payments = pd.read_csv('../data/olist_order_payments_dataset.csv')

# Lookup tables, read with pandas' default dtype inference.
customers = pd.read_csv('../data/olist_customers_dataset.csv')
products = pd.read_csv('../data/olist_products_dataset.csv')
sellers = pd.read_csv('../data/olist_sellers_dataset.csv')
categories = pd.read_csv('../data/product_category_name_translation.csv')

## Join datasets

In [None]:
# Combine every table into one wide frame, starting from the order items.
# All joins are inner joins, so a row survives only if it finds a match in
# every table along the chain.
# NOTE(review): if an order has several item rows and several payment rows, the
# first join yields every item/payment combination for that order — confirm
# this row multiplication is intended before training on `merged`.
merged = (
    items
    .merge(payments, on='order_id', how='inner')
    .merge(orders, on='order_id', how='inner')
    .merge(reviews, on='order_id', how='inner')
    .merge(customers, on='customer_id', how='inner')
    .merge(products, on='product_id', how='inner')
    .merge(sellers, on='seller_id', how='inner')
    .merge(categories, on='product_category_name', how='inner')
)


In [None]:
# Elapsed time from purchase to customer delivery, kept as whole days.
# Rows where either timestamp is missing end up with a missing value here.
purchase_to_delivery = (
    merged['order_delivered_customer_date'] - merged['order_purchase_timestamp']
)
merged['delivery_time_days'] = purchase_to_delivery.dt.days

## Train Random Forest Model

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): X and y are not defined in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel (Restart & Run All). Presumably
# y is merged['delivery_time_days'] and X is a selection of columns from
# `merged` — confirm, and restore the cell that builds them.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a random forest regressor with default hyperparameters and a fixed seed.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out rows with mean absolute error.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

# Persist the fitted model. The target directory is created first so that a
# checkout without ../models does not fail only after training has finished.
model_path = '../models/delivery_time_model.joblib'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)