In [13]:
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pandas.tseries.offsets import DateOffset

In [14]:
def _read_table(csv_path):
    """Read one raw CSV export into a DataFrame with pandas' default options."""
    return pd.read_csv(csv_path)


# Tables that carry an `order_id` column.
df_orders = _read_table("data/orders.csv")
df_order_items = _read_table("data/order_items.csv")
df_pymts = _read_table("data/order_pymts.csv")
df_reviews = _read_table("data/order_reviews.csv")

# Customer, seller and product tables.
df_customers = _read_table("data/customers.csv")
df_sellers = _read_table("data/sellers.csv")
df_products = _read_table("data/products.csv")

# Lookup tables (city coordinates and the translation file).
df_geoloc = _read_table("data/geoloc.csv")
df_translation = _read_table("data/translation.csv")

In [15]:
import pandas as pd
import numpy as np

# Build one customer-level feature table (Recency, Monetary, Frequency, mean
# review score, mean customer-seller distance) per monthly snapshot date and
# write each table to `data_temporel/customer_infos_<year>-<month>.csv`.

# --- 0. Attach the purchase timestamp to customers and reviews ---
df_orders["order_purchase_timestamp"] = pd.to_datetime(df_orders["order_purchase_timestamp"])

# Guarded so that re-running this cell does not merge the column a second time
# (a second merge would create `_x` / `_y` suffixed columns and break the
# date filters below).
if "order_purchase_timestamp" not in df_customers.columns:
    df_customers = df_customers.merge(
        df_orders[["order_purchase_timestamp", "customer_id"]], on="customer_id", how="left"
    )

if "order_purchase_timestamp" not in df_reviews.columns:
    df_reviews = df_reviews.merge(
        df_orders[["order_purchase_timestamp", "order_id"]], on="order_id", how="left"
    )

# --- Snapshot-independent geography (computed once, outside the loop) ---
# One (lat, lng) pair per city: the first one listed for that city.
df_lat_lng = (
    df_geoloc.groupby("geolocation_city", as_index=False)
    [["geolocation_lat", "geolocation_lng"]].first()
)

# Sellers with a known city position.
df_sellers_pos = (
    df_sellers.rename(columns={"seller_city": "geolocation_city"})
    .merge(df_lat_lng, on="geolocation_city", how="left")
    [["seller_id", "geolocation_lat", "geolocation_lng"]]
    .dropna()
)

# Mean seller position per order (an order may have items from several sellers).
df_items_pos = (
    df_order_items
    .merge(df_sellers_pos, on="seller_id", how="left")
    [["order_id", "geolocation_lat", "geolocation_lng"]]
    .groupby("order_id", as_index=False).mean()
)

df_orders_pos = (
    df_orders[["order_id", "customer_id"]]
    .merge(df_items_pos, on="order_id", how="left")
)

# Customer position next to the seller position of the matching order;
# seller-side coordinates get the `_order` suffix.
df_customer_distance = (
    df_customers.rename(columns={"customer_city": "geolocation_city"})
    .merge(df_lat_lng, on="geolocation_city", how="left")
    [[
        "customer_id",
        "customer_unique_id",
        "order_purchase_timestamp",
        "geolocation_lat",
        "geolocation_lng",
    ]]
    .merge(df_orders_pos, on="customer_id", how="left", suffixes=("", "_order"))
)

# NOTE: this is a Euclidean distance on raw latitude/longitude values, so it
# is expressed in degrees, not in kilometres.
df_customer_distance["distance"] = np.sqrt(
    (df_customer_distance["geolocation_lat"] - df_customer_distance["geolocation_lat_order"])**2 +
    (df_customer_distance["geolocation_lng"] - df_customer_distance["geolocation_lng_order"])**2
)

# First snapshot is taken 12 months after the first purchase, then monthly.
date = df_orders["order_purchase_timestamp"].min() + DateOffset(months=12)
max_date = df_orders["order_purchase_timestamp"].max()

# Make sure the output folder exists before the first `to_csv` call.
os.makedirs("data_temporel", exist_ok=True)

while date < max_date:
    print(date)

    # Orders placed strictly before the snapshot date.
    orders_before = df_orders.loc[df_orders["order_purchase_timestamp"] < date]

    # --- 1. Monetary: total payment value per unique customer ---
    df_monetary = (
        orders_before
        .merge(df_pymts[["order_id", "payment_value"]], on="order_id", how="left")
        .merge(df_customers[["customer_id", "customer_unique_id"]], on="customer_id", how="left")
        .groupby("customer_unique_id", as_index=False)["payment_value"].sum()
        .rename(columns={"payment_value": "Monetary"})
    )

    # --- 2. Recency: days since the customer's last purchase ---
    df_recency = (
        orders_before
        .merge(df_customers[["customer_id", "customer_unique_id"]], on="customer_id", how="left")
        .groupby("customer_unique_id", as_index=False)["order_purchase_timestamp"].max()
        .rename(columns={"order_purchase_timestamp": "Recency"})
    )
    # The reference point is the most recent purchase inside the window,
    # not the snapshot date itself.
    ref_date = df_recency["Recency"].max()
    df_recency["Recency"] = (ref_date - df_recency["Recency"]).dt.days

    # --- 3. Frequency: number of customer_id rows per unique customer ---
    df_frequency = (
        df_customers.loc[df_customers["order_purchase_timestamp"] < date]
        .groupby("customer_unique_id", as_index=False)["customer_id"].count()
        .rename(columns={"customer_id": "Frequency"})
    )

    # --- 4. Review score: mean per order first, then mean per unique customer ---
    df_review_score = (
        df_reviews[["order_id", "review_score"]].loc[df_reviews["order_purchase_timestamp"] < date]
        .groupby("order_id", as_index=False).mean()
        .merge(df_orders[["order_id", "customer_id"]], on="order_id", how="left")
        .merge(df_customers[["customer_id", "customer_unique_id"]], on="customer_id", how="left")
        .groupby("customer_unique_id", as_index=False)["review_score"].mean()
        .rename(columns={"review_score": "mean_review_score"})
    )

    # --- 5. Mean customer-seller distance per unique customer ---
    df_customers_pos = (
        df_customer_distance.loc[df_customer_distance["order_purchase_timestamp"] < date]
        .groupby("customer_unique_id", as_index=False)["distance"].mean()
        .rename(columns={"distance": "mean_distance"})
    )

    # --- 6. Final assembly ---
    df_rfm = (
        df_recency
        .merge(df_monetary, on="customer_unique_id", how="left")
        .merge(df_frequency, on="customer_unique_id", how="left")
        .merge(df_review_score, on="customer_unique_id", how="left")
        .merge(df_customers[["customer_unique_id", "customer_state"]], on="customer_unique_id", how="left")
        .merge(df_customers_pos, on="customer_unique_id", how="left")
        .drop_duplicates()
    )

    # Final version. De-duplicate again once `customer_state` is gone:
    # a unique customer linked to several states would otherwise be exported
    # as several identical rows.
    df_customer_infos = (
        df_rfm.drop(columns=["customer_state"])
        .drop_duplicates()
        .dropna()
    )
    print(df_customer_infos.shape)
    df_customer_infos.to_csv(f"data_temporel/customer_infos_{date.year}-{date.month}.csv", index=False)

    # Move on to the next monthly snapshot.
    date += DateOffset(months=1)


2017-09-04 21:15:19
(22270, 6)
2017-10-04 21:15:19
(26360, 6)
2017-11-04 21:15:19
(30536, 6)
2017-12-04 21:15:19
(38081, 6)
2018-01-04 21:15:19
(43159, 6)
2018-02-04 21:15:19
(50084, 6)
2018-03-04 21:15:19
(56541, 6)
2018-04-04 21:15:19
(63377, 6)
2018-05-04 21:15:19
(70059, 6)
2018-06-04 21:15:19
(76204, 6)
2018-07-04 21:15:19
(82133, 6)
2018-08-04 21:15:19
(88377, 6)
2018-09-04 21:15:19
(93391, 6)
2018-10-04 21:15:19
(93391, 6)


In [19]:
# Reload the snapshot file named for 2018-10 from the output folder.
last_snapshot_csv = "data_temporel/customer_infos_2018-10.csv"
last_df = pd.read_csv(last_snapshot_csv)

In [24]:
# Display the in-memory frame left by the last iteration of the snapshot loop
# (assuming the cells were run in order).
df_customer_infos

Unnamed: 0,customer_unique_id,Recency,Monetary,Frequency,mean_review_score,mean_distance
0,0000366f3b9a7992bf8c76cfdf3221e2,146,141.90,1,5.0,1.021571
1,0000b849f77a49e4a4ce2b2a4ca5be3f,149,27.19,1,4.0,0.137285
2,0000f46a3911fa3c0805444483337064,571,86.22,1,3.0,7.730583
3,0000f6ccb0745a6a4b88665a16c9f078,355,43.62,1,4.0,17.270880
4,0004aac84e0df4da2b147fca70cf8255,322,196.89,1,5.0,1.437898
...,...,...,...,...,...,...
99435,fffcf5a5ff07b0908bd4e2dbc735a684,481,2067.42,1,5.0,18.224090
99436,fffea47cd6d3cc0a88bd621562a9d061,296,84.58,1,4.0,13.952046
99437,ffff371b4d645b6ecea244b27531430a,603,112.46,1,5.0,14.089330
99438,ffff5962728ec6157033ef9805bacc48,154,133.69,1,5.0,6.143806


In [25]:
# Display the snapshot reloaded from CSV — presumably to compare it with the
# in-memory `df_customer_infos` shown in the previous cell.
last_df

Unnamed: 0,customer_unique_id,Recency,Monetary,Frequency,mean_review_score,mean_distance
0,0000366f3b9a7992bf8c76cfdf3221e2,146,141.90,1,5.0,1.021571
1,0000b849f77a49e4a4ce2b2a4ca5be3f,149,27.19,1,4.0,0.137285
2,0000f46a3911fa3c0805444483337064,571,86.22,1,3.0,7.730583
3,0000f6ccb0745a6a4b88665a16c9f078,355,43.62,1,4.0,17.270880
4,0004aac84e0df4da2b147fca70cf8255,322,196.89,1,5.0,1.437898
...,...,...,...,...,...,...
93386,fffcf5a5ff07b0908bd4e2dbc735a684,481,2067.42,1,5.0,18.224090
93387,fffea47cd6d3cc0a88bd621562a9d061,296,84.58,1,4.0,13.952046
93388,ffff371b4d645b6ecea244b27531430a,603,112.46,1,5.0,14.089330
93389,ffff5962728ec6157033ef9805bacc48,154,133.69,1,5.0,6.143806
