In [1]:
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path so that
# the `src` package imported by the following cells can be resolved.
sys.path.append(str(Path().resolve().parent)) # add the repository root directory to sys.path

# 1. Data load

In [2]:
from src.data_loading import load_data, merge_data

# load_data() returns three objects, unpacked here as the transactions,
# customers and articles tables.
transactions_df, customers_df, articles_df = load_data()
# Combine the transactions with the article data into the working frame `df`.
# customers_df is not passed to the merge and stays a separate table.
# NOTE(review): the join key/type is defined in src.data_loading -- presumably
# article_id; confirm there.
df = merge_data(transactions_df, articles_df)


Data path: data\raw


In [3]:
from src.utils import describe_columns
# Per-column overview of the merged frame; the recorded output lists total,
# unique and missing counts, the dtype and the most/least frequent value.
describe_columns(df)

Unnamed: 0,Total,Uniques,Missing,DataType,MostFrequent,LeastFrequent
t_dat,31788324,734,0,object,2019-09-28,2020-01-01
customer_id,31788324,1362281,0,object,be1981ab818cf4ef6765b2ecaea7a2cbf14ccd6e8a7ee9...,e74dcc41a6ecb3d69478b05b50fb5f91d4ebd2b5a9ba89...
article_id,31788324,104547,0,int64,706016001,873278006
price,31788324,9857,0,float64,0.016932,0.25061
sales_channel_id,31788324,2,0,int64,2,1
product_code,31788324,46834,0,int64,706016,942035
prod_name,31788324,45508,0,object,Jade HW Skinny Denim TRS,Delila WL boot
product_type_no,31788324,131,0,int64,272,483
product_type_name,31788324,130,0,object,Trousers,Clothing mist
product_group_name,31788324,19,0,object,Garment Upper body,Fun


In [4]:
# NOTE(review): describe_columns is already imported by an earlier cell; this
# re-import is harmless but redundant.
from src.utils import describe_columns
# Same per-column overview, this time for the customers table.
describe_columns(customers_df)

Unnamed: 0,Total,Uniques,Missing,DataType,MostFrequent,LeastFrequent
customer_id,1371980,1371980,0,object,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...
FN,476930,1,895050,float64,1.0,1.0
Active,464404,1,907576,float64,1.0,1.0
club_member_status,1365918,3,6062,object,ACTIVE,LEFT CLUB
fashion_news_frequency,1355969,3,16011,object,NONE,Monthly
age,1356119,84,15861,float64,21.0,94.0
postal_code,1371980,352899,0,object,2c29ae653a9282cce4151bd87643c907644e09541abc28...,980c99b13848ea53eaf6a70cbc07d7db4f64ed1739023b...


# 2. Preprocessing

In [5]:
from src.preprocessing import delete_columns

# Shapes are printed before and after so the effect of the removal is visible.
print(df.shape, customers_df.shape)
# The return value is ignored, so this cell relies on delete_columns modifying
# the frames in place (recorded output: df goes from 29 to 18 columns, while
# customers_df stays at 7).
# NOTE(review): which columns are dropped is decided inside src.preprocessing;
# confirm that re-running this cell on the already-trimmed frames is safe.
delete_columns(df, customers_df)
print(df.shape, customers_df.shape)


(31788324, 29) (1371980, 7)
(31788324, 18) (1371980, 7)


In [6]:
from src.preprocessing import optimize_dtypes

# Rebind both frames to whatever optimize_dtypes returns. According to the
# recorded log lines, memory drops from 33299.69 MB to 3138.11 MB for df and
# from 470.60 MB to 252.36 MB for customers_df.
# NOTE(review): the concrete target dtypes are chosen inside src.preprocessing.
df = optimize_dtypes(df)
customers_df = optimize_dtypes(customers_df)

[optimize_dtypes] Memory reduced from 33299.69 MB to 3138.11 MB
[optimize_dtypes] Memory reduced from 470.60 MB to 252.36 MB


In [7]:
from src.preprocessing import filter_customers_with_min_purchase_days

# Shapes are printed before and after to show how much data the filter removes
# (recorded output: customers 1371980 -> 617323, transaction rows
# 31788324 -> 27740711).
print(df.shape, customers_df.shape)
# Both tables are rebound to the filtered versions returned by the helper.
# NOTE(review): judging by the function name, min_days=4 presumably means
# "purchases on at least 4 distinct days" -- confirm the exact rule (>= vs >).
df, customers_df = filter_customers_with_min_purchase_days(df, customers_df, min_days=4) # active customers only
print(df.shape, customers_df.shape)

(31788324, 18) (1371980, 7)
(27740711, 18) (617323, 7)


# 3. Feature Engineering

In [8]:
from src.feature_engineering import generate_customer_features

# Build the feature table from the filtered transactions and customers.
# The summaries recorded further down show 617323 rows, each with a unique
# customer_id, i.e. one row per remaining customer.
# NOTE(review): the individual features are defined in src.feature_engineering.
final_df = generate_customer_features(df, customers_df)


In [9]:
# Report the in-memory footprint and shape of the feature table.
# deep=True makes pandas measure the actual contents of object-like columns
# instead of only the size of the references to them.
final_usage = final_df.memory_usage(deep=True).sum()
print(f"Memory usage  {final_usage / 1_048_576:.2f} MB")  # 1_048_576 bytes = 1 MiB
print(final_df.shape)

Memory usage  116.03 MB
(617323, 46)


In [10]:
# NOTE(review): describe_columns is already imported by an earlier cell; this
# re-import is harmless but redundant.
from src.utils import describe_columns
# Per-column overview of the final customer feature table.
describe_columns(final_df)

Unnamed: 0,Total,Uniques,Missing,DataType,MostFrequent,LeastFrequent
customer_id,617323,617323,0,string[python],00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...
FN,260849,1,356474,category,1.0,1.0
Active,256542,1,360781,category,1.0,1.0
club_member_status,616387,3,936,category,ACTIVE,LEFT CLUB
fashion_news_frequency,613946,3,3377,category,NONE,Monthly
age,614056,84,3267,float16,24.0,97.0
num_baskets,617323,226,0,int16,4,230
total_spent,617323,182825,0,float32,177.847458,3612.72876
total_items,617323,744,0,int16,12,956
mean_price,617323,316536,0,float32,25.40678,17.531826


# 4. Data save

In [11]:
from pathlib import Path

# Output location: <parent of the working directory>/data/processed.
# NOTE(review): this assumes the notebook is started from a direct
# subdirectory of the repository root -- confirm for other launch setups.
ROOT_DIR = Path.cwd().parents[0]
DATA_DIR = ROOT_DIR.joinpath("data", "processed")
# Create the folder (and any missing parents); no error if it already exists.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Write the feature table without the index column.
# NOTE(review): CSV stores no dtype information, so the compact dtypes of
# final_df have to be re-applied by whoever reads this file back.
output_path = DATA_DIR.joinpath("final_df.csv")
final_df.to_csv(output_path, index=False)
