In [1]:
import glob
import os
import re
import warnings

import pandas as pd

In [2]:
# Define input folder for distribution master files
path = r"D:\footwear_retail_chain_project\0. input_data\distribution\raw_data"

# Locate the yearly master files. glob returns matches in arbitrary,
# filesystem-dependent order, so the list is sorted to keep the row order of
# the combined frame (and of anything exported from it) reproducible.
master_files = sorted(glob.glob(os.path.join(path, "distribution_master_*.xlsx")))
if not master_files:
    # Fail early with a clear message; pd.concat on an empty list would only
    # report "No objects to concatenate".
    raise FileNotFoundError(
        f"No distribution_master_*.xlsx files found in {path!r}"
    )

# Collect and combine all distribution master files across years
all_dfs = []
for file in master_files:
    # Extract year from filename (e.g., distribution_master_2021.xlsx → "2021").
    # The year is kept as a string; it is None when the name has no 4-digit run.
    year_match = re.search(r"(\d{4})", os.path.basename(file))
    year = year_match.group(1) if year_match else None

    # Read data from sheet "1.Data dữ liệu" (skipping first row for headers)
    df_temp = pd.read_excel(file, sheet_name="1.Data dữ liệu", skiprows=1)

    # Add column indicating which year this file applies to
    df_temp["applied_year"] = year

    # Collect dataframe
    all_dfs.append(df_temp)

# Concatenate all years into one DataFrame with a fresh 0..n-1 index
df = pd.concat(all_dfs, ignore_index=True)

# Quick check of final dataframe structure
df.info()

  warn(msg)
  warn(msg)
  warn(msg)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13275 entries, 0 to 13274
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Custmer code    13275 non-null  object
 1   Site store      13275 non-null  int64 
 2   B2B/B2C         13275 non-null  object
 3   Channel         13275 non-null  object
 4   Region          13275 non-null  object
 5   City Level      13275 non-null  object
 6   Store Concept   13275 non-null  object
 7   Store Level     1534 non-null   object
 8   Trade term      13275 non-null  object
 9   Area group      1692 non-null   object
 10  Store type      13275 non-null  object
 11  Urbanization    13275 non-null  object
 12  Customer Name   13274 non-null  object
 13  Branch          13275 non-null  object
 14  Address 1       13231 non-null  object
 15  Address 2       13270 non-null  object
 16  Address 3       13270 non-null  object
 17  Show room area  1713 non-null   object
 18  Wareho

  warn(msg)


In [3]:
import pandas as pd

# Columns that are removed outright (address lines, open/close month fields, notes)
columns_to_drop = [
    "Address 1",
    "Address 2",
    "Address 3",
    "Tháng mở",
    "Năm mở",
    "Tháng đóng",
    "Note",
]

# Source header → standardized snake_case column name
column_renames = {
    "Custmer code": "customer_code",
    "Site store": "store_code",
    "B2B/B2C": "b2b_b2c",
    "Channel": "channel",
    "Region": "region",
    "City Level": "city_level",
    "Store Concept": "store_concept",
    "Store Level": "store_level",
    "Trade term": "trade_term",
    "Area group": "area_group",
    "Store type": "store_type",
    "Urbanization": "urbanization",
    "Customer Name": "customer_name",
    "Branch": "branch",
    "Show room area": "showroom_area",
    "Warehouse area": "warehouse_area",
    "Năm đóng": "close_year",
}

# 1) discard auto-generated "Unnamed..." columns, 2) drop the unused columns,
# 3) apply the standardized names
df = df.loc[:, ~df.columns.str.contains("^Unnamed")]
df = df.drop(columns=columns_to_drop)
df = df.rename(columns=column_renames)

# Define value mappings for standardization.
# Each entry is: standardized column name -> {raw source value: standardized value}.
# Lookups are by exact value, so spelling/typography variants of the same raw
# label (straight vs. curly apostrophe, missing diacritics, different
# capitalisation, trailing space) are listed as separate keys.
mappings = {
    # Binary flag encoded as integers
    "b2b_b2c": {"B2B": 0, "B2C": 1},
    "channel": {
        "CHTT": "retail_store", "ONLINE": "online",
        "TGPP": "distribution_partner", "ST": "supermarket"
    },
    "region": {
        "KVMN": "southern_area", "KVMB": "northern_area",
        "KVMT": "western_area", "KVMTR": "central_area",
        "KVTN": "highland_area", "KVCA": "cambodia_area",
        "Trung Quốc": "china_area", "Khác": "other"
    },
    "city_level": {
        "Cấp 1": "level_1", "Cấp 2": "level_2",
        "Cấp TW": "central_level", "Còn lại": "other",
        "Khác": "other", "": "other"
    },
    "store_concept": {
        "Biti's": "regular_store", "Biti’s": "regular_store",
        "Biti’s Hunter": "specialized_product_store",
        "Biti’s Premium": "premium_store", "BITI'S WEB": "online_store",
        "CHC": "standard_store", "CHCD": "specialized_business_store",
        "DL": "agency", "Outlet": "outlet", "Siêu thị": "supermarket",
        "TMDT": "ecommerce_platform", "Khác": "other", "": "other"
    },
    # Placeholder tokens ("Closed", "NA", "NA ") are folded into "other"
    "store_level": {
        "TIÊU BIỂU": "best_store", "A+": "A+", "A": "A", "B": "B", "C": "C",
        "N": "new_store", "Closed": "other", "NA": "other", "NA ": "other"
    },
    "trade_term": {
        "Drop-ship": "drop_ship",
        "Mua đứt bán đoạn": "buy_and_sell",
        "Ký gửi": "consignment"
    },
    "area_group": {
        "<= 50 m2": "<=50m2", "50 m2 < và <= 100 m2": "50-100m2",
        "100 m2 < và <= 150 m2": "100-150m2", "150 m2 < và <= 200 m2": "150-200m2",
        "> 200 m2": ">200m2", "Closed": "other", "NA": "other", "NA ": "other"
    },
    "store_type": {
        "Siêu thị": "supermarket", "Độc lâp": "independent_store",
        "Độc lập": "independent_store", "Doc lap": "independent_store",
        "Store in Store": "store_in_store", "Store In Store": "store_in_store",
        "Online": "online", "Trong mall": "shopping_mall",
        "Pop - up": "pop_up", "POP-UP": "pop_up",
        "Khác": "other", "": "other"
    },
    "urbanization": {
        "TT hành chính kinh tế": "downtown", "Nông thôn": "rural",
        "Nội thành": "urban", "Khác": "other", "": "other"
    },
    "branch": {
        "CNMB": "northern_branch", "CNMTR-TN": "central_highland_branch",
        "CNMN": "southern_branch", "CNMT": "western_branch",
        "CNTP": "pioneer_branch", "CNLC": "lao_cai_branch",
        "CNCA": "cambodia_branch"
    },
    # Area columns: placeholder tokens become 0. The trailing-space variant
    # "NA " is included so these columns recognise the same placeholders as
    # store_level / area_group above.
    # NOTE(review): both area columns were still object dtype after cleaning in
    # the recorded run, so other non-numeric tokens may remain — confirm
    # against the raw files.
    "showroom_area": {"Closed": 0, "NA": 0, "NA ": 0},
    "warehouse_area": {"Closed": 0, "NA": 0, "NA ": 0},
}

# Apply value mappings
for col, mapping in mappings.items():
    if all(isinstance(v, (int, float)) for v in mapping.values()):
        # Numeric replacements (0, 1, etc.): swap the listed tokens and keep
        # every other value as it is. A keep-unmapped lookup is used instead of
        # Series.replace, which emitted a pandas warning on this line in the
        # recorded run; the dtype is then inferred explicitly so a fully
        # numeric result (e.g. b2b_b2c) does not stay object-typed.
        df[col] = (
            df[col]
            .map(lambda value, lookup=mapping: lookup.get(value, value))
            .infer_objects()
        )
    else:
        # String/category replacements → use map. map() turns every value that
        # is not a key of the mapping into NaN, so report such values instead
        # of losing them silently (the result itself is unchanged).
        unmapped = df.loc[
            df[col].notna() & ~df[col].isin(list(mapping)), col
        ].unique()
        if len(unmapped) > 0:
            warnings.warn(
                f"Column {col!r}: values without a mapping become NaN: "
                f"{sorted(map(str, unmapped))}"
            )
        df[col] = df[col].map(mapping)

# Per-column defaults for cells that are still missing after the mapping step
fill_defaults = dict(
    showroom_area=0,
    warehouse_area=0,
    area_group="other",
    store_level="other",
)
df = df.fillna(value=fill_defaults)

# Active store flag: 1 when "close_year" is empty (store still active),
# 0 when a closing year is recorded. int64 is requested explicitly so the
# dtype does not depend on the platform's default integer width.
df["active_store"] = df["close_year"].isna().astype("int64")

# "close_year" is now fully captured by the flag, so it can go
df = df.drop(columns="close_year")

# Final check of dataframe structure
df.info()
display(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13275 entries, 0 to 13274
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   customer_code   13275 non-null  object
 1   store_code      13275 non-null  int64 
 2   b2b_b2c         13275 non-null  int64 
 3   channel         13275 non-null  object
 4   region          13275 non-null  object
 5   city_level      13275 non-null  object
 6   store_concept   13275 non-null  object
 7   store_level     13275 non-null  object
 8   trade_term      13275 non-null  object
 9   area_group      13275 non-null  object
 10  store_type      13275 non-null  object
 11  urbanization    13275 non-null  object
 12  customer_name   13274 non-null  object
 13  branch          13275 non-null  object
 14  showroom_area   13275 non-null  object
 15  warehouse_area  13275 non-null  object
 16  applied_year    13275 non-null  object
 17  active_store    13275 non-null  int64 
dtypes: int

  df[col] = df[col].replace(mapping)


Unnamed: 0,customer_code,store_code,b2b_b2c,channel,region,city_level,store_concept,store_level,trade_term,area_group,store_type,urbanization,customer_name,branch,showroom_area,warehouse_area,applied_year,active_store
0,60000003,60000003,0,supermarket,southern_area,central_level,supermarket,other,buy_and_sell,other,supermarket,urban,BIGC AN LẠC,southern_branch,0,0,2022,1
1,60000006,60000006,0,supermarket,southern_area,level_2,supermarket,other,buy_and_sell,other,supermarket,urban,BIGC ĐỒNG NAI,southern_branch,0,0,2022,1
2,60000007,60000007,0,supermarket,southern_area,central_level,supermarket,other,buy_and_sell,other,supermarket,urban,BIGC MIỀN ĐÔNG,southern_branch,0,0,2022,1
3,60000008,60000008,0,supermarket,southern_area,central_level,supermarket,other,buy_and_sell,other,supermarket,urban,BIGC GÒ VẤP,southern_branch,0,0,2022,1
4,60000534,60000534,0,supermarket,southern_area,central_level,supermarket,other,buy_and_sell,other,supermarket,urban,CO.OP NGUYỄN KIỆM,southern_branch,0,0,2022,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13270,60004078,60004078,0,distribution_partner,southern_area,central_level,other,other,buy_and_sell,other,independent_store,urban,CTY CP DV VĂN PHÒNG PHƯƠNG ĐÔNG,southern_branch,0,0,2025,0
13271,20000240,20000240,0,online,cambodia_area,level_1,online_store,other,drop_ship,other,online,downtown,BÁN HÀNG ONLINE CAMBO,cambodia_branch,0,0,2025,0
13272,60004079,60004077,0,distribution_partner,southern_area,central_level,other,other,buy_and_sell,other,independent_store,urban,Yaaqob Saleh USA,southern_branch,0,0,2025,0
13273,60004080,60004078,0,distribution_partner,southern_area,central_level,other,other,buy_and_sell,other,independent_store,urban,CTY CHỊ THẢO NGUYỄN,southern_branch,0,0,2025,0


In [None]:
# Export cleaned df to CSV
output_path = r"D:\footwear_retail_chain_project\0. input_data\distribution\processed_data"
output_file = os.path.join(output_path, "distribution_master.csv")

# Make sure the target folder exists (no error if it is already there),
# then write the frame without the index column
os.makedirs(output_path, exist_ok=True)
df.to_csv(path_or_buf=output_file, index=False)