In [24]:
import pandas as pd
import os
import time

def ensure_file_available(file_path):
    if not os.path.exists(file_path):
        print("File is not available. Please check your file in one drive")
        while not os.path.exists(file_path):
            time.sleep(5)
    print("File is available. Proceeding the processes")

def convert_excel_date(df, columns):
    for column_name in columns:
        if pd.api.types.is_numeric_dtype(df[column_name]):
            df[column_name] = pd.to_datetime('1899-12-30') + pd.to_timedelta(df[column_name], unit='D')
        else:
            df[column_name] = pd.to_datetime(df[column_name], errors='coerce')

def clean_PO_HBA_file():
    # Define source and save path
    source_file = r'C:\Users\Thanawit C\OneDrive - Sahamit Product Co.,Ltd\Data for Stock Report\COPY_PO HBA.xlsb'
    save_path = r'D:\Data for Stock Report\cleaned_PO_pending_HBA.xlsx'

    ensure_file_available(source_file)

    # Define the column names based on the actual headers in the Excel file
    column_mapping = {
        'A': 'CJ_Article', 
        'B': 'SHM_Article',
        'D': 'SHM PO Date',
        'E': 'SHM PO NO.',
        'G': 'CJ PO NO.',
        'J': 'Product Name',
        'L': 'Supplier',
        'M': 'Sold to',
        'N': 'สถานที่จัดส่งสินค้า',
        'O': 'หน่วยบรรจุ (ชิ้น/ลัง)',
        'U': 'จำนวนเปิด PO สหมิตร (ลัง)',
        'AI' : 'วันที่จัดส่งสินค้า',
        'AM': 'สถานะการจัดส่งสินค้า'
    }

    try:
        # Load the Excel file with specified sheet name and header row
        df = pd.read_excel(source_file, sheet_name='Sale In 2024',header=1)

        # Trim extra space in header names
        df.columns = df.columns.str.strip()
        df.rename(columns=column_mapping, inplace=True)

        # Cast Column product SKU to be a Text
        df['CJ_Article'] = df['CJ_Article'].astype(str)
        df['SHM_Article'] = df['SHM_Article'].astype(str)
        
        # Convert data to datet time 
        convert_excel_date(df, ['วันที่จัดส่งสินค้า', 'SHM PO Date'])

        # Filter the rows based on conditions
        filtered_data = df[(df['CJ_Article'].notnull()) & 
                           (df['Sold to'] == 'CJ') & 
                           (df['สถานะการจัดส่งสินค้า'] == 'PENDING')]

        # Select the columns you need after filtering
        selected_columns = filtered_data[['CJ_Article',
                                           'SHM_Article',
                                           'SHM PO Date',
                                           'SHM PO NO.',
                                           'CJ PO NO.',
                                           'Product Name',
                                           'Supplier',
                                           'Sold to',
                                           'สถานที่จัดส่งสินค้า',
                                           'หน่วยบรรจุ (ชิ้น/ลัง)',
                                           'จำนวนเปิด PO สหมิตร (ลัง)',
                                           'วันที่จัดส่งสินค้า',
                                           'สถานะการจัดส่งสินค้า']]

        # Add new columns using .loc to avoid SettingWithCopyWarning
        selected_columns.loc[:, 'DC'] = selected_columns['สถานที่จัดส่งสินค้า'].map({
            'D001': 'DC1',
            'D002': 'DC2',
            'D004': 'DC4'
        })
        selected_columns.loc[:, 'จำนวนเปิด PO สหมิตร (ชิ้น)'] = selected_columns['จำนวนเปิด PO สหมิตร (ลัง)'] * selected_columns['หน่วยบรรจุ (ชิ้น/ลัง)']


        # Pivot data
        pivoted_df = selected_columns.pivot_table(
            index=['CJ_Article', 'SHM_Article'],
            columns=['DC'],
            values='จำนวนเปิด PO สหมิตร (ชิ้น)',
            aggfunc='sum',
            fill_value=0
        ).reset_index()

        pivoted_df.columns = ['CJ_Item', 'SHM_Item'] + [f'PO_Qty_to_{col}' for col in pivoted_df.columns[2:]]

        pivoted_min_del_date = selected_columns.pivot_table(
            index=['CJ_Article', 'SHM_Article'],
            columns='DC',
            values='วันที่จัดส่งสินค้า',
            aggfunc='min'
        ).reset_index()

        # Rename column in pivot
        pivoted_min_del_date.columns = ['CJ_Item', 'SHM_Item'] + [f'Min_del_date_to_{col}' for col in pivoted_min_del_date.columns[2:]]

        # Merge 2 pivot tables for Min Del date
        merged_df = pd.merge(pivoted_df, pivoted_min_del_date, on=['CJ_Item', 'SHM_Item'], how='left')

        year_filter = merged_df.filter(like='Min_del_date_to_').apply(lambda x: x.dt.year >= 2024)
        merged_df = merged_df[year_filter.any(axis=1)]

        print(f"Checking data type of each column: {merged_df.dtypes}")

        # Save the cleaned data as an .xlsx file
        with pd.ExcelWriter(save_path, mode='w') as writer:
            merged_df.to_excel(writer, sheet_name='Pivot HBA', index=False)
            selected_columns.to_excel(writer, sheet_name='cleaned data', index=False)
        print(f"Cleaned data saved to {save_path}")

    except Exception as e:
        print(f"Error processing the Excel file: {e}")

# Call the function if the script is run directly
if __name__ == "__main__":
    clean_PO_HBA_file()


File is available. Proceeding the processes
Checking data type of each column: CJ_Item                        object
SHM_Item                       object
PO_Qty_to_DC1                  object
PO_Qty_to_DC2                  object
PO_Qty_to_DC4                  object
Min_del_date_to_DC1    datetime64[ns]
Min_del_date_to_DC2    datetime64[ns]
Min_del_date_to_DC4    datetime64[ns]
dtype: object
Cleaned data saved to D:\Data for Stock Report\cleaned_PO_pending_HBA.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_columns.loc[:, 'DC'] = selected_columns['สถานที่จัดส่งสินค้า'].map({
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_columns.loc[:, 'จำนวนเปิด PO สหมิตร (ชิ้น)'] = selected_columns['จำนวนเปิด PO สหมิตร (ลัง)'] * selected_columns['หน่วยบรรจุ (ชิ้น/ลัง)']
