In [1]:
import pandas as pd
import numpy as np
import re
import csv


In [2]:
# Load the source workbook; every later cell derives its tables from `data`
source_workbook = './data/filtered_30_filled_money.xlsx'
data = pd.read_excel(source_workbook)

In [3]:
def remove_restricted_text(text):
    """Remove every literal "(사용금지)" marker from ``text`` and trim the result.

    Args:
        text: The cell value to clean. Normally a string; any other value
            (for example NaN or None) is accepted as well.

    Returns:
        The string with all "(사용금지)" occurrences removed and leading/trailing
        whitespace stripped. Non-string values are returned unchanged, because
        ``re.sub`` raises ``TypeError`` when given a non-string subject.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r'\(사용금지\)', '', text).strip()

# Strip the "(사용금지)" marker from the item-name and supplier columns
for column_name in ('청구품목', '발주처'):
    data[column_name] = data[column_name].apply(remove_restricted_text)


# Preview the first rows of the two cleaned columns
print(data[['청구품목', '발주처']].head())

                                                청구품목  \
0                         GE POWER PACK FORK - E7(B)   
1                         GE POWER PACK FORK - E7(B)   
2  SAMSON SUPER STRONG DOUBLE BRAID ROPE 1 3/4", ...   
3        WIRE ROPE G)6X(S)19 A3 CMP SLPP 28MM X 400M   
4        WIRE ROPE G)6X(S)19 A3 CMP SLPP 25MM X 400M   

                         발주처  
0  MATSUI(U.S.A) COROPRATION  
1  MATSUI(U.S.A) COROPRATION  
2                        KTI  
3                    대광기업(주)  
4                    대광기업(주)  


In [4]:
# One row per distinct 'Machinery' value, with a 1-based id taken from the rebuilt index
machinery_df = (
    data[['Machinery']]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(machinery_id=lambda frame: frame.index + 1)
)


In [5]:
# Build the assembly table: distinct (Machinery, Assembly) pairs joined to their machinery_id
assembly_df = (
    data[['Machinery', 'Assembly']]
    .drop_duplicates()
    .reset_index(drop=True)
    .merge(machinery_df, on='Machinery')
)
# 1-based id taken from the row position after the merge
assembly_df['assembly_id'] = assembly_df.index + 1
assembly_df = assembly_df.rename(columns={'Assembly': 'assembly_name'})


In [6]:
# List every row whose assembly_name occurs more than once (keep=False flags all occurrences).
# assembly_df was built from distinct (Machinery, Assembly) pairs, so a repeated name here
# is the same assembly name appearing under different machinery.
is_repeated_name = assembly_df.duplicated(subset=['assembly_name'], keep=False)
duplicate_assemblies = assembly_df[is_repeated_name]
print(f"Duplicate Assemblies:\n{duplicate_assemblies}")


Duplicate Assemblies:
                              Machinery                       assembly_name  \
3                      CORKLINE STACKER                         TUBING ASSY   
4                 NO.1 GENERATOR ENGINE                      OVERHAUL PARTS   
7                 NO.1 GENERATOR ENGINE                          GASKET KIT   
11                NO.1 GENERATOR ENGINE  4N5111 PUMP GP-FUEL INJECTION PUMP   
16                NO.1 GENERATOR ENGINE   7N3036 CYLINDER HEAD GP-SIDE VIEW   
..                                  ...                                 ...   
397          MAIN BOOM PORT VANG WINCH                           WINCH ASSY   
398          MAIN BOOM PORT VANG WINCH                          COMMON PART   
399                         POWER BLOCK                         COMMON PART   
400                         MAIN ENGINE                        TURBOCHARGER   
401  NAVI DECK FWD NET BOAT DAVIT WINCH                          WINCH ASSY   

     machinery_id  assembly_i

In [7]:
# Keep only the first row for each assembly_name so that names are unique in assembly_df.
# assembly_id was assigned before this step, so the surviving ids are not renumbered.
# NOTE(review): de-duplicating on the name alone also drops the machinery link of same-named
# assemblies under other machinery (the duplicate listing printed above shows e.g. 'COMMON PART'
# under more than one machinery). Later cells match claim items by assembly name only, so those
# items all map to the first machinery — confirm this is intended.
assembly_df.drop_duplicates(subset=['assembly_name'], inplace=True)


In [8]:
# Count the distinct assembly names that remain and report the number
remaining_names = assembly_df['assembly_name']
unique_assembly_count = remaining_names.nunique()
print(f"Unique Assembly Count (After removing duplicates): {unique_assembly_count}")


Unique Assembly Count (After removing duplicates): 209


In [9]:
# Start the claim-item table from the item-level columns of `data`, with a fresh 0-based index
claim_item_columns = ['Assembly', '청구품목', 'Part No.1', '견적화폐', '견적단가', '발주처']
claim_items_df = data[claim_item_columns].reset_index(drop=True)


In [10]:
# Attach assembly_id to each claim row by matching its 'Assembly' value to assembly_name.
# how='left' keeps every claim row; assembly_id is NaN where no assembly name matches.
# validate='many_to_one' makes pandas raise MergeError if assembly_name is not unique in
# assembly_df (for example when the de-duplication cell was skipped), instead of silently
# duplicating claim rows.
claim_items_df = claim_items_df.merge(
    assembly_df[['assembly_name', 'assembly_id']],
    left_on='Assembly',
    right_on='assembly_name',
    how='left',
    validate='many_to_one',
)


In [11]:
# Map the source column headers to the output column names of the claim-item table
claim_column_names = {
    '청구품목': 'item_name',
    'Part No.1': 'part_no',
    '견적화폐': 'currency',
    '견적단가': 'price',
    '발주처': 'supplier',
}
claim_items_df = claim_items_df.rename(columns=claim_column_names)


In [12]:
# Collapse claim rows that agree on every item attribute and on the assembly they belong to.
# The surviving rows keep their original index labels; the index is not rebuilt here.
claim_identity_columns = ['item_name', 'part_no', 'currency', 'price', 'supplier', 'assembly_id']
claim_items_df = claim_items_df.drop_duplicates(subset=claim_identity_columns)



In [13]:
# Number the de-duplicated claim items 1..N.
# The preceding drop_duplicates keeps the original row labels, so the index is rebuilt first;
# otherwise item_id would inherit gaps wherever duplicate rows were removed, unlike
# machinery_id and supplier_id, which are assigned from a freshly reset index.
claim_items_df = claim_items_df.reset_index(drop=True)
claim_items_df['item_id'] = claim_items_df.index + 1


In [14]:
# Write the three tables as CSV files into the current working directory (no index column)
machinery_out = machinery_df[['machinery_id', 'Machinery']].rename(columns={'Machinery': 'machinery_name'})
machinery_out.to_csv('machinery.csv', index=False, quoting=csv.QUOTE_MINIMAL)

assembly_out = assembly_df[['assembly_id', 'assembly_name', 'machinery_id']]
assembly_out.to_csv('assembly.csv', index=False, quoting=csv.QUOTE_MINIMAL)

claim_items_out = claim_items_df[['item_id', 'item_name', 'part_no', 'currency', 'price', 'supplier', 'assembly_id']]
claim_items_out.to_csv('claim_items.csv', index=False, quoting=csv.QUOTE_MINIMAL)


In [15]:
# Report how many distinct ids ended up in the assembly and claim-item tables
final_assembly_count = assembly_df['assembly_id'].nunique()
final_claim_item_count = claim_items_df['item_id'].nunique()
print(f"Final Unique Assembly Count: {final_assembly_count}")
print(f"Final Unique Claim Items Count: {final_claim_item_count}")

Final Unique Assembly Count: 209
Final Unique Claim Items Count: 9312


In [16]:
# Build the supplier table: one row per distinct '발주처' value, with a fresh 0-based index
distinct_suppliers = data[['발주처']].drop_duplicates()
supplier_df = distinct_suppliers.reset_index(drop=True)


In [17]:
# Give each supplier a 1-based id derived from its row position
supplier_df = supplier_df.assign(supplier_id=supplier_df.index + 1)


In [18]:
# Strip the "(사용금지)" marker from the supplier names.
# data['발주처'] was already cleaned in an earlier cell, so this changes nothing when the
# notebook is run top to bottom.
cleaned_supplier_names = supplier_df['발주처'].apply(remove_restricted_text)
supplier_df['발주처'] = cleaned_supplier_names


In [19]:
# Use the output column name for the supplier text
supplier_df = supplier_df.rename(columns={'발주처': 'supplier_name'})

# Write the supplier table to suppliers.csv in the current working directory (no index column)
supplier_df.to_csv('suppliers.csv', index=False, quoting=csv.QUOTE_MINIMAL)


In [20]:
# Preview the first rows of the supplier table
supplier_preview = supplier_df.head()
print(supplier_preview)

               supplier_name  supplier_id
0  MATSUI(U.S.A) COROPRATION            1
1                        KTI            2
2                    대광기업(주)            3
3                 K.TH MARCO            4
4   HAEIN Coporation_Cheonan            5


In [10]:
# Reload the claim-item table from disk, replacing the in-memory claim_items_df.
# NOTE(review): the export cell writes 'claim_items.csv' to the working directory, while this
# reads from './category_csv/'. This cell's execution counter also restarts at 10. Confirm the
# file is moved there by hand and that the cell still works after Restart & Run All.
claim_items_csv = './category_csv/claim_items.csv'
claim_items_df = pd.read_csv(claim_items_csv)

In [11]:
# Select the claim items whose assembly_id is missing and print them (empty output means none)
lacks_assembly_id = claim_items_df['assembly_id'].isna()
missing_assembly_id_rows = claim_items_df[lacks_assembly_id]

print(missing_assembly_id_rows)

Empty DataFrame
Columns: [item_id, item_name, part_no, currency, price, supplier, assembly_id]
Index: []
