In [54]:
import csv

# Removes all none ASCII characters and replaces them with spaces
with open('pickHistoryOrderNo.csv', 'r', encoding='utf-8', errors='ignore') as file_in, open('output_file.csv', 'w', newline='', encoding='utf-8') as file_out:
    reader = csv.reader(file_in)
    writer = csv.writer(file_out)

    # Write the header row to the output file
    header = next(reader)
    writer.writerow(header)

    # Iterate over the rows in the input file and clean each row before writing it to the output file
    for row in reader:
        cleaned_row = []
        for cell in row:
            # Replace any non-unicode characters with spaces
            cleaned_cell = ''.join([char if ord(char) < 128 else ' ' for char in cell])
            cleaned_row.append(cleaned_cell)
        writer.writerow(cleaned_row)


In [55]:
# Remove whitespace and tabs

import csv

with open('output_file.csv', 'r', encoding='utf-8', errors='ignore') as file_in, open('leanPicks.csv', 'w', newline='', encoding='utf-8') as file_out:
    reader = csv.reader(file_in, delimiter='\t')
    writer = csv.writer(file_out)

    # Write the header row to the output file
    header = next(reader)
    header = [col.replace(' ', '').strip() for col in header]  # Remove whitespace and tab in column names
    writer.writerow(header)

    # Iterate over the rows in the input file, remove any whitespace or tabs, and write to the output file
    for row in reader:
        cleaned_row = [cell.replace(' ', '').strip() for cell in row]
        writer.writerow(cleaned_row)


In [56]:
# List refactored column headers

import pandas as pd

df = pd.read_csv("output_file.csv", on_bad_lines="skip")
df.columns.values.tolist()

['ISELL_ORDER_NUMBER\tPICK_ID\tARTNO   \tARTNAME_UNICODE                                   \tART_VOLUME_M3\tORDERED_QTY\tPICKED_QTY\tOPEN_PICK_QTY\tAVAILABLE_STOCK\tPOSSIBLE_TO_FINISH\tORDER_TYPE \tDATE_OF_PAYMENT\tTIME_OF_PAYMENT\tEXCEPTION\tPICK_AREA          \tACTUAL_ORDER_STATUS\tSTORAGE_STATUS\tSTORAGE_USED\tHANDOVER_POINT\tCUT_OFF_DATE\tCUT_OFF_TIME\tUSER_PICKING\tSERVICE_DATE\tSERVICE_WINDOW\tORDER_METHOD\tPICK_LOCATION\tPICK_LOCATION_TYPE\tDELIVERY_METHOD                  ']

In [57]:
# Create a new csv that contains only the columns we are interested in

# Read the CSV file into a DataFrame and specify a custom data type for the 'ARTNO' column
df = pd.read_csv('leanPicks.csv', usecols=['ISELL_ORDER_NUMBER', 'PICK_ID', 'ARTNAME_UNICODE', 'ORDER_TYPE', 'PICK_AREA', 'ARTNO'], dtype={'ARTNO': str})

# Write the selected columns to a new CSV file
df.to_csv('output_file.csv', index=False)


In [58]:
# Create a new csv that has all orders and the articles attributed to that order

import pandas as pd

# Read csv file
df = pd.read_csv('output_file.csv', dtype={'ARTNO': str})

# Convert ARTNO to string
df['ARTNO'] = df['ARTNO'].astype(str)

# Group by order number and concatenate article numbers
orders = df.groupby('ISELL_ORDER_NUMBER')['ARTNO'].apply(lambda x: ','.join(x)).reset_index()

# Rename columns
orders.columns = ['order_number', 'articles']

# Write to new csv file
orders.to_csv('orders.csv', index=False)



In [59]:
# Strip quotation marks
import csv


with open('orders.csv', 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    with open('strippedOrders.csv', 'w', newline='') as newfile:
        csvwriter = csv.writer(newfile)

        for row in csvreader:
            newrow = [cell.replace('"', '') for cell in row]
            csvwriter.writerow(newrow)


In [63]:
# Returns all articles that are included in an order. 

x = 1345368239

df = pd.read_csv('strippedOrders.csv', dtype={'ARTNO': str})

filtered_df = df[df['order_number'] == x]

articles = filtered_df.iloc[0]['articles'].split(",")

print(articles)

['20214566', '30256891', '20246708', '70277957', '70214564', '40256895', '70346932', '30246722']


In [64]:
# List of all unique articles

# Split the values in the 'articles' column based on the comma separator, and create a new row for each value
df['articles'] = df['articles'].str.split(',')
df = df.explode('articles')

# Get a list of unique values in the 'articles' column
unique_articles = df['articles'].unique().tolist()

# Write the list of unique articles to a new CSV file
pd.DataFrame({'articles': unique_articles}).to_csv('unique_articles.csv', index=False)

In [69]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('unique_articles.csv')

# Preprocess the 'ARTNO' column to add leading zeroes to the column values
df['articles'] = df['articles'].apply(lambda x: f'{x:0>8}')

# Iterate through each row of the DataFrame and each column of the row, and print the value of each cell
for index, row in df.iterrows():
    for column, value in row.items():
        filtered_df = df[df['articles'].str.contains(value, na=False, case=False)]
        print(filtered_df[['articles']])


70518513
10242739
00527158
60532208
20399895
80251513
70532062
60509142
60185034
60503569
20214566
30256891
20246708
70277957
70214564
40256895
70346932
30246722
70277981
50277977
40180805
10304355
60272837
80214568
30277959
30214504
10442385
20214571
90277961
30263245
20257315
10301125
10446883
10529086
60342656
60529084
70529088
10446557
00214505
10368489
20329195
70214559
70246386
80331930
90256893
70256931
10357004
00246417
10246308
10257306
50214560
50330606
70246683
80410055
30529467
00103343
70403606
60442383
20481692
90246352
00257298
80470030
40295439
80221451
60263244
30382347
10458183
60211274
60246396
60431718
00497000
90027972
80474764
50242723
40459992
70290360
40318978
10277984
20201724
40201723
60473015
70246348
80256940
00452544
20331198
70509132
20510939
90509145
10509125
10263232
40257296
10205614
50356187
00228705
10486732
40480856
20423602
40233535
30195548
60400321
00431151
00275674
00295530
10244743
10423886
20245959
20485572
30291658
30341111
40263848
50263838
5

In [78]:

x = str(20214566)

# Read the CSV file into a DataFrame
df = pd.read_csv('orders.csv')

# Filter the DataFrame to only include rows where the 'ARTNO' column contains the specified value
filtered_df = df[df['articles'].str.contains(x, na=False, case=False)]

# Select the 'ARTNO' column from the filtered DataFrame and return it
artno_column = filtered_df[['articles']]

# Print the 'ARTNO' column containing the specified value(s)
print(artno_column.values)


[['20214566,30256891,20246708,70277957,70214564,40256895,70346932,30246722']
 ['10446883,10529086,60342656,60529084,70529088,10446557,00214505,10368489,20214566,20329195,30256891,30277959,70214559,70246386,70277957,70346932,80331930,90256893']
 ['00103343,70403606,00214505,20214566,30256891,60442383,70277957']
 ['20214566,30256891,70277957,90246352,30263245,00257298,00214505,60442383']
 ['20214566,70214559,70277957,30256891,90246352']
 ['00214505,00257298,20214566,30256891,30263245,30277959,70214559,70277957,80331930,90256893,10333065,20484817,50308196,50518477,50518910,60533057,70256931,70534645,90429771']
 ['10357004,00214505,20214566,30256891,60442383,70277957']
 ['00347510,00214505,20214566,40256942,70214559,80256940,90510945']
 ['70256931,80214549,20214566,30256891,30277959,40211977,40256895,40277992,50257253,70214559,70214564,70277957,70277995,70346932,80257261,80509481,90256893,90277961,90404822']
 ['20320802,20519888,50335618,80534372,00214505,10368489,20207410,20214566,2024636