In [2]:
import csv

# Replace every non-ASCII character (code point >= 128) with a space.
# Bytes that are not valid UTF-8 are silently dropped on read (errors='ignore')
# before the replacement happens.
# Both files are opened with newline='' as the csv module asks for, and the
# header row goes through the same cleaning as the data rows, so a BOM or an
# accented character cannot survive inside a column name. Treating every row
# alike also means an empty input file yields an empty output file instead of
# raising StopIteration.
with open('pickHistoryOrderNo.csv', 'r', newline='', encoding='utf-8', errors='ignore') as file_in, \
        open('output_file.csv', 'w', newline='', encoding='utf-8') as file_out:
    reader = csv.reader(file_in)
    writer = csv.writer(file_out)

    # NOTE(review): the reader uses the default comma delimiter, while the next
    # cell reads output_file.csv with delimiter='\t' -- confirm which delimiter
    # the raw export really uses.
    for row in reader:
        writer.writerow(
            [''.join(char if ord(char) < 128 else ' ' for char in cell) for cell in row]
        )


In [3]:
# Remove whitespace and tabs

import csv

with open('output_file.csv', 'r', encoding='utf-8', errors='ignore') as tsv_source, \
        open('leanPicks.csv', 'w', newline='', encoding='utf-8') as csv_target:
    tab_rows = csv.reader(tsv_source, delimiter='\t')
    comma_writer = csv.writer(csv_target)

    # The first row holds the column names. Every space character is removed
    # (not only leading/trailing ones) and the remaining edge whitespace trimmed.
    header = [name.replace(' ', '').strip() for name in next(tab_rows)]
    comma_writer.writerow(header)

    # Same treatment for each remaining row, written out as comma-separated values.
    comma_writer.writerows(
        [field.replace(' ', '').strip() for field in fields] for fields in tab_rows
    )


In [4]:
# List refactored column headers

import pandas as pd

# NOTE(review): read_csv uses its default comma separator here, and the recorded
# output is a single tab-joined column name, so the file looks tab-delimited at
# this point -- confirm whether leanPicks.csv (or sep='\t') was intended.
df = pd.read_csv("output_file.csv", on_bad_lines="skip")
list(df.columns)

['ISELL_ORDER_NUMBER\tPICK_ID\tARTNO   \tARTNAME_UNICODE                                   \tART_VOLUME_M3\tORDERED_QTY\tPICKED_QTY\tOPEN_PICK_QTY\tAVAILABLE_STOCK\tPOSSIBLE_TO_FINISH\tORDER_TYPE \tDATE_OF_PAYMENT\tTIME_OF_PAYMENT\tEXCEPTION\tPICK_AREA          \tACTUAL_ORDER_STATUS\tSTORAGE_STATUS\tSTORAGE_USED\tHANDOVER_POINT\tCUT_OFF_DATE\tCUT_OFF_TIME\tUSER_PICKING\tSERVICE_DATE\tSERVICE_WINDOW\tORDER_METHOD\tPICK_LOCATION\tPICK_LOCATION_TYPE\tDELIVERY_METHOD                  ']

In [5]:
# Create a new csv that contains only the columns we are interested in

# Load only the columns used further down. ARTNO is forced to str so pandas
# does not infer a numeric dtype for it.
columns_of_interest = ['ISELL_ORDER_NUMBER', 'PICK_ID', 'ARTNAME_UNICODE', 'ORDER_TYPE', 'PICK_AREA', 'ARTNO']
df = pd.read_csv('leanPicks.csv', dtype={'ARTNO': str}, usecols=columns_of_interest)

# NOTE(review): this overwrites output_file.csv, the same file an earlier cell
# reads as its input -- re-running that cell afterwards sees different data.
df.to_csv('output_file.csv', index=False)


In [6]:
# Create a new csv that has all orders and the articles attributed to that order

import pandas as pd

# Read the trimmed pick history; ARTNO stays text so leading zeros survive.
df = pd.read_csv('output_file.csv', dtype={'ARTNO': str})

# Rows without an article number are dropped. Casting them with astype(str)
# would turn the missing value into the literal text 'nan', which would then
# be joined into the order's article list as if it were a real article.
df = df.dropna(subset=['ARTNO'])

# One row per order number, with that order's article numbers joined by commas.
orders = df.groupby('ISELL_ORDER_NUMBER')['ARTNO'].agg(','.join).reset_index()

# Rename columns
orders.columns = ['order_number', 'articles']

# Write to new csv file
orders.to_csv('orders.csv', index=False)



In [7]:
# Strip quotation marks
import csv


# Copy orders.csv to strippedOrders.csv, deleting any double-quote character
# found inside a parsed cell.
# NOTE(review): csv.reader already removes the quoting around a field, and
# csv.writer (default QUOTE_MINIMAL) puts quotes back around any field that
# contains a comma, so the multi-article rows are still quoted in the output --
# confirm this cell has the intended effect.
with open('orders.csv', 'r') as csvfile, open('strippedOrders.csv', 'w', newline='') as newfile:
    csvwriter = csv.writer(newfile)
    csvwriter.writerows(
        [cell.replace('"', '') for cell in row] for row in csv.reader(csvfile)
    )


In [8]:
# Returns all articles that are included in an order. 

# Order number to look up
x = 1345368239

# This file is addressed below through the columns 'order_number' and
# 'articles'; it has no 'ARTNO' column, so the dtype override must name
# 'articles' to have any effect. Keeping it as text preserves leading zeros
# and guarantees .split() below receives a string.
df = pd.read_csv('strippedOrders.csv', dtype={'articles': str})

filtered_df = df[df['order_number'] == x]

# An order number that is not in the file gives an empty list rather than an
# IndexError from iloc[0].
articles = [] if filtered_df.empty else filtered_df.iloc[0]['articles'].split(",")

print(articles)

['20214566', '30256891', '20246708', '70277957', '70214564', '40256895', '70346932', '30246722']


In [9]:
# List of all unique articles

# Turn the comma-joined 'articles' text into a list and give every list entry
# its own row, so each row carries exactly one article.
df = df.assign(articles=df['articles'].str.split(',')).explode('articles')

# Distinct values of the exploded 'articles' column, as a plain Python list
unique_articles = df['articles'].unique().tolist()

# Persist them as a one-column CSV with the header 'articles'
pd.DataFrame(unique_articles, columns=['articles']).to_csv('unique_articles.csv', index=False)

In [10]:
import pandas as pd

# Read the unique article numbers as text so leading zeros are not lost to
# numeric inference. Blank entries are dropped because there is nothing to
# search for. Values are still left-padded with zeros to 8 characters, which
# gives the same result as the f'{x:0>8}' format for files that lack the zeros.
df = (
    pd.read_csv('unique_articles.csv', dtype={'articles': str})
    .dropna(subset=['articles'])
    .assign(articles=lambda frame: frame['articles'].str.rjust(8, '0'))
)

# Loaded once, outside the loop: the file does not change between iterations.
orderdf = pd.read_csv('orders.csv', dtype={'articles': str})

# For every unique article, print the article lists of all orders whose
# 'articles' text contains that article number.
for value in df['articles']:
    # regex=False: the article number is matched as literal text, not as a pattern.
    filtered_df = orderdf[orderdf['articles'].str.contains(value, na=False, case=False, regex=False)]
    print("picks containing ", value, filtered_df['articles'].values)


picks containing  70518513 ['70518513']
picks containing  10242739 ['10242739']
picks containing  00527158 ['00527158']
picks containing  60532208 ['60532208']
picks containing  20399895 ['20399895']
picks containing  80251513 ['80251513']
picks containing  70532062 ['70532062'
 '00177225,00542998,20132453,20448834,20508408,20543001,30213552,40323654,40508407,40508884,50213546,50512649,70213545,70466811,70532062,90213549,90279719,90538001,20312492,40448036']
picks containing  60509142 ['60509142'
 '70509132,30214504,20510939,90509145,10509125,60509142,80256940,10263232,40257296'
 '80256940,60509142']
picks containing  60185034 ['60185034,60503569' '00458697,50458831,60185034,90472340']
picks containing  60503569 ['60185034,60503569']
picks containing  20214566 ['20214566,30256891,20246708,70277957,70214564,40256895,70346932,30246722'
 '10446883,10529086,60342656,60529084,70529088,10446557,00214505,10368489,20214566,20329195,30256891,30277959,70214559,70246386,70277957,70346932,80331930

In [20]:
import csv
import pandas as pd

# Must be stored as string
seen = ['20214566']

# 'articles' is forced to text. Without the override pandas may parse cells
# that hold a single article number as numbers, which drops leading zeros and
# makes the .str-based matching below skip those rows.
df = pd.read_csv('orders.csv', dtype={'articles': str})

def findPicksWithArticle(df, x):
    """Return every pick (order) that contains the article ``x``.

    Each value of ``df['articles']`` is a comma-separated list of article
    numbers. A pick is returned only when one of its entries equals ``x``
    (compared case-insensitively). The earlier substring/regex match could
    return picks that merely contained ``x`` inside a longer number, or that
    matched ``x`` interpreted as a pattern.

    Parameters:
        df: DataFrame with an 'articles' column.
        x: article number to look for; converted with ``str``.

    Returns:
        A list of picks, each pick being the list of article strings obtained
        by splitting the cell on ','. Rows with a missing 'articles' value are
        skipped. The list is empty when no pick contains ``x``.
    """
    target = str(x).lower()
    relevant_picks = []

    for raw_value in df['articles'].dropna():
        # str() also covers cells that were parsed as numbers instead of text.
        pick = str(raw_value).split(',')
        if any(entry.lower() == target for entry in pick):
            relevant_picks.append(pick)

    return relevant_picks

def findArticleOccurrences(picks, article):
    """Count the picks in which ``article`` occurs.

    Each pick is tested with ``article in pick`` and contributes at most one
    to the total, however many times the article appears inside it.

    Parameters:
        picks: iterable of picks (containers supporting the ``in`` operator).
        article: value to look for in each pick.

    Returns:
        The number of picks containing ``article``; 0 for an empty ``picks``.
    """
    return sum(1 for pick in picks if article in pick)

def findCorrelation(df, article):
    """Score every article that shares a pick with ``article``.

    With n = number of picks returned by ``findPicksWithArticle(df, article)``
    and k = number of those picks that also contain another article ``item``,
    the score stored for the pair is ``round(k / n, 4) * k``.

    NOTE(review): the ratio k / n is multiplied by k again, so the score is not
    a probability and can exceed 1 -- confirm this is the intended metric.

    Parameters:
        df: DataFrame passed through to ``findPicksWithArticle``.
        article: the article whose co-picked articles are scored.

    Returns:
        A dict mapping ``(article, item)`` to the score. It is empty when no
        pick contains ``article`` or when it never shares a pick with another
        article.
    """
    XPicks = findPicksWithArticle(df, article)
    if not XPicks:
        # Nothing to score; previously this path ended in a KeyError from
        # removing the article out of an empty set.
        return {}

    co_articles = {item for order in XPicks for item in order}
    # discard() instead of remove(): tolerate the article not being present
    # as an exact entry among the collected items.
    co_articles.discard(article)

    pick_count = len(XPicks)
    correlation_dict = {}

    for item in co_articles:
        # Number of the article's picks that also contain this item
        subOccurrences = findArticleOccurrences(XPicks, item)
        correlation_dict[(article, item)] = round((subOccurrences / pick_count), 4) * subOccurrences

    return correlation_dict

# Score every article listed in unique_articles.csv against the articles it is
# picked together with, and collect all pair scores in one dictionary.
# newline='' is what the csv module expects for files handed to csv.reader.
with open('unique_articles.csv', newline='', encoding='utf-8') as file_obj:
    correlation_obj = {}
    reader_obj = csv.reader(file_obj)

    # Skip the header through the reader itself rather than through the file
    # object it wraps; the default keeps an empty file from raising StopIteration.
    next(reader_obj, None)

    for row in reader_obj:
        # csv.reader yields [] for a blank line, and an empty first field names
        # no article -- neither can be looked up.
        if not row or not row[0]:
            continue

        article_name = row[0]
        correlation_values = findCorrelation(df, article_name)
        XPicksLength = len(findPicksWithArticle(df, article_name))

        # Each pair score is scaled by the number of picks containing the article.
        correlation_obj.update(
            {pair: score * XPicksLength for pair, score in correlation_values.items()}
        )

# One row per article pair. Naming the columns explicitly keeps this working
# (header-only output) when no pair was found at all.
corr_matrix = pd.DataFrame(
    [
        {'Correlation': score, 'Article 1': first, 'Article 2': second}
        for (first, second), score in correlation_obj.items()
    ],
    columns=['Correlation', 'Article 1', 'Article 2'],
)

# Write correlation matrix to CSV file
corr_matrix.to_csv('correlation_matrix.csv', index=False)
