Read the README file first.

1- Using the tabular data and conventional method for Risk Score Analysis

The results of this section are referred to as the "tabular data analysis results"; they are the baseline against which the results of the text-analysis sections are compared.

In [14]:
import pandas as pd
import plotly.express as px
import plotly.io as pio

# Send every Plotly figure to the default web browser rather than rendering inline
pio.renderers.default = "browser"

# Read the tabular supply-chain dataset into memory
supply_data = pd.read_csv("supply_chain_data.csv")

# Per-column null counts, narrowed to the columns that actually contain nulls
missing_values = supply_data.isnull().sum().loc[lambda counts: counts > 0]

if missing_values.empty:
    print("There are no columns with missing values.")
else:
    print("Columns with missing values:")
    for column_name, null_count in missing_values.items():
        print(f"{column_name}: {null_count} missing values")

# Count fully duplicated rows once and report the total
duplicate_count = supply_data.duplicated().sum()
if duplicate_count > 0:
    print(f"There are {duplicate_count} duplicate data points.")
else:
    print("There are no duplicate data.")

# Average the defect rate within each product type (one row per type)
defect_rates_by_product = supply_data.groupby("Product type", as_index=False)['Defect rates'].mean()

# Take one colour per product type from the qualitative Set3 palette
color_scale = px.colors.qualitative.Set3
bar_colors = color_scale[:len(defect_rates_by_product)]

# Bar chart of the mean defect rate per product type
fig = px.bar(
    defect_rates_by_product,
    x='Product type',
    y='Defect rates',
    title='Defect Rates by Product Type',
)
fig.update_traces(marker_color=bar_colors)

# Axis titles, tallest-bar-first ordering, white background, centred title, no legend
chart_layout = dict(
    xaxis_title="Product Type",
    yaxis_title="Mean Defect Rates",
    xaxis_categoryorder='total descending',
    plot_bgcolor='white',
    title_x=0.5,
    showlegend=False,
)
fig.update_layout(**chart_layout)

# Display the figure through the renderer configured at the top of this cell
fig.show()

# Risk analysis: score every SKU as lead time x (1 - stock level), then keep
# the ten rows with the largest scores.
# NOTE(review): 'Stock levels' is used unscaled here, whereas the spaCy section
# of this notebook divides it by 100 in the same formula - confirm which
# scaling is intended before comparing the charts.
risk_columns = ['SKU', 'Lead times', 'Stock levels']
risk_data = (
    supply_data[risk_columns]
    .assign(**{'Risk score': lambda frame: frame['Lead times'] * (1 - frame['Stock levels'])})
    .sort_values(by='Risk score', ascending=False)
    .head(10)
)

# Bar chart of the ten highest-scoring SKUs, with each bar labelled by its score
fig = px.bar(
    risk_data,
    x='SKU',
    y='Risk score',
    text='Risk score',
    title='Top 10 Highest-Risk Data',
    labels={'Risk score': 'Risk Score', 'SKU': 'SKU'},
)

# Two-decimal labels placed above the bars; centred title
fig.update_traces(textposition='outside', texttemplate='%{text:.2f}')
fig.update_layout(title_x=0.5, xaxis_title='SKU', yaxis_title='Risk Score')

# Display the figure through the renderer configured at the top of this cell
fig.show()


There are no columns with missing values.
There are no duplicate data.


2- Using the textual data and rule-based regex extraction method for Risk Score Analysis from text



In [15]:
import re
import pandas as pd
import plotly.express as px

# Read the free-text supply-chain report as a list of lines; the loop below
# treats every line as one candidate record.
with open("supply_chain_report.txt", "r") as file:
    text_data = file.readlines()

# Regex patterns; each one captures a single field from a report line
sku_pattern = r"known as (\w+)"
lead_time_pattern = r"lead time of (\d+) days"
stock_level_pattern = r"stock level currently stands at (\d+) units"

# Rule-based extraction: only regular expressions are used in this section,
# no NLP model is involved.
sku_list, lead_time_list, stock_level_list = [], [], []

for line in text_data:
    sku_match = re.search(sku_pattern, line)
    lead_time_match = re.search(lead_time_pattern, line)
    stock_level_match = re.search(stock_level_pattern, line)

    # A line contributes a record only when all three fields are found, so the
    # three lists always grow together and never need re-aligning afterwards.
    if not (sku_match and lead_time_match and stock_level_match):
        continue

    sku_list.append(sku_match.group(1))
    lead_time_list.append(int(lead_time_match.group(1)))
    stock_level_list.append(int(stock_level_match.group(1)))

# One row per successfully parsed line
corrected_risk_data = pd.DataFrame({
    "SKU": sku_list,
    "Lead times": lead_time_list,
    "Stock levels": stock_level_list
})

# Risk score per SKU: lead time multiplied by (1 - stock level).
# NOTE(review): the stock level is used unscaled here, whereas the spaCy
# section of this notebook computes lead time x (1 - stock level / 100) -
# confirm which scaling is intended before comparing the results.
lead_times = corrected_risk_data["Lead times"]
stock_levels = corrected_risk_data["Stock levels"]
corrected_risk_data["Risk Score"] = lead_times * (1 - stock_levels)

# Persist the scored table without the row index
corrected_risk_data.to_csv("corrected_risk_scores.csv", index=False)

# Look up the score(s) recorded for SKU68; the result is an array, so it
# prints with square brackets
is_sku_68 = corrected_risk_data["SKU"] == "SKU68"
sku_68_corrected_risk_score = corrected_risk_data.loc[is_sku_68, "Risk Score"].values
print(f" Corrected Risk Score for SKU68: {sku_68_corrected_risk_score}")

# Rank all SKUs from highest to lowest score and keep the first ten
ranked_risk_data = corrected_risk_data.sort_values(by="Risk Score", ascending=False)
top_corrected_risk_data = ranked_risk_data.head(10)

# Show the ten retained rows as plain text
print(top_corrected_risk_data)

# Bar chart of the ten retained SKUs, with each bar labelled by its score
fig = px.bar(
    top_corrected_risk_data,
    x="SKU",
    y="Risk Score",
    text="Risk Score",
    title="Corrected Top 10 Highest-Risk Data",
    labels={"Risk Score": "Risk Score", "SKU": "SKU"},
)

# Two-decimal labels placed above the bars; centred title
fig.update_traces(textposition='outside', texttemplate='%{text:.2f}')
fig.update_layout(title_x=0.5, xaxis_title="SKU", yaxis_title="Risk Score")

# Display the figure
fig.show()


 Corrected Risk Score for SKU68: [8]
      SKU  Lead times  Stock levels  Risk Score
68  SKU68           8             0           8
2    SKU2          10             1           0
34  SKU34          26             1           0
16  SKU16           5             2          -5
4    SKU4           3             5         -12
78  SKU78           4             5         -16
87  SKU87           7             5         -28
76  SKU76           1            38         -37
8    SKU8          10             5         -40
47  SKU47          15             4         -45


3- Using the textual data and spaCy-based Named Entity Recognition (NER) method for Risk Score Analysis from text



In [17]:
import re
import spacy


# Load the text report and keep it as a list of lines
with open("supply_chain_report.txt", "r") as file:
    text_data = file.readlines()

# Preview the first five lines for verification. readlines() keeps each line's
# own line ending, so print() adds a second newline after every entry.
print("🔹 First 5 lines of the text data:")
for line in text_data[:5]:
    print(line)

# Load spaCy's small English pipeline (spaCy is imported at the top of this cell)
nlp = spacy.load("en_core_web_sm")

# Run the pipeline on the first report line as a sample
sample_text = text_data[0]
doc = nlp(sample_text)

# List every named entity the pipeline found in the sample, with its label
print("\n🔹 Named Entities detected in sample text:")
for ent in doc.ents:
    print(f"Text: {ent.text} | Label: {ent.label_}")





# Extraction in this step is regex-only: SKU, lead time and stock level are
# all recovered by the patterns below, so the spaCy pipeline is neither
# reloaded nor run per line here (its output was never read by this step).

# Regex patterns; each one captures a single field from a report line
sku_pattern = r"known as (\w+)"
lead_time_pattern = r"lead time of (\d+) days"
stock_level_pattern = r"stock level currently stands at (\d+) units"

# SKUs to keep.
# NOTE(review): this set matches the ten SKUs listed in the regex section's
# top-10 table - presumably copied from that output; confirm it should stay
# hard-coded rather than be derived.
required_skus = {"SKU68", "SKU2", "SKU34", "SKU16", "SKU4", "SKU78", "SKU87", "SKU76", "SKU8", "SKU47"}

# Parallel lists, one entry per retained line
sku_list, lead_time_list, stock_level_list = [], [], []

for line in text_data:
    sku_match = re.search(sku_pattern, line)
    lead_time_match = re.search(lead_time_pattern, line)
    stock_level_match = re.search(stock_level_pattern, line)

    # A field that is not found is recorded as None
    sku = sku_match.group(1) if sku_match else None
    lead_time = int(lead_time_match.group(1)) if lead_time_match else None
    stock_level = int(stock_level_match.group(1)) if stock_level_match else None

    # Keep the line only if it is a required SKU and both numeric fields were found
    if sku in required_skus and lead_time is not None and stock_level is not None:
        sku_list.append(sku)
        lead_time_list.append(lead_time)
        stock_level_list.append(stock_level)

# Print extracted values to verify
print("\n🔹 Extracted Values (Fixed Approach):")
print(f"SKUs: {sku_list}")
print(f"Lead Times: {lead_time_list}")
print(f"Stock Levels: {stock_level_list}")



import pandas as pd
import plotly.express as px

# Assemble the extracted fields into a table, one row per retained SKU
ai_risk_data = pd.DataFrame({
    "SKU": sku_list,
    "Lead times": lead_time_list,
    "Stock levels": stock_level_list
})

# Risk score: lead time x (1 - stock level / 100).
# NOTE(review): the tabular and regex sections of this notebook use
# (1 - stock level) without the division by 100 - confirm which scaling is
# intended before comparing the three charts.
ai_risk_data["Risk Score"] = ai_risk_data["Lead times"] * (1 - ai_risk_data["Stock levels"] / 100)

# Print the first rows (in extraction order) to verify
print("\n🔹 Final Extracted Data:")
print(ai_risk_data.head())

# Plot the highest score first so the bars match the "Top Risk Scores" title
# and the descending order used by the other two risk charts in this notebook.
# Only the plotted copy is sorted; ai_risk_data keeps its extraction order.
fig_ai = px.bar(
    ai_risk_data.sort_values(by="Risk Score", ascending=False),
    x="SKU", y="Risk Score",
    title="Top Risk Scores (AI-Based Extraction - spaCy NER)",
    labels={"Risk Score": "Risk Score", "SKU": "SKU"}, text="Risk Score"
)

# Two-decimal labels placed above the bars; centred title
fig_ai.update_traces(texttemplate='%{text:.2f}', textposition='outside')
fig_ai.update_layout(xaxis_title="SKU", yaxis_title="Risk Score", title_x=0.5)
fig_ai.show()





🔹 First 5 lines of the text data:
The product, known as SKU0, belongs to the haircare category. It is priced at $69.81 and has an availability rating of 55%. The total number of units sold is 802, generating a revenue of $8662.00. This product is primarily purchased by customers in the Non-binary category. The stock level currently stands at 58 units, with an average lead time of 7 days. The recent order quantities have been 96 units, while production volumes stand at 215 units. The product is inspected under the 'Pending' criteria and has a defect rate of 0.2264103608499251%. It is shipped using Road and follows the Route B route. The overall shipping costs for this product amount to $187.75. This product is supplied by Supplier 3 and is distributed from Mumbai. The lead time from suppliers is 29 days, and the manufacturing lead time is 29 days. The total manufacturing cost per unit is $46.28. These factors collectively determine the supply chain efficiency and risk factors for SKU