# **Nguyễn Thị Phương Mai _ K224131544**

Import thư viện

In [4]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import urllib.parse
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

Đọc dữ liệu

In [5]:
# Public share link of the Google Sheet that holds the dataset.
sheet_url = "https://docs.google.com/spreadsheets/d/1Oz0jxSi_BzhW5-iXhILvMiMirjAQYKb2/edit?usp=sharing&ouid=117075034744329459810&rtpof=true&sd=true"
url_parts = urllib.parse.urlparse(sheet_url)
# The path looks like /spreadsheets/d/<sheet_id>/edit, so the id is the
# fourth "/"-separated component (index 3; index 0 is the empty prefix).
sheet_id = url_parts.path.split("/")[3]
sheet_name = "Sheet1"
# Percent-encode the tab name so names containing spaces or non-ASCII
# characters still produce a valid URL ("Sheet1" is left unchanged).
url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={urllib.parse.quote(sheet_name)}"

# Read the sheet through the CSV export endpoint built above.
df = pd.read_csv(url)

print(df.head())

  Payment Type  Days for shipment (real)  Days for shipment (scheduled)  \
0        DEBIT                         3                              4   
1     TRANSFER                         5                              4   
2         CASH                         4                              4   
3        DEBIT                         3                              4   
4      PAYMENT                         2                              4   

    Delivery Status  Late_delivery_risk           Customer Street  \
0  Advance shipping                   0  5365 Noble Nectar Island   
1     Late delivery                   1          2679 Rustic Loop   
2  Shipping on time                   0      8510 Round Bear Gate   
3  Advance shipping                   0           3200 Amber Bend   
4  Advance shipping                   0  8671 Iron Anchor Corners   

  Customer City  Customer Zipcode Customer Country Customer State  ...  \
0        Caguas               725      Puerto Rico          

Dự đoán 'Late_delivery_risk' dựa trên 'Customer Country', 'Customer State', 'Customer City', 'Customer Segment', 'Order City', 'Order State', 'Order Country', 'Shipping Mode', 'Department Name', 'Category Name', 'Product Name'

In [6]:
# Categorical columns used as predictors; defined once so the column
# selection and the one-hot encoding below cannot drift apart.
feature_cols = [
    'Customer Country', 'Customer State', 'Customer City', 'Customer Segment',
    'Order City', 'Order State', 'Order Country', 'Shipping Mode',
    'Department Name', 'Category Name', 'Product Name',
]

X = df[feature_cols]
y = df['Late_delivery_risk']

# One-hot encode every feature; drop_first=True omits the first level of each
# column, so that level is represented by all of its dummy columns being 0.
X = pd.get_dummies(X, columns=feature_cols, drop_first=True)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate classifiers, all compared on the same train/test split.
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(random_state=42, max_iter=1000),  # max_iter raised above the library default
    "SVM": SVC(random_state=42),
    "KNN": KNeighborsClassifier(),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "NaiveBayes": GaussianNB()
}

# Fit each model, score it on the held-out rows, and keep the metrics keyed by
# model name so a later cell can pick the best one.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    # Build the text report once and reuse it for both storage and printing.
    report = classification_report(y_test, y_pred)
    results[name] = {
        "accuracy": accuracy,
        "classification_report": report
    }
    print(f"--- {name} ---")
    print(f"Accuracy: {accuracy}")
    print(report)

--- RandomForest ---
Accuracy: 0.6857142857142857
              precision    recall  f1-score   support

           0       0.65      0.83      0.73       234
           1       0.75      0.53      0.62       221

    accuracy                           0.69       455
   macro avg       0.70      0.68      0.68       455
weighted avg       0.70      0.69      0.68       455

--- LogisticRegression ---
Accuracy: 0.6967032967032967
              precision    recall  f1-score   support

           0       0.68      0.79      0.73       234
           1       0.73      0.60      0.66       221

    accuracy                           0.70       455
   macro avg       0.70      0.69      0.69       455
weighted avg       0.70      0.70      0.69       455

--- SVM ---
Accuracy: 0.6857142857142857
              precision    recall  f1-score   support

           0       0.64      0.88      0.74       234
           1       0.79      0.48      0.60       221

    accuracy                       

Chọn model tốt nhất

In [7]:
# Pick the entry of `results` with the highest test accuracy; on a tie the
# model that was evaluated first wins, because max() keeps the first maximum.
best_model_name, best_metrics = max(
    results.items(),
    key=lambda entry: entry[1]["accuracy"],
)

# Report the winner together with its stored metrics.
print(f"\nBest Model: {best_model_name}")
print(f"Accuracy: {best_metrics['accuracy']}")
print(f"Classification Report:\n{best_metrics['classification_report']}")


Best Model: LogisticRegression
Accuracy: 0.6967032967032967
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.79      0.73       234
           1       0.73      0.60      0.66       221

    accuracy                           0.70       455
   macro avg       0.70      0.69      0.69       455
weighted avg       0.70      0.70      0.69       455



Lưu model tốt nhất

In [8]:
import joblib

# File the fitted winner is serialized to.
model_path = 'best_model.pkl'

# Look up the already-fitted estimator by the name chosen in the previous
# cell and write it to disk with joblib.
best_model = models[best_model_name]
joblib.dump(best_model, model_path)
print(f"\nModel tốt nhất đã được lưu thành {model_path}")


Model tốt nhất đã được lưu thành best_model.pkl


In [9]:
# Read the saved model back from disk. The loaded object is not used by any
# other cell shown here — presumably a sanity check that the file deserializes.
loaded_model_sales = joblib.load('best_model.pkl')
# NOTE(review): google.colab is expected to be importable only inside a Colab
# runtime — confirm before running this cell elsewhere.
from google.colab import files
# Hand the model file to the browser so it can be saved locally.
files.download('best_model.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Tạo data cho selectbox

In [10]:
# Feature columns whose values are exported for the app's select boxes.
unique_cols = [
    'Customer Country',
    'Customer State',
    'Customer City',
    'Customer Segment',
    'Order City',
    'Order State',
    'Order Country',
    'Shipping Mode',
    'Department Name',
    'Category Name',
    'Product Name',
]

# Keep one row per distinct combination of the columns above.
unique_df = df.loc[:, unique_cols].drop_duplicates()

# Write the table without the index column, then send it to the browser.
unique_df.to_csv('unique_values.csv', index=False)
from google.colab import files
files.download('unique_values.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Cài đặt Streamlit, load best model và xây dựng ứng dụng dự đoán

In [11]:
!pip install streamlit



In [12]:
import streamlit as st
import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Streamlit front end for the late-delivery-risk classifier.
# The captured notebook output warns that session state does not function
# without `streamlit run`, so this cell is meant to be saved as a script and
# started with `streamlit run <script>.py`.

# Raw categorical inputs collected from the user, in the order they are shown.
FEATURE_COLUMNS = [
    'Customer Country', 'Customer State', 'Customer City', 'Customer Segment',
    'Order City', 'Order State', 'Order Country', 'Shipping Mode',
    'Department Name', 'Category Name', 'Product Name',
]

# Load the trained classifier; halt the app with a visible message on failure.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load a best_model.pkl that you produced yourself.
try:
    best_model = joblib.load("best_model.pkl")
    st.write("Mô hình đã được tải thành công!")
except FileNotFoundError as e:
    st.write(f"Lỗi: Không tìm thấy tệp ({e})")
    st.stop()
except Exception as e:
    st.write(f"Đã có lỗi khi tải mô hình hoặc scaler: {e}")
    st.stop()


st.title("Ứng dụng Dự đoán Rủi ro giao hàng trễ")

st.header("Nhập thông tin đơn hàng:")

# Values offered in the select boxes come from the CSV exported by the notebook.
try:
    unique_values = pd.read_csv('unique_values.csv')
except FileNotFoundError:
    st.error("File unique_values.csv not found. Please make sure it is in the same directory as your script.")
    st.stop()
except Exception as e:
    st.error(f"An error occurred while loading unique_values.csv: {e}")
    st.stop()

# One select box per feature, labelled with the column name and filled with
# that column's distinct values.
selections = {
    column: st.selectbox(column, unique_values[column].unique())
    for column in FEATURE_COLUMNS
}

if st.button("Dự đoán Rủi ro giao hàng trễ"):
    input_data = pd.DataFrame([selections], columns=FEATURE_COLUMNS)

    # The model was fitted on one-hot encoded columns, not on raw strings, so
    # the single input row must be encoded the same way. scikit-learn stores
    # the training column names on estimators fitted with a DataFrame.
    expected_columns = getattr(best_model, "feature_names_in_", None)
    if expected_columns is None:
        st.error("The loaded model does not expose feature_names_in_; "
                 "it must be fitted on a pandas DataFrame to be used here.")
        st.stop()

    # Encode without drop_first (a one-row frame would otherwise lose every
    # dummy), then align to the training layout: dummy columns the model never
    # saw are discarded and missing ones are filled with 0.
    input_encoded = (
        pd.get_dummies(input_data, columns=FEATURE_COLUMNS)
        .reindex(columns=expected_columns, fill_value=0)
        .astype(float)
    )

    prediction = best_model.predict(input_encoded)

    st.write(f"Dự đoán Rủi ro giao hàng trễ: {prediction[0]}")

2024-11-07 17:43:11.656 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2024-11-07 17:43:11.700 Session state does not function when running a script without `streamlit run`


Mô hình trên Streamlit: _(TODO: bổ sung đường dẫn tới ứng dụng đã triển khai)_