In [None]:
import pandas as pd

# Read the CSV at ../data/cleaned/ into the working frame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../data/cleaned/cleaned_amazon_data.csv")
df.head(n=5)


In [None]:
# Remove every literal "-" from the order identifiers (plain-text match, not a regex).
df["Order ID"] = df["Order ID"].str.replace(pat="-", repl="", regex=False)

In [None]:
# Remove every literal "-" from the date strings (plain-text match, not a regex).
# NOTE(review): the column is still text after this step, not a datetime —
# confirm that is what later steps expect.
df["Date"] = df["Date"].str.replace(pat="-", repl="", regex=False)

In [None]:
# Tally of rows per raw "Status" value, most frequent first.
df["Status"].value_counts()

In [None]:
# Lookup from raw "Status" text to one of three outcome labels
# ("Delivered", "Cancelled", "Returned"). Built from one tuple of raw statuses
# per outcome so each bucket can be read at a glance.
# NOTE(review): in-flight statuses ("Shipped", "Shipped - Out for Delivery",
# "Pending", "Pending - Waiting for Pick Up") and "Shipped - Lost in Transit" /
# "Shipped - Damaged" are bucketed with final outcomes — confirm this labelling
# is intended.
status_map = {
    raw_status: outcome
    for outcome, raw_statuses in (
        (
            "Delivered",
            (
                "Shipped",
                "Shipped - Delivered to Buyer",
                "Shipped - Picked Up",
                "Shipped - Out for Delivery",
            ),
        ),
        (
            "Cancelled",
            (
                "Cancelled",
                "Pending",
                "Pending - Waiting for Pick Up",
            ),
        ),
        (
            "Returned",
            (
                "Shipped - Returned to Seller",
                "Shipped - Returning to Seller",
                "Shipped - Rejected by Buyer",
                "Shipped - Lost in Transit",
                "Shipped - Damaged",
            ),
        ),
    )
    for raw_status in raw_statuses
}


In [None]:
# Translate each raw status into its outcome label; a status that is not a
# key of status_map becomes NaN.
df["delivery_status"] = df["Status"].map(status_map, na_action=None)

In [None]:
# Discard rows with a missing outcome label, i.e. rows whose raw status had
# no entry in the mapping. Note that this rebinds df to the filtered frame.
df = df.dropna(axis="index", how="any", subset=["delivery_status"])

In [None]:
# Tally of rows per outcome label, most frequent first.
df["delivery_status"].value_counts()

In [None]:
# Features: everything except the label, the raw Status it was mapped from,
# Courier Status and Order ID. errors="ignore" tolerates any of these columns
# being absent.
X = df.drop(
    columns=["delivery_status", "Status", "Courier Status", "Order ID"],
    errors="ignore",
)

# Target: the outcome label.
y = df["delivery_status"]

In [None]:
# Rebuild the feature frame (same expression as in the previous cell) so this
# cell can be run on its own.
X = df.drop(
    columns=["delivery_status", "Status", "Courier Status", "Order ID"],
    errors="ignore",
)

# Every non-numeric column is treated as categorical.
cat_cols = X.select_dtypes(exclude="number").columns

# Show the 20 categorical columns with the most distinct values.
(
    X[cat_cols]
    .nunique()
    .sort_values(ascending=False)
    .head(20)
)


In [None]:
# Names of the categorical columns that have more than 50 distinct values;
# these are the ones considered too high-cardinality to keep.
high_card_cols = (
    X[cat_cols]
    .nunique()
    .loc[lambda distinct_counts: distinct_counts > 50]
    .index
)
high_card_cols

In [None]:
# Feature frame without the high-cardinality categorical columns.
X_small = X.drop(high_card_cols, axis="columns")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Target labels, taken from the same frame the features were built from.
y = df["delivery_status"]

# One-hot encode the remaining categorical columns, keeping every dummy level.
X_encoded = pd.get_dummies(X_small, drop_first=False)

# 80/20 split, stratified on the label, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline classifier: 5 nearest neighbours on the scaled features.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Accuracy on the held-out split (shown as the cell output).
y_pred = knn.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy



In [None]:
# Hex colour used for the bars in the charts below.
orange = "#ed7d31"

In [None]:
import matplotlib.pyplot as plt

# Rows per outcome label across the whole labelled frame.
status_counts = df["delivery_status"].value_counts()

# One bar per outcome label, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.bar(status_counts.index, status_counts.values, color=orange)
ax.set_xlabel("Order Status")
ax.set_ylabel("Number of Orders")
ax.set_title("Distribution of Order Delivery Status")
plt.show()


In [None]:
from sklearn.metrics import classification_report

# Text report of the current predictions against the held-out labels.
print(classification_report(y_true=y_test, y_pred=y_pred))

The model predicts delivered orders very accurately, but struggles to identify cancelled and returned orders because they are much less common in the data.

KNN struggles because the data is imbalanced: the rare cancelled and returned orders are outvoted by the many delivered orders that sit near them.

In [None]:
from sklearn.metrics import confusion_matrix
# Argument order matters: actual labels first, predicted labels second.
# NOTE(review): the "LR_" prefix is misleading — the predictions come from the
# KNN model; the name is kept because other cells may refer to it.
LR_confusion_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
LR_confusion_matrix

In [None]:
# Wrap the scaled training array in a frame that reuses the row index and
# column names of the unscaled training split.
train = pd.DataFrame(
    data=X_train_scaled,
    index=X_train.index,
    columns=X_train.columns,
)

In [None]:
# Attach the training labels by position (raw array, no index alignment).
train["delivery_status"] = y_train.to_numpy()

In [None]:
# One frame of scaled training rows per outcome label.
Delivered = train.loc[train["delivery_status"].eq("Delivered")]
Cancelled = train.loc[train["delivery_status"].eq("Cancelled")]
Returned = train.loc[train["delivery_status"].eq("Returned")]

In [None]:
from sklearn.utils import resample
# Draw as many Delivered rows as there are Cancelled rows, without
# replacement; the fixed random_state makes reruns pick the same rows.
# NOTE(review): only Delivered is down-sampled; Returned keeps its original
# row count — confirm that is the intended balance.
no_undersampled = resample(
    Delivered,
    n_samples=len(Cancelled),
    replace=False,
    random_state=0,
)
no_undersampled

In [None]:
# Stack the down-sampled Delivered rows with every Cancelled and Returned row.
train_under = pd.concat((no_undersampled, Cancelled, Returned), axis=0)
train_under

In [None]:
import matplotlib.pyplot as plt

# Rows per outcome label in the undersampled training set.
status_counts = train_under["delivery_status"].value_counts()

# Same bar chart as for the full data set, but titled so the two figures can
# be told apart when the notebook is skimmed.
fig, ax = plt.subplots()
ax.bar(status_counts.index, status_counts.values, color=orange)
ax.set_xlabel("Order Status")
ax.set_ylabel("Number of Orders")
ax.set_title("Distribution of Order Delivery Status (Undersampled Training Set)")
plt.show()

In [None]:
# Undersampled training features: every column except the label.
x_train = train_under.drop(columns=["delivery_status"])

In [None]:
# Undersampled training labels. This rebinds y_train, replacing the labels
# produced by the original train/test split.
y_train = train_under.loc[:, "delivery_status"]

In [None]:
# Refit the same classifier on the undersampled training set. The features
# are passed as a plain array, like the first fit, because the model is later
# asked to predict on the NumPy array X_test_scaled; fitting on a DataFrame
# would record column names and make that predict call warn about missing
# feature names.
knn.fit(x_train.to_numpy(), y_train)

In [None]:
# Score the current knn model on the held-out split; this rebinds y_pred and
# accuracy, replacing the earlier values.
y_pred = knn.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

In [None]:
# Confusion matrix for the latest predictions: actual labels first, predicted
# labels second.
# NOTE(review): the "LR_" prefix is misleading — the predictions come from the
# KNN model; the name is kept because other cells may refer to it.
LR_confusion_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
LR_confusion_matrix

After undersampling, the results are more true to the data and better to work with.

Future recommendation: run other classification models, e.g. logistic regression or a decision tree classifier.

Open question: why do different models give different results?

In [None]:
# Preview the first rows of the undersampled training features.
x_train.head()