# Feature Engineering (clean, transform, add fields, remove, etc)
# to train model well efficiently (increase performance, accuracy)

In [33]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [34]:
data = pd.DataFrame({
    'order_date': ['2025-05-31', '2025-06-01', '2025-06-02'],
    'shipping_duration': [2, 5, 3],
    'shipping_type': ['express', 'standard', 'standard'],
    'is_late': [0, 1, 0]  # target
})

In [35]:
data.head()

Unnamed: 0,order_date,shipping_duration,shipping_type,is_late
0,2025-05-31,2,express,0
1,2025-06-01,5,standard,1
2,2025-06-02,3,standard,0


In [36]:
data['order_date'] = pd.to_datetime(data['order_date'])

In [37]:
data

Unnamed: 0,order_date,shipping_duration,shipping_type,is_late
0,2025-05-31,2,express,0
1,2025-06-01,5,standard,1
2,2025-06-02,3,standard,0


In [38]:
data['order_weekday'] = data['order_date'].dt.weekday

In [39]:
data

Unnamed: 0,order_date,shipping_duration,shipping_type,is_late,order_weekday
0,2025-05-31,2,express,0,5
1,2025-06-01,5,standard,1,6
2,2025-06-02,3,standard,0,0


In [40]:
data['is_weekend'] = data['order_date'].isin([5,6]).astype(int)

In [41]:
data

Unnamed: 0,order_date,shipping_duration,shipping_type,is_late,order_weekday,is_weekend
0,2025-05-31,2,express,0,5,0
1,2025-06-01,5,standard,1,6,0
2,2025-06-02,3,standard,0,0,0


In [42]:
from sys import prefix


shipping_dummies = pd.get_dummies(data['shipping_type'], prefix='type')
data = pd.concat([data, shipping_dummies], axis=1)

In [43]:
data

Unnamed: 0,order_date,shipping_duration,shipping_type,is_late,order_weekday,is_weekend,type_express,type_standard
0,2025-05-31,2,express,0,5,0,True,False
1,2025-06-01,5,standard,1,6,0,False,True
2,2025-06-02,3,standard,0,0,0,False,True


In [44]:
X = data[['shipping_duration','order_weekday', 'is_weekend','type_express','type_standard']]
y = data['is_late']

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

In [46]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [47]:
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

Accuracy: 1.0
