<a href="https://colab.research.google.com/github/udhaya28031995/ecommerce-cart-abandonment-analysis/blob/main/primary_data_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ======== OPTIONAL: Upload kaggle.json ========
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{\r\n  "username": "udhayabalaji",\r\n  "key": "KGAT_8c53eca24a173978cfdc17b08f79f273"\r\n}'}

In [2]:


# ======== Configure Kaggle CLI ========
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# ======== Install Kaggle ========
!pip install kaggle

# ======== Download Primary Dataset ========
!kaggle datasets download -d sahideseker/online-shopping-abandonment-prediction

# ======== Unzip ========
!unzip online-shopping-abandonment-prediction.zip -d dataset


Dataset URL: https://www.kaggle.com/datasets/sahideseker/online-shopping-abandonment-prediction
License(s): CC-BY-SA-4.0
Downloading online-shopping-abandonment-prediction.zip to /content
  0% 0.00/8.78k [00:00<?, ?B/s]
100% 8.78k/8.78k [00:00<00:00, 26.8MB/s]
Archive:  online-shopping-abandonment-prediction.zip
  inflating: dataset/shopping_abandonment.csv  


In [4]:
import pandas as pd

# Load the dataset (adjust filename if needed)
df = pd.read_csv('/content/dataset/shopping_abandonment.csv')

print("===== DATA HEAD =====")
display(df.head())

print("===== DATA INFO =====")
print(df.info())

print("===== DATA DESCRIBE =====")
display(df.describe(include='all'))


===== DATA HEAD =====


Unnamed: 0,session_id,pages_visited,time_on_site,cart_value,abandoned
0,SID1,8,548,23.49,0
1,SID2,13,226,112.2,0
2,SID3,18,472,214.7,1
3,SID4,13,529,294.08,0
4,SID5,9,936,293.16,1


===== DATA INFO =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   session_id     1000 non-null   object 
 1   pages_visited  1000 non-null   int64  
 2   time_on_site   1000 non-null   int64  
 3   cart_value     1000 non-null   float64
 4   abandoned      1000 non-null   int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 39.2+ KB
None
===== DATA DESCRIBE =====


Unnamed: 0,session_id,pages_visited,time_on_site,cart_value,abandoned
count,1000,1000.0,1000.0,1000.0,1000.0
unique,1000,,,,
top,SID1000,,,,
freq,1,,,,
mean,,10.276,518.687,153.67911,0.31
std,,5.438695,282.808181,86.05114,0.462725
min,,1.0,34.0,10.29,0.0
25%,,6.0,262.75,73.53,0.0
50%,,10.0,512.0,153.99,0.0
75%,,15.0,767.0,227.095,1.0


In [5]:
# Select relevant columns for modeling
df_model = df[['session_id', 'pages_visited', 'time_on_site', 'cart_value', 'abandoned']].copy()

# Ensure 'abandoned' is numeric
df_model['abandoned'] = df_model['abandoned'].astype(int)

print("===== CLASS DISTRIBUTION =====")
print(df_model['abandoned'].value_counts(normalize=True))


===== CLASS DISTRIBUTION =====
abandoned
0    0.69
1    0.31
Name: proportion, dtype: float64


In [6]:
from sklearn.model_selection import train_test_split

X = df_model.drop(['session_id', 'abandoned'], axis=1)
y = df_model['abandoned']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [7]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight='balanced'
)

rf.fit(X_train, y_train)


In [8]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = rf.predict(X_test)

print("===== CONFUSION MATRIX =====")
print(confusion_matrix(y_test, y_pred))

print("\n===== CLASSIFICATION REPORT =====")
print(classification_report(y_test, y_pred))


===== CONFUSION MATRIX =====
[[122  16]
 [ 57   5]]

===== CLASSIFICATION REPORT =====
              precision    recall  f1-score   support

           0       0.68      0.88      0.77       138
           1       0.24      0.08      0.12        62

    accuracy                           0.64       200
   macro avg       0.46      0.48      0.45       200
weighted avg       0.54      0.64      0.57       200



In [9]:
import numpy as np

importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
features = X_train.columns

print("\n===== FEATURE IMPORTANCES =====")
for i in indices:
    print(f"{features[i]}: {importances[i]:.4f}")



===== FEATURE IMPORTANCES =====
time_on_site: 0.4186
cart_value: 0.4122
pages_visited: 0.1692


In [10]:
results = {
    'confusion_matrix': confusion_matrix(y_test, y_pred).tolist(),
    'classification_report': classification_report(y_test, y_pred, output_dict=True),
    'feature_importances': {features[i]: float(importances[i]) for i in indices}
}

import json
with open('primary_model_results.json', 'w') as f:
    json.dump(results, f, indent=2)

print("Results saved to primary_model_results.json")


Results saved to primary_model_results.json
