In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import plotly.express as px
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
plt.style.use('fivethirtyeight')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)
pd.options.display.float_format = '{:,}'.format

import os

# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for path in input_paths:
    print(path)

In [None]:
# Explicit column dtypes passed to read_csv: unsigned 32-bit ids and
# categoricals for the string-valued columns listed here.
dtype = dict(
    user_id="uint32",
    product_id="uint32",
    event_type="category",
    category_code="category",
    brand="category",
    user_session="category",
)

file = '/kaggle/input/ecommerce-behavior-data-from-multi-category-store/2019-Nov.csv'

# Load the event log with the dtype overrides above; columns not listed in
# `dtype` keep whatever pandas infers.
df = pd.read_csv(file, dtype=dtype)
df

In [None]:
# Print the frame summary (columns, dtypes, memory usage) for the loaded events.
df.info()

In [None]:
# Descriptive statistics for the frame's numeric columns.
df.describe()


Data Cleaning

In [None]:
# Split `event_time` into a calendar date and an hour-of-day column, then drop
# the original column.
#
# The slices rely on fixed character positions: [0:10] is parsed as the date
# and [11:13] as the hour (assumes a 'YYYY-MM-DD HH:...' layout — TODO confirm
# against the raw file).
#
# Vectorised `.str` slicing replaces a per-row Python lambda, which is very
# slow on a frame of this size; the values produced are the same.
#
# NOTE: this cell mutates `df` in place (`insert`) and then drops the source
# column, so it cannot be re-run without reloading `df` first.
event_time_text = df['event_time'].str
df.insert(loc=1, column='date_utc', value=pd.to_datetime(event_time_text[0:10]))
df.insert(loc=2, column='hour', value=event_time_text[11:13].astype('uint8'))
df = df.drop('event_time', axis=1)
df

In [None]:
# Visualise how many non-null values each column has.
msno.bar(df)
plt.show()

# Replace missing values with the literal label 'unknown'.
#
# Several columns were loaded as `category`; pandas refuses to fill a
# categorical with a value that is not one of its categories, so 'unknown'
# must be registered as a category before calling fillna. The membership
# check keeps the cell re-runnable (add_categories raises if the category
# already exists).
for cat_col in df.select_dtypes(include='category').columns:
    if 'unknown' not in df[cat_col].cat.categories:
        df[cat_col] = df[cat_col].cat.add_categories('unknown')

# Same whole-frame fill as before; columns without missing values are unchanged.
df = df.fillna('unknown')

In [None]:
# De-duplicate every event except purchases: identical purchase rows are kept
# as they are, while exact duplicate rows of any other event type are dropped.
is_purchase = df['event_type'] == 'purchase'

# Purchase rows, set aside untouched.
purchase = df[is_purchase]

# Non-purchase rows with exact duplicates removed.
non_purchase_unique = df[~is_purchase].drop_duplicates()

# Stitch the two parts back together (purchases go last) and renumber the index.
df = pd.concat([non_purchase_unique, purchase], ignore_index=True)
df

Visualization

In [None]:
# Number of rows with a non-null `user_id` per event type, largest first.
# Note: this counts event rows, not distinct users, despite the column name.
visitors = (
    df.groupby('event_type')['user_id']
    .count()
    .to_frame('count_of_users')
    .sort_values(by='count_of_users', ascending=False)
    .reset_index()
)

# Share of each event type as a percentage of all counted rows (1 decimal).
visitors['prcnt'] = (
    visitors['count_of_users'] * 100 / visitors['count_of_users'].sum()
).round(1)
print(visitors)

px.pie(visitors, names='event_type', values='prcnt', template='plotly_dark')

In [None]:
# Store engagement over time: number of event rows (view, cart, purchase)
# recorded for each date/hour pair, plus helper columns for later plots:
#   time     - the hour rendered as a 12-hour clock label, e.g. '01 PM'
#   week_day - weekday name of the date
#   day      - weekday number of the date (used to order the weekdays)
activity = (
    df.groupby(['date_utc', 'hour'])['user_id']
    .agg(['count'])
    .reset_index()
    .sort_values(by=['date_utc', 'hour'])
    .assign(
        time=lambda hourly: pd.to_datetime(hourly['hour'], format='%H').dt.strftime('%I %p'),
        week_day=lambda hourly: hourly['date_utc'].dt.day_name(),
        day=lambda hourly: hourly['date_utc'].dt.day_of_week,
    )
)
activity

In [None]:
# Total hourly counts per calendar date, drawn as one bar per day.
month_activity = (
    activity.groupby('date_utc')['count']
    .sum()
    .reset_index(name='sum')
)

fig = px.bar(
    month_activity,
    x='date_utc',
    y='sum',
    title="Sum of Users Over the Month",
    template='plotly_dark',
)
fig.update_xaxes(tickmode='linear')
fig.show()

In [None]:
# Mean hourly count for each weekday, rounded to a whole number and ordered
# by weekday number so the x axis follows the `day` column.
week_activity = (
    activity.groupby(['week_day', 'day'])['count']
    .mean()
    .round()
    .astype('uint32')
    .to_frame('average_user_by_day')
    .sort_values(by='day')
    .reset_index()
)

fig1 = px.line(
    week_activity,
    x='week_day',
    y='average_user_by_day',
    text='average_user_by_day',
    title="Average User Count by Day",
    template='plotly_dark',
)
# Treat weekday names as discrete categories rather than a continuous axis.
fig1.update_xaxes(type='category')
fig1.show()

In [None]:
# Mean count for each hour of the day across all dates, rounded to a whole
# number. Grouping on `hour` first keeps the rows in clock order; `time`
# supplies the 12-hour label used on the x axis.
time_activity = (
    activity.groupby(['hour', 'time'])['count']
    .mean()
    .round()
    .astype('uint32')
    .to_frame('average_users_by_hour')
    .reset_index()
)

fig2 = px.line(
    time_activity,
    x='time',
    y='average_users_by_hour',
    title="Average User Count by Hour",
    template='plotly_dark',
)
fig2.update_xaxes(type='category', tickmode='linear')
fig2.show()