In [1]:
import pandas as pd
import numpy as np
import datetime
import random
# Seed both random number generators used further down (`random` for the
# synthetic attributes, NumPy for the product sample) so that a fresh
# Restart & Run All produces the same dataset every time.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Raw event log, one row per user event.
df = pd.read_csv('2019-Dec.csv')

In [2]:
# Inspect column names, dtypes and memory usage of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3533286 entries, 0 to 3533285
Data columns (total 9 columns):
 #   Column         Dtype  
---  ------         -----  
 0   event_time     object 
 1   event_type     object 
 2   product_id     int64  
 3   category_id    int64  
 4   category_code  object 
 5   brand          object 
 6   price          float64
 7   user_id        int64  
 8   user_session   object 
dtypes: float64(1), int64(3), object(5)
memory usage: 242.6+ MB


In [3]:
# Summary statistics of the numeric columns. The id columns are numeric as
# well, so they show up here even though their mean/std carry no meaning.
df.describe()

Unnamed: 0,product_id,category_id,price,user_id
count,3533286.0,3533286.0,3533286.0,3533286.0
mean,5473054.0,1.555023e+18,8.871856,522331800.0
std,1331331.0,1.689262e+17,19.86474,84948190.0
min,3752.0,1.48758e+18,-79.37,1180452.0
25%,5726191.0,1.48758e+18,2.06,486683000.0
50%,5811429.0,1.48758e+18,4.21,556649600.0
75%,5859462.0,1.48758e+18,7.14,582801900.0
max,5917178.0,2.235524e+18,327.78,595414500.0


In [4]:
# Peek at the first rows of the raw data.
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-12-01 00:00:00 UTC,remove_from_cart,5712790,1487580005268456287,,f.o.x,6.27,576802932,51d85cb0-897f-48d2-918b-ad63965c12dc
1,2019-12-01 00:00:00 UTC,view,5764655,1487580005411062629,,cnd,29.05,412120092,8adff31e-2051-4894-9758-224bfa8aec18
2,2019-12-01 00:00:02 UTC,cart,4958,1487580009471148064,,runail,1.19,494077766,c99a50e8-2fac-4c4d-89ec-41c05f114554
3,2019-12-01 00:00:05 UTC,view,5848413,1487580007675986893,,freedecor,0.79,348405118,722ffea5-73c0-4924-8e8f-371ff8031af4
4,2019-12-01 00:00:07 UTC,view,5824148,1487580005511725929,,,5.56,576005683,28172809-7e4a-45ce-bab0-5efa90117cd5


In [2]:
# Drop the columns that the generated dataset does not keep.
df = df.drop(columns=['brand', 'user_session', 'category_id'])

# Timestamps arrive as strings such as "2019-12-01 00:00:00 UTC": strip the
# literal suffix (regex=False -> plain substring replacement) and parse them.
df["event_time"] = df["event_time"].str.replace(" UTC", "", regex=False)
df['event_time'] = pd.to_datetime(df['event_time'], format='%Y-%m-%d %H:%M:%S')

# Move every event into May 2023 while keeping its day of month and time of
# day. This gives the same result as Timestamp.replace(year=2023, month=5) on
# each row, but uses vectorised datetime arithmetic instead of a Python-level
# apply over every row.
event_time = df['event_time']
time_of_day = event_time - event_time.dt.normalize()
day_offset = pd.to_timedelta(event_time.dt.day - 1, unit='D')
df['event_time'] = pd.Timestamp(year=2023, month=5, day=1) + day_offset + time_of_day

start_time = df['event_time'].min()
end_time = df['event_time'].max()
# Timedelta.days is the whole-day part of the elapsed span, so a range that
# touches 31 calendar dates but is shorter than 31 * 24 h is reported as 30.
data_duration = (end_time - start_time).days
print(f"Data collected for {data_duration} days")
print(f"Start time: {start_time}")
print(f"End time: {end_time}")
print(f"Unique users: {df['user_id'].nunique()}")
print(f"Unique products: {df['product_id'].nunique()}")

Data collected for 30 days
Start time: 2023-05-01 00:00:00
End time: 2023-05-31 23:59:57
Unique users: 370154
Unique products: 44624


In [3]:
# Check the frame after dropping columns and rewriting the timestamps.
df.head()

Unnamed: 0,event_time,event_type,product_id,category_code,price,user_id
0,2023-05-01 00:00:00,remove_from_cart,5712790,,6.27,576802932
1,2023-05-01 00:00:00,view,5764655,,29.05,412120092
2,2023-05-01 00:00:02,cart,4958,,1.19,494077766
3,2023-05-01 00:00:05,view,5848413,,0.79,348405118
4,2023-05-01 00:00:07,view,5824148,,5.56,576005683


In [None]:
unique_user_ids = df['user_id'].unique()

# Synthetic ages 18..32 (the range end is exclusive).
age_values = list(range(18, 33))
# One random relative weight per age, normalised so the weights sum to 1.
age_weights = [random.randint(1, 100) for _ in age_values]
total_age_weights = sum(age_weights)
normalized_age_weights = [age_weight / total_age_weights for age_weight in age_weights]

# Draw the ages for all users in one call (k = number of users) instead of
# calling random.choices once per user, then pair every user id with its age.
sampled_ages = random.choices(age_values, weights=normalized_age_weights, k=len(unique_user_ids))
user_age_mapping = dict(zip(unique_user_ids, sampled_ages))

# Every row of a given user receives that user's age.
df['age'] = df['user_id'].map(user_age_mapping)

In [None]:
unique_user_ids = df['user_id'].unique()

sex = ["male", "female"]
# One random relative weight per label, normalised so the weights sum to 1.
s_weights = [random.randint(1, 100) for _ in sex]
s_total_weights = sum(s_weights)
s_normalized_weights = [s_weight / s_total_weights for s_weight in s_weights]

# Draw the labels for all users in one call (k = number of users) instead of
# calling random.choices once per user, then pair every user id with its label.
sampled_sex = random.choices(sex, weights=s_normalized_weights, k=len(unique_user_ids))
user_sex_mapping = dict(zip(unique_user_ids, sampled_sex))

# Every row of a given user receives that user's label.
df['sex'] = df['user_id'].map(user_sex_mapping)

In [None]:
# Category names are read one per line. Blank lines (e.g. a trailing newline
# at the end of the file) are skipped so that no product can be assigned an
# empty category.
with open("categories.txt", "r") as file:
    categories = [line.strip() for line in file if line.strip()]

# Each distinct product gets one randomly chosen category ...
product_category_mapping = {product_id: random.choice(categories) for product_id in df['product_id'].unique()}

# ... which replaces the original category_code on all of that product's rows.
df['category_code'] = df['product_id'].map(product_category_mapping)

In [9]:
def generate_price():
    """Return a random synthetic price whose cents end in 9.

    The whole part is drawn uniformly from 1..59 and the cents from
    9, 19, ..., 99, so results range from 1.09 to 59.99.

    Returns:
        float: the generated price.
    """
    integer_part = random.randint(1, 59)
    decimal_part = random.choice([9, 19, 29, 39, 49, 59, 69, 79, 89, 99])
    # Zero-pad the cents to two digits: without padding, 9 would be rendered
    # as ".9" and parsed as 90 cents instead of 9 cents.
    return float(f"{integer_part}.{decimal_part:02d}")

# Give every distinct product one generated price and broadcast it to all of
# that product's rows, overwriting the original price column.
unique_product_ids = df['product_id'].unique()

price_mapping = dict(
    (product_id, generate_price())
    for product_id in unique_product_ids
)
df['price'] = df['product_id'].map(price_mapping)

In [11]:
# Number of distinct products kept in the final dataset.
N_SAMPLED_PRODUCTS = 352

unique_product_ids = df['product_id'].unique()

# Sample without replacement so that exactly N_SAMPLED_PRODUCTS distinct ids are picked.
random_product_ids = np.random.choice(unique_product_ids, N_SAMPLED_PRODUCTS, replace=False)

# Keep only the events of the sampled products. .copy() makes the result an
# independent frame rather than a slice of the full event log, so later column
# assignments cannot trigger SettingWithCopyWarning.
df = df[df['product_id'].isin(random_product_ids)].copy()

print(f"Unique users: {df['user_id'].nunique()}")
print(f"Unique products: {df['product_id'].nunique()}")
# value_counts() renders as several lines; start it on its own line so the
# first row stays aligned with the rest.
print(f"Event types:\n{df['event_type'].value_counts()}")

Unique users: 1051
Unique products: 352
Event types: view                1131
cart                 640
remove_from_cart     420
purchase             139
Name: event_type, dtype: int64


In [15]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2330 entries, 2421 to 3530399
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   event_time     2330 non-null   datetime64[ns]
 1   user_id        2330 non-null   int64         
 2   sex            2330 non-null   object        
 3   age            2330 non-null   int64         
 4   event_type     2330 non-null   object        
 5   product_id     2330 non-null   int64         
 6   category_code  2330 non-null   object        
 7   price          2330 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(3), object(3)
memory usage: 163.8+ KB
None


In [13]:
# Final column layout: timestamp first, then the user fields, then the event
# type and the product fields.
column_order = [
    'event_time',
    'user_id',
    'sex',
    'age',
    'event_type',
    'product_id',
    'category_code',
    'price',
]
df = df[column_order]

In [14]:
# Preview the final dataset in its export column order.
df.head()

Unnamed: 0,event_time,user_id,sex,age,event_type,product_id,category_code,price
2421,2023-05-01 01:18:47,442277715,male,27,remove_from_cart,5877599,Piercings.StretchingTools,54.39
2422,2023-05-01 01:18:47,442277715,male,27,remove_from_cart,5877599,Piercings.StretchingTools,54.39
10067,2023-05-01 05:59:03,579614811,male,26,view,5855726,Jewellery.ToeRings,49.99
12635,2023-05-01 06:40:49,580084956,female,28,view,5823775,Piercings.BallsPins&More,39.99
13335,2023-05-01 06:53:12,432489049,female,20,view,5885311,Piercings.PiercingRings,17.99


In [18]:
# Write the final dataset to disk; the row index is not exported.
output_path = 'hoole.csv'
df.to_csv(output_path, index=False)