In [3]:
from functools import partial
import json
import os

import numpy as np
import pandas as pd

%matplotlib inline
from collections import Counter

In [5]:
# Directory that derived CSV files are written to.
result_folder = "Result"
# Directory that the input CSV files are read from.
base_folder = "Data"

In [3]:
# Load the item feature table from the input folder.
item_features = pd.read_csv(
    filepath_or_buffer=f"{base_folder}/item_features.csv",
)

In [4]:
# Keep only the first-level category id/name columns, ordered by category id.
first_level_category = (
    item_features
    .loc[:, ['first_level_category_id', 'first_level_category_name_en']]
    .sort_values(by='first_level_category_id')
)


In [5]:
# Pair every row's category id with its English category name.  Zipping the
# two columns produces the same (id, name) tuples as a row-wise
# `apply(tuple, axis=1)` but avoids the per-row pandas overhead, which matters
# on a table with millions of rows.  The result is kept as a Series aligned on
# the original index so it can be used exactly like before.
item_feature_pairs = pd.Series(
    list(zip(
        first_level_category['first_level_category_id'],
        first_level_category['first_level_category_name_en'],
    )),
    index=first_level_category.index,
    dtype=object,
)

# Count occurrences of each (id, name) pair; Counter keeps first-seen order,
# which follows the row order of `first_level_category`.
item_counts = Counter(item_feature_pairs)
for (item_id, item), count in item_counts.items():
    print(f'ID: {item_id}, Item: {item}, Count: {count}')

ID: 0, Item: empty, Count: 302861
ID: 1, Item: dance, Count: 85907
ID: 2, Item: music, Count: 185197
ID: 3, Item: game, Count: 310777
ID: 4, Item: Beauty Makeup, Count: 75264
ID: 5, Item: Style, Count: 165759
ID: 6, Item: Star Entertainment, Count: 198103
ID: 7, Item: motion, Count: 85472
ID: 8, Item: Appearance value, Count: 71495
ID: 9, Item: Funny, Count: 142102
ID: 10, Item: Travel, Count: 33930
ID: 11, Item: life, Count: 149404
ID: 12, Item: Food, Count: 207102
ID: 13, Item: San Nong, Count: 96467
ID: 14, Item: education, Count: 54351
ID: 15, Item: Talent, Count: 121431
ID: 16, Item: health, Count: 59899
ID: 17, Item: animal, Count: 65251
ID: 18, Item: automobile, Count: 93557
ID: 19, Item: emotion, Count: 149993
ID: 20, Item: pixiv, Count: 295550
ID: 21, Item: history, Count: 13157
ID: 22, Item: Finance and Economics, Count: 16543
ID: 23, Item: religion, Count: 11702
ID: 24, Item: Astrological numerology, Count: 19039
ID: 25, Item: Parent-child, Count: 128276
ID: 26, Item: photog

In [9]:
# Flatten the Counter into a two-column frame: the (id, tag) tuple key and its count.
item_counts_df = pd.DataFrame(item_counts.items(), columns=['first_level_category', 'count'])

# Split the tuple key into separate `id` and `tag` columns, aligned on the existing index.
item_counts_df[['id', 'tag']] = pd.DataFrame(item_counts_df['first_level_category'].tolist(), index=item_counts_df.index)

# `to_csv` does not create missing parent directories, so make sure the output
# folder exists before writing (a no-op when it is already there).
os.makedirs(result_folder, exist_ok=True)
item_counts_df[['id', 'tag', 'count']].to_csv(f"{result_folder}/Category-ID-Count.csv", index=False)

In [6]:
# Load the src_inter table from the input folder.
src_inter = pd.read_csv(
    filepath_or_buffer=f"{base_folder}/src_inter.csv",
)

In [16]:
# Tally how often each distinct `search_source` value occurs in src_inter.
search_source = src_inter.loc[:, 'search_source']
search_source_counts = Counter()
search_source_counts.update(search_source)

# One line per distinct value, in first-seen order.
for item in search_source_counts:
    count = search_source_counts[item]
    print(f'{item}: {count}')

0            TRENDING
1            TRENDING
2            TRENDING
3            TRENDING
4            TRENDING
              ...    
3171226    USER_INPUT
3171227    USER_INPUT
3171228    USER_INPUT
3171229    USER_INPUT
3171230    USER_INPUT
Name: search_source, Length: 3171231, dtype: object


In [17]:
# Rebuild the per-value tally of `search_source` and print it,
# one "<value>: <count>" line per distinct value in first-seen order.
search_source_counts = Counter()
search_source_counts.update(search_source)

for item in search_source_counts:
    count = search_source_counts[item]
    print(f'{item}: {count}')

TRENDING: 269705
SUGGESTION: 950107
PLACEHOLDER: 104308
USER_INPUT: 809013
RELATED_FEED_QUERY: 166621
HISTORY: 526156
UNKNOWN_PAGE: 345321


In [9]:
# Tally how often each distinct `item_type` value occurs in src_inter.
item_type = src_inter.loc[:, 'item_type']
item_type_counts = Counter()
item_type_counts.update(item_type)

# One line per distinct value, in first-seen order.
for item in item_type_counts:
    count = item_type_counts[item]
    print(f'{item}: {count}')

IMAGE_ATLAS: 527535
VIDEO: 2275388
ADVERT: 114856
USER: 87318
MUSIC: 2395
LIVE: 47997
COMMODITY: 34374
UNKNOWN: 81368


In [19]:
# Peek at the first ten `search_session_time` values (and the column dtype).
print(src_inter.loc[:, 'search_session_time'].iloc[:10])

0    2023-05-28 01:14:39
1    2023-05-28 01:14:39
2    2023-05-28 01:14:39
3    2023-05-28 01:14:39
4    2023-05-28 01:14:39
5    2023-05-25 21:56:29
6    2023-05-25 21:56:29
7    2023-05-25 21:56:29
8    2023-05-25 21:56:29
9    2023-05-25 21:56:29
Name: search_session_time, dtype: object


In [12]:
# For every user, count how many distinct `search_source` values appear in
# their rows, and expose the result as a two-column frame (user_id, variety).
user_variety = (
    src_inter
    .groupby('user_id')['search_source']
    .nunique()
    .reset_index(name='variety')
)
print(user_variety)


       user_id  variety
0            1        4
1            2        3
2            3        7
3            4        7
4            5        4
...        ...      ...
25872    25873        1
25873    25874        5
25874    25875        3
25875    25876        7
25876    25877        2

[25877 rows x 2 columns]


In [13]:
# Load the user feature table from the input folder.
user_features = pd.read_csv(
    filepath_or_buffer=f"{base_folder}/user_features.csv",
)

In [15]:
# Show the first ten rows of the user feature table as plain text.
print(user_features.iloc[:10])


   user_id  onehot_feat1  onehot_feat2  search_active_level  reco_active_level
0     7562             4             1                    3                  3
1    19630             5             1                    0                  2
2     4039             3             1                    0                  0
3    19995             4             1                    5                  3
4    22234             3             2                    4                  2
5     6504             4             1                    0                  2
6    17516             3             2                    4                  3
7     8267             2             2                    3                  2
8    23414             3             1                    3                  0
9     9413             5             1                    0                  2
