# 用户特征工程

In [1]:
import pandas as pd
import numpy as np

In [2]:
user_features = pd.read_csv('../dataset/comment_dataset.csv')
user_features.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,site_idx,ip,tourist_type,likeType,targets,priceSensitive,attention,label
0,0,0,0,1,上海,朋友出游,4|6,1|8|3,0,3|0|2|7,1
1,1,1,1,1,辽宁,朋友出游,4|6,1|8,1,5|7,1
2,2,2,2,1,江苏,家庭亲子,4,1|2|6,1,5|3,1
3,3,3,3,1,新疆,情侣夫妻,4,1|2|5,1,7|5,1
4,4,4,4,1,北京,家庭亲子,4,1|2,1,4,1


In [3]:
site_idx_columns = user_features['site_idx'].copy()
site_idx_columns.head()

0    1
1    1
2    1
3    1
4    1
Name: site_idx, dtype: int64

In [5]:
cur_site_features_count = site_idx_columns.value_counts().sort_index()
cur_site_features_count.describe()

count    950.000000
mean      58.795789
std        3.852166
min       41.000000
25%       60.000000
50%       60.000000
75%       60.000000
max       60.000000
Name: count, dtype: float64

# ip 特征处理
> 这里暂时使用 ip 代替城市特征

In [8]:
province_dict = {
    0: "未知",
    1: "河北",
    2: "山西",
    3: "辽宁",
    4: "吉林",
    5: "黑龙江",
    6: "江苏",
    7: "浙江",
    8: "安徽",
    9: "福建",
    10: "江西",
    11: "山东",
    12: "河南",
    13: "湖北",
    14: "湖南",
    15: "广东",
    16: "海南",
    17: "四川",
    18: "贵州",
    19: "云南",
    20: "陕西",
    21: "甘肃",
    22: "青海",
    23: "台湾",
    24: "内蒙古",
    25: "广西",
    26: "西藏",
    27: "宁夏",
    28: "新疆",
    29: "北京",
    30: "天津",
    31: "上海",
    32: "重庆",
    33: "香港",
    34: "澳门"
}

reversed_province_list = { v:k for k, v in province_dict.items() }
reversed_province_list

{'未知': 0,
 '河北': 1,
 '山西': 2,
 '辽宁': 3,
 '吉林': 4,
 '黑龙江': 5,
 '江苏': 6,
 '浙江': 7,
 '安徽': 8,
 '福建': 9,
 '江西': 10,
 '山东': 11,
 '河南': 12,
 '湖北': 13,
 '湖南': 14,
 '广东': 15,
 '海南': 16,
 '四川': 17,
 '贵州': 18,
 '云南': 19,
 '陕西': 20,
 '甘肃': 21,
 '青海': 22,
 '台湾': 23,
 '内蒙古': 24,
 '广西': 25,
 '西藏': 26,
 '宁夏': 27,
 '新疆': 28,
 '北京': 29,
 '天津': 30,
 '上海': 31,
 '重庆': 32,
 '香港': 33,
 '澳门': 34}

In [9]:
def ip_to_idx(ip_name) -> int:
    if isinstance(ip_name, str):
        if ip_name in reversed_province_list:
            return reversed_province_list[ip_name]
        else: 
            return 0
    else:
        return 0

user_ip_columns = user_features['ip'].map(ip_to_idx)
user_ip_columns.head()

0    31
1     3
2     6
3    28
4    29
Name: ip, dtype: int64

In [None]:
# 用户城市统计
user_ip_count = user_ip_columns.value_counts().sort_index()
user_ip_count

ip
0     16941
1      1120
2      1104
3       828
4       366
5       859
6      2808
7      1893
8       846
9      1340
10      697
11     1977
12     1741
13     1090
14     1110
15     2906
16     1020
17     2265
18      828
19     1979
20     1489
21      549
22      274
24      705
25     1231
26      179
27      302
28      810
29     2380
30      574
31     2759
32      886
Name: count, dtype: int64

## 出游类型分析

出游类型(单标签):
    0 其他出游     1308 + NaN
    1 单独旅行     4772
    2 商务出差       18
    4 家庭亲子    16182
    5 情侣夫妻     7140
    6 朋友出游    10630
    7 陪同父母     2254

In [11]:
tourist_type_dict = {
     "其他出游": 0,     
     "单独旅行": 1,    
     "商务出差": 2,     
     "家庭亲子": 3,    
     "情侣夫妻": 4,     
     "朋友出游": 5,    
     "陪同父母": 6     
}

def tourist_type_to_idx(tourist_type) -> int:
    if isinstance(tourist_type, str):
        if tourist_type in tourist_type_dict:
            return tourist_type_dict[tourist_type]
        else:
            return 0
    else:
        return 0

In [12]:
tourist_type_columns = user_features['tourist_type'].copy()
tourist_type_columns.head()

0    朋友出游
1    朋友出游
2    家庭亲子
3    情侣夫妻
4    家庭亲子
Name: tourist_type, dtype: object

In [13]:
tourist_type_columns = tourist_type_columns.map(tourist_type_to_idx)
tourist_type_columns.head()

0    5
1    5
2    3
3    4
4    3
Name: tourist_type, dtype: int64

In [14]:
tourist_type_count = tourist_type_columns.value_counts().sort_index()
tourist_type_count

tourist_type
0    25142
1     3277
2        9
3    13122
4     4807
5     7333
6     2166
Name: count, dtype: int64

## 喜欢景点类型

景点喜好类型:
    0 亲子同乐
    1 观光游览
    2 夜游观景
    3 自然风光
    4 名胜古迹
    5 户外活动
    6 展馆展览
    7 动植物园
    8 冬季滑雪
    9 主题乐园
    10 体闲娱乐
    11 温泉泡汤
    12 水上活动
    13 空中体验

In [42]:
like_type_columns = user_features['likeType'].copy()
like_type_columns.head()

0    4|6
1    4|6
2      4
3      4
4      4
Name: likeType, dtype: object

In [43]:
import math
from typing import List

def splitedStr_to_list(key) -> List[int]:
    # 1. 先处理缺失值（NaN / None / <NA>）
    if pd.isna(key):
        return [10]

    # 2. 整数：Python int / numpy int 都可以
    if isinstance(key, int):
        return [key]

    # 3. 浮点：排除 NaN 后，转成 int
    if isinstance(key, float):
        # 这里再防一下 NaN，虽然上面 pd.isna 已经处理过了
        if math.isnan(key):
            return [10]
        return [int(key)]

    # 4. 字符串：可能是 "1|2|3"、"1|2|3.0" 或 "3.0"
    if isinstance(key, str):
        result = []
        for t in key.split("|"):
            t = t.strip()
            if t == "":
                continue
            # 这里允许 "3.0" 之类：先转 float，再转 int
            try:
                num = float(t)
                result.append(int(num))
            except ValueError:
                # 确实是不能解析的字符串，你可以选择：
                # 1) 跳过，
                # 2) 用默认值 10，
                # 3) 或者直接抛错。
                # 这里示例：用默认值 10
                result.append(10)
        return result

    # 5. 其它没想到的类型，你可以按需要处理
    raise TypeError(f"Unexpected type: {type(key)} value={key!r}")

In [44]:
like_type_columns = like_type_columns.map(splitedStr_to_list)
like_type_columns.head()

0    [4, 6]
1    [4, 6]
2       [4]
3       [4]
4       [4]
Name: likeType, dtype: object

In [45]:
like_type_flat = like_type_columns.explode()
like_type_flat.head()

0    4
0    6
1    4
1    6
2    4
Name: likeType, dtype: object

In [46]:
like_type_counts = like_type_flat.value_counts().sort_index()
like_type_counts

likeType
0       724
1      7662
2      2848
3     21019
4     20825
5      4816
6      6655
7      4634
8       471
9      4137
10    14368
11      378
12     1862
13      256
Name: count, dtype: int64

In [48]:
like_type_columns[like_type_columns.isna()].index

Index([], dtype='int64')

## 出游动机

出游动机 (target，选择 1 - 6 个标签，对应编号为整数列表):
    0 其他
    1 历史文化溯源
    2 自然景观观赏
    3 亲子遛娃互动
    4 主题乐园狂欢
    5 城市地标打卡
    6 休闲度假放松
    7 网红地标打卡
    8 文化艺术体验
    9 户外探险猎奇
    10 家庭团聚出游
    11 治愈系散心
    12 节庆主题体验

In [58]:
tourist_target_columns = user_features['targets'].copy()
tourist_target_columns.head()

0    1|8|3
1      1|8
2    1|2|6
3    1|2|5
4      1|2
Name: targets, dtype: object

In [59]:
def splitedStr_to_list_1(key) -> List[int]:
    # 1. 先处理缺失值（NaN / None / <NA>）
    if pd.isna(key):
        return [0]

    # 2. 整数：Python int / numpy int 都可以
    if isinstance(key, int):
        return [key]

    # 3. 浮点：排除 NaN 后，转成 int
    if isinstance(key, float):
        # 这里再防一下 NaN，虽然上面 pd.isna 已经处理过了
        if math.isnan(key):
            return [0]
        return [int(key)]

    # 4. 字符串：可能是 "1|2|3"、"1|2|3.0" 或 "3.0"
    if isinstance(key, str):
        result = []
        for t in key.split("|"):
            t = t.strip()
            if t == "":
                continue
            # 这里允许 "3.0" 之类：先转 float，再转 int
            try:
                num = float(t)
                result.append(int(num))
            except ValueError:
                # 确实是不能解析的字符串，你可以选择：
                # 1) 跳过，
                # 2) 用默认值 10，
                # 3) 或者直接抛错。
                # 这里示例：用默认值 10
                result.append(0)
        return result

    # 5. 其它没想到的类型，你可以按需要处理
    raise TypeError(f"Unexpected type: {type(key)} value={key!r}")

In [60]:
tourist_target_columns = tourist_target_columns.map(splitedStr_to_list_1)
tourist_target_columns.head()

0    [1, 8, 3]
1       [1, 8]
2    [1, 2, 6]
3    [1, 2, 5]
4       [1, 2]
Name: targets, dtype: object

In [61]:
tourist_target_flat = tourist_target_columns.explode()
tourist_target_flat.head()

0    1
0    8
0    3
1    1
1    8
Name: targets, dtype: object

In [62]:
tourist_target_count = tourist_target_flat.value_counts().sort_index()
tourist_target_count

targets
0     11653
1     15653
2     16185
3      4218
4      1872
5      9393
6     10160
7      2837
8      4616
9      3256
10     2085
11     2428
12      159
Name: count, dtype: int64

## 价格敏感与否

价格敏感与否 (priceSensitive，选择 0 或 1，对应单个整数):
    0 价格敏感型
    1 价格不敏感

In [69]:
import ast
import pandas as pd
from typing import Any

def price_to_idx(key: Any) -> int:
    # 1. 缺失值：NaN / None / <NA>
    if pd.isna(key):
        # 看你业务怎么定义，这里举例给个默认值 -1
        return 1

    # 2. 已经是 int，直接返回
    if isinstance(key, int):
        return key

    # 3. float（排除 NaN 已在前面做过）
    if isinstance(key, float):
        return int(key)

    # 4. list，比如 [1] / [1.0] / ['1']
    if isinstance(key, list):
        if not key:
            return 1  # 空列表时你自己决定用什么
        first = key[0]
        # 递归利用本函数，把第一个元素再喂回来
        return price_to_idx(first)

    # 5. 字符串
    if isinstance(key, str):
        s = key.strip()

        # 5.1 像 "[1]" 这种列表字符串：先尝试用 ast.literal_eval 解析
        if s.startswith("[") and s.endswith("]"):
            try:
                parsed = ast.literal_eval(s)
                return price_to_idx(parsed)
            except Exception:
                # 解析失败再尝试按普通数字处理
                pass

        # 5.2 普通数字字符串，可能是 "1" 或 "1.0"
        try:
            # 如果是 "1.0" 先转 float 再 int
            return int(float(s))
        except ValueError:
            # 实在不行，再给一个默认值或抛错
            # raise ValueError(f"无法解析价格字段: {key!r}")
            return 1

    # 6. 其它意料之外的类型
    # 你也可以在这里打印出来看看，然后抛错或给默认值
    raise TypeError(f"Unexpected type: {type(key)} value={key!r}")
    # return -1

In [70]:
price_sensitive_columns = user_features['priceSensitive'].copy()
price_sensitive_columns.head()

0    0
1    1
2    1
3    1
4    1
Name: priceSensitive, dtype: object

In [71]:
price_sensitive_columns = price_sensitive_columns.map(price_to_idx)
price_sensitive_columns.head()

0    0
1    1
2    1
3    1
4    1
Name: priceSensitive, dtype: int64

In [72]:
price_sensitive_count = price_sensitive_columns.value_counts().sort_index()
price_sensitive_count

priceSensitive
0    36844
1    19012
Name: count, dtype: int64

## 关注点

体验关注细节 (attention，多标签):
    0 排队效率敏感
    1 设备完善度敏感
    2 服务质量敏感
    3 行程规划偏好
    4 舒适度敏感
    5 导览体验敏感
    6 消费透明敏感
    7 拍照出片敏感

In [90]:
def splitedStr_to_list_2(key) -> List[int]:
    # 1. 先处理缺失值（NaN / None / <NA>）
    if pd.isna(key):
        return [4]

    # 2. 整数：Python int / numpy int 都可以
    if isinstance(key, int):
        return [key]

    # 3. 浮点：排除 NaN 后，转成 int
    if isinstance(key, float):
        # 这里再防一下 NaN，虽然上面 pd.isna 已经处理过了
        if math.isnan(key):
            return [4]
        return [int(key)]

    # 4. 字符串：可能是 "1|2|3"、"1|2|3.0" 或 "3.0"
    if isinstance(key, str):
        result = []
        for t in key.split("|"):
            t = t.strip()
            if t == "":
                continue
            # 这里允许 "3.0" 之类：先转 float，再转 int
            try:
                num = float(t)
                result.append(int(num))
            except ValueError:
                # 确实是不能解析的字符串，你可以选择：
                # 1) 跳过，
                # 2) 用默认值 10，
                # 3) 或者直接抛错。
                # 这里示例：用默认值 10
                result.append(4)
        return result

    # 5. 其它没想到的类型，你可以按需要处理
    raise TypeError(f"Unexpected type: {type(key)} value={key!r}")

In [91]:
attention_columns = user_features['attention'].copy()
attention_columns.head()

0    3|0|2|7
1        5|7
2        5|3
3        7|5
4          4
Name: attention, dtype: object

In [92]:
attention_columns = attention_columns.map(splitedStr_to_list_2)

def del_invalid_element(key: list[int]):
    for idx in range(len(key)):
        if not (0 <= key[idx] <= 7):
            key[idx] = 4
    return key

attention_columns = attention_columns.map(del_invalid_element)
attention_columns.head()

0    [3, 0, 2, 7]
1          [5, 7]
2          [5, 3]
3          [7, 5]
4             [4]
Name: attention, dtype: object

In [93]:
attention_flat = attention_columns.explode()
attention_flat.head()

0    3
0    0
0    2
0    7
1    5
Name: attention, dtype: object

In [94]:
attention_counts = attention_flat.value_counts().sort_index()
attention_counts

attention
0    13737
1     7570
2    11084
3     6625
4    30545
5     7777
6    12431
7    11007
Name: count, dtype: int64

In [95]:
attention_columns.head()

0    [3, 0, 2, 7]
1          [5, 7]
2          [5, 3]
3          [7, 5]
4             [4]
Name: attention, dtype: object

## 标签值

In [96]:
label_columns = user_features['label'].copy()
label_columns.head()

0    1
1    1
2    1
3    1
4    1
Name: label, dtype: int64

In [100]:
label_count = label_columns.value_counts().sort_index()
label_count

label
0    17856
1    38000
Name: count, dtype: int64

## 数据集

In [101]:
user_final_features = pd.DataFrame({
    "site_idx": site_idx_columns,
    "address": user_ip_columns,
    "tourist_type": tourist_type_columns,
    "like_type": like_type_columns,
    "targets": tourist_target_columns,
    "price_sensitive": price_sensitive_columns,
    "attention": attention_columns,
    "label": label_columns
})

In [105]:
user_final_features

Unnamed: 0,site_idx,address,tourist_type,like_type,targets,price_sensitive,attention,label
0,1,31,5,"[4, 6]","[1, 8, 3]",0,"[3, 0, 2, 7]",1
1,1,3,5,"[4, 6]","[1, 8]",1,"[5, 7]",1
2,1,6,3,[4],"[1, 2, 6]",1,"[5, 3]",1
3,1,28,4,[4],"[1, 2, 5]",1,"[7, 5]",1
4,1,29,3,[4],"[1, 2]",1,[4],1
...,...,...,...,...,...,...,...,...
55851,1000,9,3,[1],[7],1,"[0, 6]",0
55852,1000,9,3,[1],[7],0,[2],0
55853,1000,9,5,[1],[7],0,"[0, 3]",0
55854,1000,15,3,[1],[7],0,"[1, 2]",0


In [106]:
user_final_features.to_csv('../dataset/user_features.csv')

In [103]:
site_final_features = pd.read_pickle('../dataset/site_features.pkl')
site_final_features

Unnamed: 0,site_idx,score,hot_degree,address,introduce_embed,price,positive_comment_rate
0,1,0.96,1.00,29,"[-0.037725143134593964, -0.12439101189374924, ...",0.000000,0.982445
1,2,0.94,1.00,29,"[-0.038046449422836304, -0.18526966869831085, ...",0.039503,0.971700
2,3,0.96,1.00,31,"[0.03834586590528488, -0.056213535368442535, 0...",0.000000,0.994827
3,4,0.94,1.00,20,"[0.009396699257194996, -0.23967264592647552, 0...",0.135440,0.957264
4,5,0.94,1.00,33,"[-0.046439655125141144, -0.17700868844985962, ...",0.603837,0.973052
...,...,...,...,...,...,...,...
995,996,0.86,0.68,7,"[-0.09245417267084122, -0.0648542121052742, 0....",0.056433,0.917892
996,997,0.92,0.68,6,"[0.023274630308151245, -0.08117324113845825, 0...",0.178330,0.976170
997,998,0.88,0.68,2,"[-0.045759472995996475, -0.16594219207763672, ...",0.000000,0.912214
998,999,0.92,0.68,6,"[0.06767657399177551, -0.07898690551519394, 0....",0.428894,0.971014


In [104]:
like_type_len = like_type_columns.map(lambda x: len(x))
like_type_len.value_counts()

likeType
1    29715
2    18907
3     6034
4     1024
5      128
6       48
Name: count, dtype: int64

In [107]:
target_len = tourist_target_columns.map(lambda x: len(x))
target_len.value_counts()

targets
1    32220
2    19249
3     3850
4      470
5       35
6       32
Name: count, dtype: int64

In [108]:
attention_len = attention_columns.map(lambda x: len(x))
attention_len.value_counts()

attention
2    23467
1    22402
3     8527
4     1441
5       19
Name: count, dtype: int64