In [1]:
import pandas as pd
import numpy as np
import re

### 1. Data Exploration for taptap_reviews.csv

In [2]:
# Load the integrated TapTap review export and preview its first ten rows.
# NOTE(review): this is an absolute, machine-specific path — consider deriving it
# from a configurable data directory so the notebook runs elsewhere.
reviews_csv_path = r"D:\GitHubRepos\is6941-ml-social-media\taptap\data\integrated\taptap_reviews.csv"
df = pd.read_csv(reviews_csv_path)
df.head(10)

Unnamed: 0,用户ID,用户名,评分,评论内容,点赞数,发布时间,设备型号,游戏名称
0,696432312,。。。,5,可以体验一下，剧情不错，但可能会有点迷,,2025-04-05 20:13,OPPO K7x中国版,7_years_from_now
1,679295528,云淡风轻,5,剧情很好,,2025-04-05 17:35,Honor Play 40 5G,7_years_from_now
2,700357357,qqqqq美女,5,刺激，感受到了友情，亲情，自我，爱慕，传承，等待 ，与纯真,1.0,2025-04-04 22:59,OPPO PGGM10,7_years_from_now
3,402380140,User402380140,5,毋庸置疑的神作<br />第一次回溯时间的是葵啊,2.0,2025-04-03 09:48,Redmi Note 10 Pro,7_years_from_now
4,434114561,不忆往事,5,没有别的可以说，剧情神作,1.0,2025-04-03 01:08,Vivo V2148A,7_years_from_now
5,651407597,哈哈哈,5,好好好,1.0,2025-04-01 22:53,未提供,7_years_from_now
6,611873270,半束流光,5,怎么付款怎么付款怎么付款！！！<br />点支付除了黑屏一小下就没反应了！！！,1.0,2025-04-01 22:50,未提供,7_years_from_now
7,439951893,User439951893,5,不多说,1.0,2025-04-01 20:36,iPad (5th generation),7_years_from_now
8,700099883,空入,5,呜呜呜太好玩了，春人你一定要和葵在一起啊啊啊啊啊啊啊啊啊啊啊啊😭😭😭😭😭😭😭😭😭😭😭😭😭,1.0,2025-04-01 19:51,OPPO Reno 5,7_years_from_now
9,50432703,江牢湿,5,虽然还没有玩完，但我觉得这个游戏做的挺好的，一开始我以为就单纯帮主角春人找回丢失的记忆，但是...,1.0,2025-04-01 01:58,Vivo IQOO Neo9,7_years_from_now


In [3]:
# Show column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39994 entries, 0 to 39993
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   用户ID    39994 non-null  int64  
 1   用户名     39992 non-null  object 
 2   评分      39994 non-null  int64  
 3   评论内容    39994 non-null  object 
 4   点赞数     22546 non-null  float64
 5   发布时间    39994 non-null  object 
 6   设备型号    39994 non-null  object 
 7   游戏名称    39994 non-null  object 
dtypes: float64(1), int64(2), object(5)
memory usage: 2.4+ MB


In [4]:
# Summary statistics for the frame before any cleaning is applied.
df.describe()

Unnamed: 0,用户ID,评分,点赞数
count,39994.0,39994.0,22546.0
mean,402418200.0,3.380107,7.904418
std,243549100.0,1.667131,37.209771
min,1860.0,1.0,1.0
25%,104748400.0,1.0,1.0
50%,459965200.0,4.0,2.0
75%,606684000.0,5.0,5.0
max,711610400.0,5.0,2855.0


In [5]:
# Fill missing values column by column in a single pass:
#   - a missing username becomes the placeholder '未知用户' ("unknown user")
#   - a missing like count becomes 0
missing_value_defaults = {'用户名': '未知用户', '点赞数': 0}
df = df.fillna(missing_value_defaults)

In [6]:
# Cast the like count to a 64-bit integer column; all other columns keep their dtype.
df = df.astype({'点赞数': np.int64})

In [7]:
# Force every rating into the closed interval [1, 5]; out-of-range values are
# clamped to the nearest bound rather than dropped.
min_rating, max_rating = 1, 5
df['评分'] = df['评分'].clip(lower=min_rating, upper=max_rating)

In [8]:
# Replace HTML line-break tags (<br>, <br/>, <br />) in the review text with a
# single space. Only <br> variants are handled; other markup is left as-is.
# The vectorised .str accessor propagates missing values instead of raising the
# TypeError that re.sub would raise when handed a NaN.
df['评论内容'] = df['评论内容'].str.replace(r'<br\s*/?>', ' ', regex=True)
# Trim leading and trailing whitespace from usernames.
df['用户名'] = df['用户名'].str.strip()

In [9]:
# Normalise the device model column: the placeholder '未提供' ("not provided")
# is turned into a proper missing value.
device_placeholders = {'未提供': pd.NA}
df['设备型号'] = df['设备型号'].replace(device_placeholders)

In [10]:
# Remove duplicate reviews, keeping the first row for each
# (user ID, publish time) pair.
# NOTE(review): the key ignores the game name, and the timestamps displayed
# above have minute resolution — confirm this cannot merge distinct reviews.
dedup_keys = ['用户ID', '发布时间']
df = df.drop_duplicates(subset=dedup_keys, keep='first')

In [11]:
# Derive a binary sentiment label from the rating:
# ratings of 3 and above are labelled 1, ratings below 3 are labelled 0.
is_rating_at_least_3 = df['评分'] >= 3
df['情感倾向'] = np.where(is_rating_at_least_3, 1, 0)
# Cross-tabulate rating against label to check the mapping.
print(df[['评分', '情感倾向']].value_counts())

评分  情感倾向
5   1       16845
1   0       10191
4   1        5327
3   1        4181
2   0        3441
Name: count, dtype: int64


In [12]:
# Mapping from the original Chinese column headers to English snake_case names.
column_mapping = dict([
    ('用户ID', 'user_id'),
    ('用户名', 'username'),
    ('评分', 'rating'),
    ('评论内容', 'review_content'),
    ('点赞数', 'likes'),
    ('发布时间', 'publish_time'),
    ('设备型号', 'device_model'),
    ('游戏名称', 'game_name'),
    ('情感倾向', 'sentiment'),
])

# Apply the mapping to the column axis; unlisted columns keep their name.
df = df.rename(column_mapping, axis='columns')

In [13]:
# Label missing device models explicitly as "unknown" so the column has no
# nulls left.
df = df.fillna({'device_model': 'unknown'})

In [14]:
# Rebuild a fresh 0..n-1 index and discard the old one (drop=True), so the
# index is contiguous again after the earlier row removal.
df = df.reset_index(drop=True)

In [15]:
# Preview the first rows of the cleaned frame with its English column names.
df.head()

Unnamed: 0,user_id,username,rating,review_content,likes,publish_time,device_model,game_name,sentiment
0,696432312,。。。,5,可以体验一下，剧情不错，但可能会有点迷,0,2025-04-05 20:13,OPPO K7x中国版,7_years_from_now,1
1,679295528,云淡风轻,5,剧情很好,0,2025-04-05 17:35,Honor Play 40 5G,7_years_from_now,1
2,700357357,qqqqq美女,5,刺激，感受到了友情，亲情，自我，爱慕，传承，等待 ，与纯真,1,2025-04-04 22:59,OPPO PGGM10,7_years_from_now,1
3,402380140,User402380140,5,毋庸置疑的神作 第一次回溯时间的是葵啊,2,2025-04-03 09:48,Redmi Note 10 Pro,7_years_from_now,1
4,434114561,不忆往事,5,没有别的可以说，剧情神作,1,2025-04-03 01:08,Vivo V2148A,7_years_from_now,1


In [16]:
# Re-check dtypes and non-null counts after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39985 entries, 0 to 39984
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   user_id         39985 non-null  int64 
 1   username        39985 non-null  object
 2   rating          39985 non-null  int64 
 3   review_content  39985 non-null  object
 4   likes           39985 non-null  int64 
 5   publish_time    39985 non-null  object
 6   device_model    39985 non-null  object
 7   game_name       39985 non-null  object
 8   sentiment       39985 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 2.7+ MB


In [17]:
# Summary statistics for the cleaned frame, including the new sentiment column.
df.describe()

Unnamed: 0,user_id,rating,likes,sentiment
count,39985.0,39985.0,39985.0,39985.0
mean,402395100.0,3.379992,4.456996,0.659072
std,243556300.0,1.667205,28.214437,0.474027
min,1860.0,1.0,0.0,0.0
25%,104746800.0,1.0,0.0,0.0
50%,459956800.0,4.0,1.0,1.0
75%,606668100.0,5.0,2.0,1.0
max,711610400.0,5.0,2855.0,1.0


In [18]:
# Write the cleaned reviews to disk without the index column.
# The 'utf-8-sig' codec prefixes the file with a BOM — presumably so that
# spreadsheet tools detect the encoding of the Chinese text; confirm with consumers.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
cleaned_csv_path = r"D:\GitHubRepos\is6941-ml-social-media\taptap\data\integrated\cleaned_taptap_reviews.csv"
df.to_csv(cleaned_csv_path, index=False, encoding='utf-8-sig')