In [41]:
import pandas as pd
import jieba
import re
import warnings
from chinese_province_city_area_mapper.transformer import CPCATransformer

In [42]:
# Load the data. `names=` is passed, so pandas treats the first line of each
# file as data rather than as a header row.
df1 = pd.read_csv('comments.csv', names=['name', 'score', 'comment', 'date', 'href'])
df2 = pd.read_csv('cities.csv', names=['city'])
# Pair the two files row by row on their index. The outer join keeps rows that
# exist in only one file; their missing fields become NaN.
df = pd.merge(df1, df2, left_index=True, right_index=True, how='outer')

df.drop('href', axis=1, inplace=True)  # the href column is dropped here
df.drop_duplicates(subset=None, keep='first', inplace=True)  # drop exact duplicate rows
# dropna() returns a new frame; the original call discarded it, so rows with
# missing values were never removed. Apply it in place instead.
df.dropna(axis=0, inplace=True)
# Renumber the rows so the drops above leave no gaps in the index labels.
# NOTE(review): later steps assign columns by index alignment - confirm they
# expect a contiguous 0..n-1 index.
df.reset_index(drop=True, inplace=True)

# Reduce a raw scraped comment (markup, punctuation, latin text) to its Chinese characters.
def comment_process(comment):
    """Return only the Chinese characters contained in ``comment``.

    Every character outside the range U+4E00..U+9FA5 is removed, which covers
    the surrounding ``<span class="short">`` markup, line breaks, punctuation,
    digits and latin letters in a single pass.

    The previous implementation first called ``str.strip`` with the tag text;
    ``strip`` treats its argument as a set of characters, not as a prefix or
    suffix, so it never removed "the tag" as such. Because the regex below
    deletes all of those characters anyway, dropping the strip calls leaves
    the result for string input unchanged.

    Args:
        comment: Raw comment text. Non-string values (for example the NaN that
            pandas uses for a missing cell) are accepted.

    Returns:
        str: The Chinese characters of ``comment`` in their original order, or
        ``''`` when ``comment`` is not a string.
    """
    if not isinstance(comment, str):
        # Missing cells arrive as float NaN; treat them as "no text".
        return ''
    return re.sub('[^\u4e00-\u9fa5]', '', comment)

# Clean every comment element-wise (apply avoids an explicit Python loop).
df['comment'] = df['comment'].apply(comment_process)

# Keep the original rating label in `score1`, then turn `score` into a number.
# Labels missing from the mapping become NaN.
score_by_label = {
    '力荐': 5,
    '推荐': 4,
    '还行': 3,
    '较差': 2,
    '很差': 1
}
df['score1'] = df['score']
df['score'] = df['score'].map(score_by_label)

# Parse the dates, then write them back as 'YYYY-MM-DD' strings.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates.dt.strftime('%Y-%m-%d')

# Pre-process a raw location string, e.g. '讷河, 齐齐哈尔' -> '齐齐哈尔'.
# Locations written without Chinese characters (foreign cities) collapse to ''.
def city_process(line):
    """Return the text after the last non-Chinese character of ``line``.

    The string is split at every character outside U+4E00..U+9FA5 and the
    final piece is returned. A value made only of Chinese characters is
    returned unchanged (``'江苏南京'`` stays ``'江苏南京'``); a value that ends
    with a non-Chinese character yields ``''``.

    Args:
        line: Raw location text. Non-string values (for example the NaN that
            pandas uses for a missing cell) are accepted.

    Returns:
        str: The trailing run of Chinese characters, or ``''`` when there is
        none or when ``line`` is not a string.
    """
    if not isinstance(line, str):
        # re.split raises TypeError on float NaN; report "no city" instead.
        return ''
    segments = re.split('[^\u4e00-\u9fa5]', line)
    # The last segment is the part after the final separator.
    return segments[-1]

df['city'] = df['city'].apply(city_process)
# Run the locations through CPCATransformer and keep the '市' (city) column of its output.
cpca = CPCATransformer()
df['city'] = cpca.transform(df.city)['市']

# Rows with an empty city are kept; the filter was left disabled:
# df1 = df[df['city'] != '']
# Shorten the two municipality names. The replacement is limited to the city
# column: a frame-wide replace would also rewrite any name/comment cell that
# happens to equal one of these literals.
df['city'] = df['city'].replace({'北京市': '北京', '上海市': '上海'})
# utf-8-sig writes a BOM; the row index is not written.
df.to_csv('data.csv', index=False, encoding='utf-8-sig')

In [51]:
# Segment every cleaned comment into words with jieba.
comments = df['comment'].tolist()
# Collect the tokens in a list. The previous version appended each comment's
# space-joined tokens to one string without a separator between comments, so
# the last word of one comment fused with the first word of the next.
tokens = [token for comment in comments for token in jieba.cut(comment)]

# Space-separated token string, kept under its original name.
comment_cut = ' '.join(tokens)

# One row per token, numbered from 1 (stored as strings). The frame is built
# once from a list of records instead of concatenating row by row.
# Row 0 is an empty placeholder, kept because the following cell removes it
# with df_comment.drop([0]).
rows = [{'index': '', 'comment': ''}]
rows.extend(
    {'index': str(position), 'comment': token}
    for position, token in enumerate(tokens, start=1)
)
df_comment = pd.DataFrame(rows)

In [58]:
# Remove the row labelled 0 (the empty placeholder the frame was seeded with).
df_comment = df_comment.drop(index=0)

In [108]:
# Count how often each token occurs; value_counts() sorts by frequency, highest first.
token_counts = df_comment['comment'].value_counts()

# Split the counted series into two parallel lists: the tokens and their counts.
value = token_counts.index.tolist()
count = token_counts.tolist()

df_count = pd.DataFrame({'value': value, 'count': count})

def value_len(value):
    """Return the length of ``value`` as reported by ``len``."""
    length = len(value)
    return length

# Record each token's length, keep tokens longer than one character,
# and retain the first 30 remaining rows.
df_count['len'] = df_count['value'].apply(value_len)
is_multi_char = df_count['len'] > 1
df_count = df_count[is_multi_char].head(30)

# utf-8-sig writes a BOM; the row index is written as the first column.
df_count.to_csv('count.csv', encoding='utf-8-sig')