In [279]:
import warnings
warnings.filterwarnings('ignore')
import os
import pandas as pd
import numpy as np
import re
import json
import glob
from bs4 import BeautifulSoup as bs
from functools import reduce
pd.options.display.max_columns = 50

import matplotlib.pyplot as plt
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB

import pyecharts
from pyecharts import options as opts

import jieba
from jieba import analyse
stop_words_path = r"C:\\Users\\root\\Desktop\\tmall_crawler\\stopwords-master\\中文停用词表.txt"
analyse.set_stop_words(stop_words_path)
from pyecharts.charts import WordCloud
from pyecharts.globals import SymbolType

# 天猫眼霜分析报告

## 1. 数据处理

### 1.1 HTML文本解析和内容抽取

In [280]:
# 数据预处理函数

# 解析html文件函数
def parser(file):
    """Read one saved page file and extract its JSON payload.

    The file is read as UTF-8 and run through BeautifulSoup to get its text.
    If that text starts with ``{`` it is treated as a pure JSON document;
    otherwise the object assigned to ``g_page_config`` is extracted from it.

    Args:
        file: Path of the file to parse.

    Returns:
        ``('json', obj)`` or ``('html', obj)`` on success, ``None`` when the
        payload is missing or is not valid JSON (the path and the reason are
        printed in that case).
    """
    with open(file, 'r', encoding='utf-8') as f:
        raw = f.read()
    text = bs(raw, 'lxml').text

    # Tolerate leading whitespace in front of a pure-JSON document.
    if text.lstrip().startswith('{'):
        kind, payload = 'json', text
    else:
        match = re.search(r'g_page_config = ({.*?});', text)
        if match is None:
            print(f'{file}: g_page_config not found')
            return None
        kind, payload = 'html', match.group(1)

    try:
        return (kind, json.loads(payload))
    except ValueError as e:
        # json.JSONDecodeError is a ValueError; report why the file was skipped.
        print(f'{file}: {e}')
        return None
    
# 清洗json内部数据
# 其中一条来自纯json数据，字段与正常html字段相差太多，强行整合效果和效益不高，故弃之；后期改进
def clean_content(content):
    """Reduce one parsed page to a list of uniformly-keyed item dicts.

    Only ``'html'`` payloads are handled. Their item list is read from
    ``mods -> itemlist -> data -> auctions`` and every item is trimmed to the
    keys that all items on that page share.

    Args:
        content: ``(kind, payload)`` tuple as returned by ``parser``, or
            ``None`` when parsing failed.

    Returns:
        A list of dicts (empty when the page has no items), or ``None`` when
        ``content`` is ``None`` or is not an ``'html'`` payload.
    """
    # parser() returns None for files it could not decode; pass that through
    # instead of failing on content[0].
    if content is None or content[0] != 'html':
        return None

    auctions = content[1]['mods']['itemlist']['data']['auctions']
    if not auctions:
        return []

    shared = set(auctions[0]).intersection(*auctions[1:])
    # Keep the first item's key order so the resulting columns are
    # deterministic (set iteration order is not).
    keys = [key for key in auctions[0] if key in shared]
    return [{key: item[key] for key in keys} for item in auctions]
    
# 操作整合， 最终数据调用函数
def getData(files='./data/item*'):
    """Parse every file matching ``files`` and stack the items into one table.

    Args:
        files: Glob pattern of the saved page files.

    Returns:
        A DataFrame with one row per item; empty when no file yields items.
    """
    records = []
    # Sort the matches: glob order is arbitrary, and the later
    # de-duplication keeps the first occurrence of each item.
    for path in sorted(glob.glob(files)):
        parsed = parser(path)
        if parsed is None:
            continue
        items = clean_content(parsed)
        if items:
            # extend() avoids the quadratic cost of repeated list concatenation.
            records.extend(items)
    return pd.DataFrame(records)

### 1.2 字段抽取和数据加工

In [281]:
def clean_data(df):
    """Clean the raw item table and derive the analysis columns.

    Steps, in order: drop the columns listed in ``drop_columns``, de-duplicate
    on ``nid``, drop rows missing ``view_sales`` or ``category``, coerce the
    numeric columns, keep only the first space-separated token of
    ``item_loc``, and add ``is_marubi_shop``, ``is_marubi_item`` and
    ``raw_title_cut``.

    Args:
        df: DataFrame as returned by ``getData``.

    Returns:
        A new cleaned DataFrame; ``df`` itself is not modified.
    """
    def to_int(value):
        """Convert to int, falling back to 0 for unparsable values."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return 0

    def to_float(value):
        """Convert to float, falling back to 0.0 for unparsable values."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0.

    def parse_sales(text):
        """Parse the leading number of a sales string; 0 when there is none.

        A trailing 万 multiplies by 10,000, so "1.5万..." is read as 15000
        rather than being truncated to 1.
        """
        match = re.match(r'([0-9]+(?:\.[0-9]+)?)(万)?', str(text))
        if match is None:
            return 0
        number = float(match.group(1))
        if match.group(2):
            return int(round(number * 10000))
        return int(number)

    # Columns that are removed before the analysis.
    drop_columns = ['comment_url', 'icon', 'pic_url', 'shopLink', 'view_fee', 'title', 'shopcard', 'i2iTags', 'p4pTags', 'risk', 'pid']
    cleaned = (
        # errors='ignore': the available columns depend on the scraped pages.
        df.drop(columns=drop_columns, errors='ignore')
        # nid is the item id (verified by the original author); de-duplicate on it.
        .drop_duplicates(subset=['nid'], keep='first')
        # Items without sales are dropped; only one row lacks a category.
        .dropna(axis=0, subset=['view_sales', 'category'], how='any')
        # Work on an explicit copy so the assignments below do not target a view.
        .copy()
    )

    # Field clean-up.
    cleaned['view_sales'] = cleaned['view_sales'].map(parse_sales)
    cleaned['comment_count'] = cleaned['comment_count'].map(to_int)
    cleaned['view_price'] = cleaned['view_price'].map(to_float)
    cleaned['item_loc'] = cleaned['item_loc'].map(lambda loc: loc.split(' ')[0])
    # Is the shop a 丸美 (Marubi) shop?
    cleaned['is_marubi_shop'] = cleaned['nick'].map(lambda nick: '丸美' in nick)
    # Is the item a 丸美 (Marubi) product?
    cleaned['is_marubi_item'] = cleaned['raw_title'].map(lambda title: '丸美' in title)
    # Tokenised title.
    cleaned['raw_title_cut'] = cleaned['raw_title'].map(jieba.cut).map(list)
    return cleaned


In [282]:
# Load every saved page and clean the resulting item table in one step.
df = clean_data(getData())

In [283]:
# Preview the first rows of the cleaned item table.
df.head()

Unnamed: 0,category,nid,comment_count,nick,view_price,raw_title,user_id,item_loc,detail_url,view_sales,is_marubi_shop,is_marubi_item,raw_title_cut
48,121484013,599083945560,17,天猫国际进口超市,615.0,MedSPA/美帕瑞士蛇毒无痕修复眼霜提拉紧致抗皱淡化细纹去脂肪粒,2549841410,浙江,//detail.tmall.com/item.htm?id=599083945560&ns...,2,False,False,"[MedSPA, /, 美帕, 瑞士, 蛇毒, 无痕, 修复, 眼霜, 提拉, 紧致, 抗皱..."
49,121484013,599326258570,1370,newface新面孔旗舰店,76.0,新面孔日夜金眼霜黑眼圈眼袋干细纹抗皱补水正品女三合一脂肪粒,2250956384,浙江,//detail.tmall.com/item.htm?id=599326258570&ns...,583,False,False,"[新面孔, 日夜, 金, 眼霜, 黑眼圈, 眼袋, 干, 细纹, 抗皱, 补水, 正品, 女..."
50,121484013,549317594016,381,珀莱雅铭博专卖店,196.0,珀莱雅弹润芯肌活力眼霜去补水细纹紧致抗皱抗衰老官方旗舰店官网,2778000028,浙江,//detail.tmall.com/item.htm?id=549317594016&ns...,50,False,False,"[珀莱雅, 弹润, 芯, 肌, 活力, 眼霜, 去, 补水, 细纹, 紧致, 抗皱, 抗衰老..."
51,121484013,593792520101,1159,大宝官方旗舰店,129.0,大宝眼霜女提拉紧致小红帽眼部精华按摩眼部护理,500327991,上海,//detail.tmall.com/item.htm?id=593792520101&ns...,18,False,False,"[大宝, 眼霜, 女, 提拉, 紧致, 小红帽, 眼部, 精华, 按摩, 眼部, 护理]"
52,121484013,586188427656,317,温碧泉旗舰店,313.0,温碧泉初生精华眼霜抗初老提拉紧致抗皱去黑眼圈祛眼袋淡化细纹女,661544293,广东,//detail.tmall.com/item.htm?id=586188427656&ns...,19,False,False,"[温碧泉, 初生, 精华, 眼霜, 抗初, 老, 提拉, 紧致, 抗皱, 去, 黑眼圈, 祛..."


## 2.数据分析专题

### 2.1 市场分析

In [284]:
# Number of listed items (SKUs) per shop, largest first.
data1 = (
    df.nick.value_counts(dropna=False)
    # pandas >= 2.0 names the value_counts index after the column ('nick');
    # pin it to 'index' so the lookup below works on every pandas version.
    .rename_axis('index')
    .reset_index(name='item_count')
    .sort_values('item_count', ascending=False)
)
bar1 = (
    pyecharts.charts.Bar()
    .add_xaxis(data1['index'].head(10).tolist())
    .add_yaxis('SKU数量', data1.head(10).item_count.tolist())
    .set_global_opts(title_opts=opts.TitleOpts(title='眼霜市场 - 商品数量TOP10店铺'))
)
bar1.load_javascript()

<pyecharts.render.display.Javascript at 0x1ffdc730f48>

In [285]:
# Render the top-10 shops by item count chart inline.
bar1.render_notebook()

In [286]:
# Total sales per shop, best-selling shop first.
data2 = (
    df.groupby('nick')[['view_sales']]
    .sum()
    .sort_values('view_sales', ascending=False)
)
bar2 = pyecharts.charts.Bar()
bar2.add_xaxis(data2.head(10).index.tolist())
bar2.add_yaxis('眼霜销量', data2.head(10).view_sales.tolist())
bar2.set_global_opts(title_opts=opts.TitleOpts(title='眼霜市场 - 商品销量TOP10店铺'))
bar2.load_javascript()

<pyecharts.render.display.Javascript at 0x1ffdffce948>

In [287]:
# Render the top-10 shops by sales chart built in the previous cell.
# (`bar` is not defined yet at this point on a fresh Restart & Run All.)
bar2.render_notebook()

In [288]:
# Number of listed items (SKUs) per 丸美 shop, largest first.
data3 = (
    df[df.is_marubi_shop].nick.value_counts(dropna=False)
    # pandas >= 2.0 names the value_counts index after the column ('nick');
    # pin it to 'index' so the lookup below works on every pandas version.
    .rename_axis('index')
    .reset_index(name='item_count')
    .sort_values('item_count', ascending=False)
)
bar3 = (
    pyecharts.charts.Bar()
    .add_xaxis(data3['index'].head(10).tolist())
    .add_yaxis('丸美专卖店SKU数量', data3.head(10).item_count.tolist())
    .set_global_opts(title_opts=opts.TitleOpts(title='丸美眼霜 - 商品数量TOP10店铺'))
)
bar3.load_javascript()

<pyecharts.render.display.Javascript at 0x1ffdfdd81c8>

In [289]:
# Render the 丸美 shops by item count chart inline.
bar3.render_notebook()

In [290]:
# Total sales of 丸美-titled items per shop, best-selling shop first.
data4 = (
    df.loc[df.is_marubi_item, ['view_sales', 'nick']]
    .groupby('nick')
    .sum()
    .sort_values('view_sales', ascending=False)
)
bar4 = pyecharts.charts.Bar()
bar4.add_xaxis(data4.head(10).index.tolist())
bar4.add_yaxis('丸美眼霜销量', data4.head(10).view_sales.tolist())
bar4.set_global_opts(title_opts=opts.TitleOpts(title='丸美眼霜 - 商品销量TOP10店铺'))
bar4.load_javascript()

<pyecharts.render.display.Javascript at 0x1ffe00bedc8>

In [291]:
# Render the 丸美 sales-by-shop chart inline.
bar4.render_notebook()

### 结论分析一

#### 整体市场

1. 除去天猫超市，天猫国际等综合品牌店铺，各个品牌的眼霜SKU相差不大。
2. 从销量上看，各类品牌的销量差距较大，天猫超市、玖美堂旗舰店、丸美位列销量前三，并且远高于其他品牌旗舰店

#### 丸美眼霜

1. 丸美各个专卖店的SKU差距不大，平均在16个SKU左右；
2. 丸美眼霜的大部分销量都来自丸美旗舰店和丸美娇颜专卖店；
3. 丸美旗舰店的旗舰标识和较多的SKU使其成为丸美眼霜销售的核心店铺，对推广整个丸美眼霜产品有重要的影响力；
4. 丸美娇颜专卖店虽然SKU较少，但销量位列第二，可以重点考虑在丸美娇颜专卖店打造爆款。

### 2.2 卖点分析

In [292]:
# TextRank keywords over all item titles; 50 (word, weight) pairs are kept.
keywords_count_list_all = jieba.analyse.textrank(' '.join(df.raw_title.tolist()), topK=50, withWeight=True)
word_cloud1 = (
    WordCloud()
    .add("", keywords_count_list_all, word_size_range=[20, 100], shape=SymbolType.DIAMOND)
    # The title states the number of words actually plotted (topK=50).
    .set_global_opts(title_opts=opts.TitleOpts(title='眼霜市场 - 功能卖点TOP50'))
)
word_cloud1.load_javascript()

<pyecharts.render.display.Javascript at 0x1ffdb4cce08>

In [294]:
# Render the whole-market keyword word cloud inline.
word_cloud1.render_notebook()

In [295]:
# TextRank keywords over the titles of 丸美 items only; 50 (word, weight) pairs are kept.
keywords_count_list_marubi = jieba.analyse.textrank(' '.join(df[df.is_marubi_item].raw_title.tolist()), topK=50, withWeight=True)
word_cloud2 = (
    WordCloud()
    .add("", keywords_count_list_marubi, word_size_range=[20, 100], shape=SymbolType.DIAMOND)
    # The title states the number of words actually plotted (topK=50).
    .set_global_opts(title_opts=opts.TitleOpts(title='丸美眼霜 - 功能卖点TOP50'))
)
word_cloud2.load_javascript()

<pyecharts.render.display.Javascript at 0x1ffdbaaf308>

In [296]:
# Render the 丸美 keyword word cloud inline.
word_cloud2.render_notebook()

In [297]:
def filter_keyword_sales(df, keyword):
    """Return the mean ``view_sales`` of items whose title tokens contain ``keyword``.

    Args:
        df: DataFrame with ``raw_title_cut`` (token lists) and ``view_sales``.
        keyword: Token to look for in ``raw_title_cut``.

    Returns:
        The mean truncated to ``int``; 0 when no item contains the keyword
        (the mean would be NaN, which ``int`` cannot convert).
    """
    has_keyword = df.raw_title_cut.map(lambda tokens: keyword in tokens).astype(bool)
    mean_sales = df.loc[has_keyword, 'view_sales'].mean()
    if pd.isna(mean_sales):
        return 0
    return int(mean_sales)

In [298]:
# Mean sales for each of the ten highest-ranked market keywords
# (reversed so the top keyword ends up at the top of the horizontal chart).
keywords_mean_sales_all = {
    word: filter_keyword_sales(df, word)
    for word, _ in reversed(keywords_count_list_all[:10])
}
bar = pyecharts.charts.Bar()
bar.add_xaxis(list(keywords_mean_sales_all.keys()))
bar.add_yaxis("", list(keywords_mean_sales_all.values()))
bar.reversal_axis()
bar.set_series_opts(label_opts=opts.LabelOpts(position='right'))
bar.set_global_opts(
    title_opts=opts.TitleOpts(title="眼霜市场 - 功能卖点与平均销量TOP10"),
    yaxis_opts=opts.AxisOpts(name='功能'),
    xaxis_opts=opts.AxisOpts(name='平均销量'),
)
bar.load_javascript()

<pyecharts.render.display.Javascript at 0x1ffdecd3808>

In [299]:
# Render the chart currently bound to `bar` (market keywords vs. mean sales).
bar.render_notebook()

In [300]:
# Mean sales of 丸美 items for each of the ten highest-ranked keywords.
# NOTE(review): the keywords come from the whole-market list
# (keywords_count_list_all), not keywords_count_list_marubi - confirm this
# is the intended comparison.
keywords_mean_sales_marubi = {
    word: filter_keyword_sales(df[df.is_marubi_item], word)
    for word, _ in reversed(keywords_count_list_all[:10])
}
bar = pyecharts.charts.Bar()
bar.add_xaxis(list(keywords_mean_sales_marubi.keys()))
bar.add_yaxis("", list(keywords_mean_sales_marubi.values()))
bar.reversal_axis()
bar.set_series_opts(label_opts=opts.LabelOpts(position='right'))
bar.set_global_opts(
    title_opts=opts.TitleOpts(title="丸美眼霜 - 功能卖点与平均销量TOP10"),
    yaxis_opts=opts.AxisOpts(name='功能'),
    xaxis_opts=opts.AxisOpts(name='平均销量'),
)
bar.load_javascript()

<pyecharts.render.display.Javascript at 0x1ffdcb52888>

In [301]:
# Render the chart currently bound to `bar` (丸美 keywords vs. mean sales).
bar.render_notebook()

In [302]:
def filter_keyword_count(df, keyword):
    """Count the rows of ``df`` whose ``raw_title_cut`` tokens contain ``keyword``."""
    contains_keyword = df.raw_title_cut.map(lambda tokens: keyword in tokens)
    matching_rows = df[contains_keyword]
    return int(len(matching_rows))

In [303]:
# Number of items mentioning each of the ten highest-ranked market keywords.
# The chart is about item counts, so it must call filter_keyword_count;
# filter_keyword_sales would just repeat the mean-sales chart.
keywords_mean_count_all = {i[0]:filter_keyword_count(df,i[0]) for i in reversed(keywords_count_list_all[:10])}
bar = (
    pyecharts.charts.Bar()
    .add_xaxis(list(keywords_mean_count_all.keys()))
    .add_yaxis("", list(keywords_mean_count_all.values()))
    .reversal_axis()
    .set_series_opts(label_opts=opts.LabelOpts(position='right'))
    .set_global_opts(
        title_opts = opts.TitleOpts(title="眼霜市场 - 功能与商品数量 TOP10"),
        yaxis_opts = opts.AxisOpts(name='功能'),
        xaxis_opts = opts.AxisOpts(name='商品数量')
    )
)
bar.load_javascript()

<pyecharts.render.display.Javascript at 0x1ffdc945c08>

In [304]:
# Render the chart currently bound to `bar` (market keywords vs. item count).
bar.render_notebook()

In [305]:
# Number of 丸美 items mentioning each of the ten highest-ranked market keywords.
# The chart is about item counts, so it must call filter_keyword_count;
# filter_keyword_sales would just repeat the mean-sales chart.
keywords_mean_count_marubi = {i[0]:filter_keyword_count(df[df.is_marubi_item],i[0]) for i in reversed(keywords_count_list_all[:10])}
bar = (
    pyecharts.charts.Bar()
    .add_xaxis(list(keywords_mean_count_marubi.keys()))
    .add_yaxis("", list(keywords_mean_count_marubi.values()))
    .reversal_axis()
    .set_series_opts(label_opts=opts.LabelOpts(position='right'))
    .set_global_opts(
        title_opts = opts.TitleOpts(title="丸美眼霜 - 功能与商品数量 TOP10"),
        yaxis_opts = opts.AxisOpts(name='功能'),
        xaxis_opts = opts.AxisOpts(name='商品数量')
    )
)
bar.load_javascript()

<pyecharts.render.display.Javascript at 0x1ffdfdd8c88>

In [306]:
# Render the chart currently bound to `bar` (丸美 keywords vs. item count).
bar.render_notebook()

### 结论分析二

#### 整体市场

1. 在整体眼霜行业，商家产品依旧主打眼霜的基本功能：淡化细纹、补水保湿，各类功能卖点在销量上差别不大。
2. 在销量方面，各产品功能的销量排名大致与商家宣传的热点相似，__其中"提拉"眼霜产品数量虽然较少，但销量位居第五名，可能是未来买家的消费新趋势__.

#### 丸美眼霜

1. 丸美的卖点与整体行业相似，但由于在天猫上销售丸美产品的店铺多达 __126__ 家，销售代理方面有些复杂，导致"官网"、"正品"等成为商家们的宣传口号。
    - __潜在问题__：过多的专卖店和代理店铺可能导致商品销售商家宣传过于混乱，各自标榜正品,导致买家在品牌认知上难以一致，对旗舰店会造成品牌认知干扰。
2. 丸美眼霜的商品数量和销量大致呈线性关系，产品功能开发上与市场需求比较符合。
3. 含"正品"关键字的产品数量高达176个的原因：
    - 主要是专卖店的商品宣传中都会含"正品"关键字,其中天猫上所有专卖店标有"正品"的商品合计为126个。

In [307]:
def filter_price(df_series, num):
    """Bin prices into ``num`` equal-width intervals after clipping outliers.

    Values outside the fences ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` are clipped
    to the nearest fence before binning, so extreme prices do not stretch the
    bins. The outermost intervals can afterwards be widened to the true
    minimum/maximum with ``fix_cut``.

    Args:
        df_series: Numeric Series of prices.
        num: Number of bins passed to ``pd.cut``.

    Returns:
        The categorical Series of intervals produced by ``pd.cut``.
    """
    q1 = df_series.quantile(0.25)
    q3 = df_series.quantile(0.75)
    whisker = (q3 - q1) * 1.5
    clipped = df_series.clip(lower=q1 - whisker, upper=q3 + whisker)
    return pd.cut(clipped, num)

def fix_cut(x, min_, max_):
    """Widen a price bin so that it covers the original price.

    Args:
        x: Two-element row ``(price, price_cut)``: the original price and the
            interval it was assigned to.
        min_: Lower bound to use when the price lies below its bin.
        max_: Upper bound to use when the price lies above its bin.

    Returns:
        ``pd.Interval(min_, right)`` when the price is below the bin,
        ``pd.Interval(left, max_)`` when it is above, else the bin unchanged.
    """
    # Unpack by position without integer-key lookup: x[0] on a label-indexed
    # row Series is a deprecated positional fallback in recent pandas.
    values = list(x)
    price, price_cut = values[0], values[1]
    if not isinstance(price_cut, pd.Interval):
        # No bin was assigned (e.g. NaN); there is nothing to widen.
        return price_cut
    if price < price_cut.left:
        return pd.Interval(min_, price_cut.right)
    if price > price_cut.right:
        return pd.Interval(price_cut.left, max_)
    return price_cut

In [308]:
# Bin the prices into 10 intervals, then widen the outermost bins so they
# cover the prices that were clipped as outliers.
# The overall min/max are computed once instead of once per row.
price_min, price_max = df.view_price.min(), df.view_price.max()
df['price_cuts'] = filter_price(df.view_price, 10)
df['price_cuts'] = df[['view_price', 'price_cuts']].apply(lambda row: fix_cut(row, price_min, price_max), axis=1)

In [309]:
# Item count per price interval, ordered by interval.
data = (
    df.price_cuts.value_counts()
    # pandas >= 2.0 names the value_counts index after the column
    # ('price_cuts'); pin it to 'index' so the lookups below work on every
    # pandas version.
    .rename_axis('index')
    .reset_index(name='count')
    .sort_values('index')
)
# Intervals are turned into strings only after sorting, for use as axis labels.
data['index'] = data['index'].map(str)
bar4 = (
    pyecharts.charts.Bar()
    .add_xaxis(data['index'].tolist())
    .add_yaxis('', data['count'].tolist())
    .set_global_opts(title_opts=opts.TitleOpts(title='眼霜市场 - 商品价格区间分布柱状图'))
)
bar4.load_javascript()

<pyecharts.render.display.Javascript at 0x1ffdfed5188>

In [310]:
# Render the price-interval distribution bar chart inline.
bar4.render_notebook()

In [311]:
# Item count per price interval, ordered by interval.
data = (
    df.price_cuts.value_counts()
    # pandas >= 2.0 names the value_counts index after the column
    # ('price_cuts'); pin it to 'index' so the lookups below work on every
    # pandas version.
    .rename_axis('index')
    .reset_index(name='count')
    .sort_values('index')
)
data['index'] = data['index'].map(str)
pie = (
    pyecharts.charts.Pie()
    .add("", data.values.tolist())
    .set_global_opts(title_opts=opts.TitleOpts(title=''))
    # The label formatter has to be passed as label_opts; as title_opts it
    # was stored as an unknown series key and never applied.
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}:{c}"))
)
pie.load_javascript()

<pyecharts.render.display.Javascript at 0x1ffe027e248>

In [312]:
# The pie chart was built with an empty title, so print its caption before rendering it.
print("眼霜市场 - 商品价格区间分布饼图\n")
pie.render_notebook()

眼霜市场 - 商品价格区间分布饼图



### 结论分析三

1. 眼霜整体市场的价格分布主要集中在57-204左右，大约占比58%