In [10]:
import pandas as pd
import numpy as np
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_NOTEBOOK
from pyecharts import options as opts
from pyecharts.charts import Map, Bar, Line, Timeline, Pie
from pyecharts.components import Table
from pyecharts.options import ComponentTitleOpts
from pyecharts.faker import Faker


In [11]:
# Closed-company dataset from the 和鲸 (Heywhale) community:
# 6,272 records, about 2.3 MB on disk, 21 fields.
csv_path = "com.csv"
data = pd.read_csv(csv_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6272 entries, 0 to 6271
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   bianh         6272 non-null   int64  
 1   com_name      6272 non-null   object 
 2   com_addr      6272 non-null   object 
 3   cat           6272 non-null   object 
 4   se_cat        6271 non-null   object 
 5   com_des       6271 non-null   object 
 6   born_data     6272 non-null   object 
 7   death_data    6272 non-null   object 
 8   live_days     6272 non-null   int64  
 9   financing     6272 non-null   object 
 10  total_money   805 non-null    float64
 11  death_reason  1419 non-null   object 
 12  invest_name   600 non-null    object 
 13  ceo_name      4839 non-null   object 
 14  ceo_des       4838 non-null   object 
 15  ceo_per_des   4839 non-null   object 
 16  Unnamed: 16   0 non-null      float64
 17  Unnamed: 17   0 non-null      float64
 18  Unnamed: 18   0 non-null    

In [12]:
# Count rows that pandas flags as duplicates of an earlier row.
data.duplicated().sum()

0

In [13]:
# Regional distribution of closed companies.
# Strip surrounding whitespace so one region is not counted under several keys.
data["com_addr"] = data["com_addr"].str.strip()

# Number of closed companies per region, keyed by region name.
# Deliberately NOT named `dict` / `map`: rebinding those builtins at kernel
# scope breaks every later `dict(...)` / `map(...)` call in the session.
addr_counts = data.groupby("com_addr").size().to_dict()

region_map = (
    Map()
    .add("倒闭公司数量", [*addr_counts.items()], "china")
    .set_global_opts(
        title_opts=opts.TitleOpts(title="地区分布"),
        # Upper bound of the colour scale.
        visualmap_opts=opts.VisualMapOpts(max_=200),
    )
)

region_map.render_notebook()

In [14]:
# Top 10 industries by number of closed companies.
category = data.groupby("cat").size().sort_values(ascending=False)[:10]

# Industry -> count, in descending order of count.
# Named so the builtin `dict` is not rebound at kernel scope.
category_counts = category.to_dict()

bar = (
    Bar()
    .add_xaxis([*category_counts.keys()])
    .add_yaxis(
        "倒闭数量",
        [*category_counts.values()],
        label_opts=opts.LabelOpts(position="top"),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="行业排行top10"),
    )
)

bar.render_notebook()

In [15]:
# Top 20 sub-sectors by number of closed companies.
# Take the 20 largest, then re-sort ascending: the chart is drawn with the
# axes swapped, so the largest bar ends up at the top.
secondary_category = (
    data.groupby("se_cat")
    .size()
    .sort_values(ascending=False)[:20]
    .sort_values()
)

# Sub-sector -> count. Named so the builtin `dict` is not rebound at kernel scope.
secondary_counts = secondary_category.to_dict()

bar = (
    Bar()
    .add_xaxis([*secondary_counts.keys()])
    .add_yaxis(
        "倒闭数量",
        [*secondary_counts.values()],
        label_opts=opts.LabelOpts(position="right"),
    )
    .set_global_opts(
        # Title fixed: this chart shows the sub-sector top 20, not the
        # industry top 10 (the old title was copied from the previous cell).
        title_opts=opts.TitleOpts(title="细分领域top20"),
    )
    .reversal_axis()
)

bar.render_notebook()

In [19]:
# Year distribution, step 1: fix the misspelled date columns and parse them.
#
# The rename is done first and without `inplace`, so this cell can be re-run
# safely: `rename` ignores keys that are no longer present, and `to_datetime`
# on an already-parsed column is a no-op. (Previously a second run raised
# KeyError because "born_data" had already been renamed away.)
data = data.rename(columns={"born_data": "born_date", "death_data": "death_date"})

for date_col in ("born_date", "death_date"):
    data[date_col] = pd.to_datetime(data[date_col])

data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6272 entries, 0 to 6271
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   bianh         6272 non-null   int64         
 1   com_name      6272 non-null   object        
 2   com_addr      6272 non-null   object        
 3   cat           6272 non-null   object        
 4   se_cat        6271 non-null   object        
 5   com_des       6271 non-null   object        
 6   born_date     6272 non-null   datetime64[ns]
 7   death_date    6272 non-null   datetime64[ns]
 8   live_days     6272 non-null   int64         
 9   financing     6272 non-null   object        
 10  total_money   805 non-null    float64       
 11  death_reason  1419 non-null   object        
 12  invest_name   600 non-null    object        
 13  ceo_name      4839 non-null   object        
 14  ceo_des       4838 non-null   object        
 15  ceo_per_des   4839 non-null   object  

In [20]:
# Companies founded / closed per calendar year, keyed by year.
born_com = data.groupby(data["born_date"].dt.year).size().to_dict()
dead_com = data.groupby(data["death_date"].dt.year).size().to_dict()

# One frame per series. The dict views are materialised as lists so the
# constructor never sees a set-like `dict_keys` object.
df1 = pd.DataFrame({"year": list(born_com.keys()), "born": list(born_com.values())})
df2 = pd.DataFrame({"year": list(dead_com.keys()), "dead": list(dead_com.values())})

# Outer merge: a year that appears in only one series is kept with a count of
# 0 for the other one, instead of being silently dropped by an inner join.
df = (
    pd.merge(df1, df2, on="year", how="outer")
    .fillna({"born": 0, "dead": 0})
    .astype({"born": int, "dead": int})
)

# Keep years after 2008. Compare numerically (not as strings), then sort and
# only afterwards rebuild the index so it follows the displayed order.
df = df[df["year"] > 2008].sort_values(by="year").reset_index(drop=True)

bar = (
    Bar()
    .add_xaxis(df["year"].tolist())
    .add_yaxis("新成立企业数", df["born"].tolist(), label_opts=opts.LabelOpts(position="top"))
    .add_yaxis("倒闭企业数", df["dead"].tolist(), label_opts=opts.LabelOpts(position="top"))
    .set_global_opts(title_opts=opts.TitleOpts(title="年份分布"))
)

bar.render_notebook()

In [21]:
# Company lifetime brackets.

def live_year(x):
    """Map a lifetime in days to its bracket label.

    A year is counted as 365 days. Each bracket includes its lower bound and
    excludes its upper bound; anything that is not below 10 years falls into
    the last bracket.

    Args:
        x: Lifetime in days.

    Returns:
        The label of the bracket that ``x`` falls into.
    """
    # (exclusive upper bound in years, label), in ascending order.
    brackets = (
        (1, '不到1年'),
        (2, '1-2年'),
        (3, '2-3年'),
        (4, '3-4年'),
        (5, '4-5年'),
        (10, '5-10年'),
    )
    for upper_years, label in brackets:
        if x < 365 * upper_years:
            return label
    return '10年以上'

# Number of companies in each lifetime bracket.
s = data.groupby(data['live_days'].map(live_year)).size()

# groupby orders the bracket labels as plain strings
# ('1-2年', '10年以上', '2-3年', ...), which scrambles the ordered bins.
# Emit the slices in chronological order instead; a bracket with no
# companies is simply left out.
bracket_order = ['不到1年', '1-2年', '2-3年', '3-4年', '4-5年', '5-10年', '10年以上']
pie_data = [(label, int(s[label])) for label in bracket_order if label in s.index]

pie = (
    Pie()
    .add("企业存活时间", pie_data)
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c} ({d}%)"))
)

pie.render_notebook()

In [55]:
from pyecharts.charts import WordCloud
from pyecharts.globals import SymbolType

# Investor word cloud.
# Each non-missing `invest_name` cell holds one or more investor names
# separated by "&"; count how often each name occurs.
investor_counts = {}
for names in data["invest_name"].dropna():
    for name in names.split("&"):
        investor_counts[name] = investor_counts.get(name, 0) + 1

# Order by frequency, most frequent first (ties keep first-seen order).
ranked_investors = sorted(investor_counts.items(), key=lambda kv: kv[1], reverse=True)
investors = {name: count for name, count in ranked_investors}

wc = (
    WordCloud()
    .add("投资人", [*investors.items()], shape=SymbolType.DIAMOND)
    .set_global_opts(title_opts=opts.TitleOpts(title="投资人词云"))
)

wc.render_notebook()

In [66]:
# Closure-reason word cloud.
# Each non-missing `death_reason` cell is split on whitespace and every
# resulting token is counted.
reason_counts = {}
for reasons in data["death_reason"].dropna():
    for reason in reasons.split():
        reason_counts[reason] = reason_counts.get(reason, 0) + 1

# Order by frequency, most frequent first (ties keep first-seen order).
ranked_reasons = sorted(reason_counts.items(), key=lambda kv: kv[1], reverse=True)
death_reasons = {reason: count for reason, count in ranked_reasons}

# Only the 10 most frequent reasons are drawn.
top_reasons = [*death_reasons.items()][:10]

wc = (
    WordCloud()
    .add("", top_reasons, shape=SymbolType.DIAMOND)
    .set_global_opts(title_opts=opts.TitleOpts(title="倒闭词云"))
)

wc.render_notebook()

In [74]:
import jieba

# CEO description word cloud.
# Segment every non-missing `ceo_per_des` text with jieba and count the tokens.

des = {}

for text in data["ceo_per_des"].values:
    if pd.isna(text):
        continue
    for word in jieba.lcut(text):
        # Ignore single-character tokens, but keep scanning the rest of the
        # description. This used to be `break`, which threw away every token
        # after the first single-character one.
        if len(word) == 1:
            continue
        des[word] = des.get(word, 0) + 1

# Order by frequency, most frequent first (ties keep first-seen order).
des = {
    word: count
    for word, count in sorted(des.items(), key=lambda kv: kv[1], reverse=True)
}

wc = (
    WordCloud()
    # Only the 20 most frequent tokens are drawn.
    .add("", [*des.items()][:20])
    .set_global_opts(title_opts=opts.TitleOpts(title="CEO描述词云"))
)

wc.render_notebook()