# 字段说明
——表1 user.csv：

- user_id 用户id
- register_time 注册时间
- recently_logged 最近访问时间
- learn_time 学习时间（分）
- number_of_classes_join 加入班级数
- number_of_classes_out 退出班级数
- school 用户所属学校

——表 2 study_information.csv 字段说明：

- user_id 用户 id
- course_id 课程 id
- course_join_time 加入课程的时间
- learn_process 学习进度
- price 课程单价

——表 3 login.csv 字段说明：

- 字段名 描述
- user_id 用户 id
- login_time 登录时间
- login_place 登录地址

In [1]:
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = 'all' #默认为'last'

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import datetime
import jieba
%matplotlib inline
import matplotlib.pyplot as plt

# 任务二：用户整体情况分析
- 任务 2.1 分别绘制各省份与各城市平台登录次数热力地图，并分析用户分布情况。
- 任务 2.2 分别绘制工作日与非工作日各时段的用户登录次数柱状图，并分析用户活跃的主要时间段。
- 任务 2.3 记 $T_{end}$ 为数据观察窗口截止时间（如：赛题数据的采集截止时间为 2020 年 6 月 18 日），$T_i$ 为用户 i 的最近访问时间，$\sigma_i = T_{end} - T_i$，若 $\sigma_i > 90$ 天，则称用户 i 为流失用户。根据该定义计算平台用户的流失率。
- 任务 2.4 根据任务 2.1 至任务 2.3，分析平台用户的活跃度，为该教育平台的线上管理决策提供建议。

## 解决思路
- 用户分布分析：根据海内外、省份分析、乡镇分析入手,找到核心差异点所在
- 用户活跃度分析：细分整体情况与工作日差异
- 用户流失情况分析：细分整体情况与用户流失风险
- 线上管理决策建议：宣传、活跃度、流失为切口进行分析

## 用户分布分析

### 海外分布

In [2]:
# Reload the login log whose login_place has already been split into
# country (国家) / province (省份) / city (地区) columns.
LOGIN_SPLIT_CSV = './part01/login_area_split.csv'
login = pd.read_csv(LOGIN_SPLIT_CSV)
login

Unnamed: 0,user_id,login_time,login_place,国家,省份,地区
0,用户3,2018-09-06 09:32:47,中国广东广州,中国,广东,广州
1,用户3,2018-09-07 09:28:28,中国广东广州,中国,广东,广州
2,用户3,2018-09-07 09:57:44,中国广东广州,中国,广东,广州
3,用户3,2018-09-07 10:55:07,中国广东广州,中国,广东,广州
4,用户3,2018-09-07 12:28:42,中国广东广州,中国,广东,广州
...,...,...,...,...,...,...
387139,用户44247,2020-06-18 07:41:22,中国湖北武汉,中国,湖北,武汉
387140,用户44247,2020-06-18 08:13:28,中国湖北,中国,湖北,
387141,用户44248,2020-06-18 09:09:07,中国天津,中国,天津,
387142,用户44249,2020-06-18 09:43:15,中国北京,中国,北京,


In [3]:
login['国家'].value_counts()

中国    386914
英国       151
越南        27
德国        24
荷兰         8
波兰         7
捷克         4
南非         3
泰国         2
希腊         1
瑞士         1
挪威         1
瑞典         1
Name: 国家, dtype: int64

In [4]:
from pyecharts import options as opts
from pyecharts.charts import Grid, Line, Pie
from pyecharts.globals import ThemeType

# Login counts for every country other than 中国, most frequent first.
overseas_counts = login.loc[login['国家'] != '中国', '国家'].value_counts()
foreign_country = overseas_counts.index.tolist()
foreign_country_login = overseas_counts.values.tolist()

# Markers shared by the line series: highest / lowest point and the average line.
overseas_mark_points = opts.MarkPointOpts(data=[
    opts.MarkPointItem(type_='max', name='最多登录次数'),
    opts.MarkPointItem(type_='min', name='最小登录次数'),
])
overseas_mark_line = opts.MarkLineOpts(data=[
    opts.MarkLineItem(type_='average', name='平均登录次数'),
])

# Line chart: login count per foreign country.
line = (
    Line(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add_xaxis(foreign_country)
    .add_yaxis(
        'foreign_country',
        foreign_country_login,
        label_opts=opts.LabelOpts(is_show=False),
        markpoint_opts=overseas_mark_points,
        markline_opts=overseas_mark_line,
    )
    .set_global_opts(legend_opts=opts.LegendOpts(is_show=False))
)

# Donut chart of the same counts, positioned on the right-hand side.
overseas_pairs = [[country, count] for country, count in zip(foreign_country, foreign_country_login)]
overseas_legend = opts.LegendOpts(
    is_show=True,
    orient="horizontal",  # alternative: "vertical"
    pos_top="1%",
    pos_right="1%",
    legend_icon="circle",
)
overseas_tooltip = opts.TooltipOpts(trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)")
pie = (
    Pie(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add(
        series_name="foreign country",
        data_pair=overseas_pairs,
        radius=["45%", "65%"],
        center=["70%", "45%"],
        label_opts=opts.LabelOpts(is_show=False),
    )
    .set_global_opts(legend_opts=overseas_legend)
    .set_series_opts(tooltip_opts=overseas_tooltip)
)

# Combine both charts on one canvas.
grid = (
    Grid(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add(line, grid_opts=opts.GridOpts(pos_left="10%"))
    .add(pie, grid_opts=opts.GridOpts(pos_right="1%"))
)
grid.render_notebook()

In [5]:
login.groupby('国家').count()['user_id']

国家
中国    386914
南非         3
希腊         1
德国        24
挪威         1
捷克         4
波兰         7
泰国         2
瑞典         1
瑞士         1
英国       151
荷兰         8
越南        27
Name: user_id, dtype: int64

- 平台业务主要集中在中国，国内登录次数占比在 99% 以上
- 海外地区业务占比极小，仅在欧洲等地区有零星登录，其中英国的登录占比相对较高

### 国内分布

In [6]:
from pyecharts import options as opts
from pyecharts.charts import Bar

# Login counts per Chinese province, most active first.
province_counts = login.loc[login['国家'] == '中国', '省份'].value_counts()
china_province = province_counts.index.tolist()
china_province_login = province_counts.values.tolist()
# Light-to-dark palette; later cells reuse it as well.
colors = ['#C7E1D4','#ccebc5','#a8ddb5','#7bccc4','#4eb3d3','#2b8cbe','#0868ac','#084081']

# Colour scale spanning the observed min..max login counts, split into 5 bands.
province_visualmap = opts.VisualMapOpts(
    max_=max(china_province_login),
    min_=min(china_province_login),
    range_color=colors[:5],
    range_opacity=100,
    split_number=5,
    orient="vertical",
    pos_right="1%",
    pos_top="center",
)
province_mark_points = opts.MarkPointOpts(data=[
    opts.MarkPointItem(type_="max", name="最大值"),
    opts.MarkPointItem(type_="min", name="最小值"),
    opts.MarkPointItem(type_="average", name="平均值"),
])
province_mark_line = opts.MarkLineOpts(data=[
    opts.MarkLineItem(type_="average", name="平均登录次数"),
])

c = (
    Bar(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add_xaxis(china_province)
    .add_yaxis("中国省份", china_province_login)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="中国各省的登陆情况", pos_left="center"),
        datazoom_opts=opts.DataZoomOpts(),
        visualmap_opts=province_visualmap,
        legend_opts=opts.LegendOpts(is_show=True, pos_left="10%"),
    )
    .set_series_opts(
        label_opts=opts.LabelOpts(is_show=False),
        markpoint_opts=province_mark_points,
        markline_opts=province_mark_line,
    )
)
c.render_notebook()

In [7]:
from pyecharts import options as opts
from pyecharts.charts import Map

# [province, login count] pairs in the shape Map.add expects.
province_pairs = [[name, total] for name, total in zip(china_province, china_province_login)]

# The colour scale is capped at 40000 (the data-driven minimum is left disabled),
# so any province above the cap is drawn with the darkest colour.
map_visualmap = opts.VisualMapOpts(
    max_=40000,
    #min_=min(china_province_login),
    range_color=colors[2::],
    range_opacity=100,
)

c = (
    Map(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add("登录次数", province_pairs, "china")
    .set_global_opts(
        title_opts=opts.TitleOpts(title="中国地域登录分布", pos_left='center'),
        visualmap_opts=map_visualmap,
        legend_opts=opts.LegendOpts(is_show=False),
    )
)
c.render_notebook()

In [8]:
# Per-province statistics for domestic logins only. Both the login count and the
# user count are taken from the same 中国-filtered rows, so a foreign record whose
# 省份 value collides with a Chinese province name cannot inflate the user count.
china_login_rows = login[login['国家'] == '中国']
# Number of login records per province (single column: user_id).
province_login_cnt = china_login_rows.groupby('省份')[['user_id']].count()
# Number of distinct users per province (single column: user_id).
province_users_cnt = china_login_rows.groupby('省份')[['user_id']].nunique()
# Average logins per user; the two frames share the same index and column name.
province_avg_login = (province_login_cnt / province_users_cnt).round(2)

In [9]:
# Give each single-column frame its display label (in place) before they are combined.
for stat_frame, stat_label in (
    (province_login_cnt, '登录总数'),
    (province_users_cnt, '用户总数'),
    (province_avg_login, '平均登录'),
):
    stat_frame.columns = [stat_label]

In [10]:
# Column-wise concatenation; the inner join keeps only provinces present in all three frames.
china_login_summary = pd.concat(
    objs=[province_login_cnt, province_users_cnt, province_avg_login],
    join='inner',
    axis=1,
)

In [11]:
# Share of all domestic logins per province, then the running share once the
# provinces are ordered from most to fewest logins.
total_province_logins = china_login_summary['登录总数'].sum()
china_login_summary['登录次数百分比'] = (china_login_summary['登录总数'] * 100 / total_province_logins).round(2)
china_login_summary = china_login_summary.sort_values(by='登录总数', ascending=False)
china_login_summary['登录数累积百分比'] = (
    china_login_summary['登录总数'].cumsum() * 100 / total_province_logins
).round(2)

In [12]:
china_login_summary

Unnamed: 0_level_0,登录总数,用户总数,平均登录,登录次数百分比,登录数累积百分比
省份,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
广东,120887,8981,13.46,31.86,31.86
湖北,33149,3049,10.87,8.74,40.6
贵州,18786,1274,14.75,4.95,45.55
河南,18550,1730,10.72,4.89,50.44
山东,14874,1843,8.07,3.92,54.36
河北,14708,1711,8.6,3.88,58.24
广西,14052,1432,9.81,3.7,61.94
浙江,13366,1885,7.09,3.52,65.46
重庆,13163,1295,10.16,3.47,68.93
湖南,13103,1509,8.68,3.45,72.38


- 地域分布：该平台业务覆盖较为广泛，全国各省份皆有用户分布
- 集中分布：用户主要集中分布在华中、华南、华东地区
- 稀疏分布：较为偏远地区：西藏，新疆，内蒙古分布较少，同时港澳的使用用户也较少

In [13]:
# The summary is already sorted by 登录总数 descending, so the first five index
# labels are the five provinces with the most logins.
top5_province = china_login_summary.index[:5].tolist()
top5_province

['广东', '湖北', '贵州', '河南', '山东']

- 将地区为nan的值替换为暂无

In [14]:
login[login['地区'].isnull()]

Unnamed: 0,user_id,login_time,login_place,国家,省份,地区
8,用户3,2018-09-10 14:04:32,中国北京,中国,北京,
10,用户3,2018-09-10 17:38:36,中国广东,中国,广东,
371,用户3,2019-01-17 10:35:32,中国香港,中国,香港,
437,用户3,2019-02-18 10:18:09,中国北京,中国,北京,
445,用户3,2019-02-19 10:11:49,中国北京,中国,北京,
...,...,...,...,...,...,...
387125,用户44226,2020-06-17 10:41:57,中国贵州,中国,贵州,
387126,用户44227,2020-06-17 11:43:48,中国陕西,中国,陕西,
387140,用户44247,2020-06-18 08:13:28,中国湖北,中国,湖北,
387141,用户44248,2020-06-18 09:09:07,中国天津,中国,天津,


In [15]:
# Records without a city get the placeholder '暂无' so they form their own group later.
login['地区'] = login['地区'].where(login['地区'].notna(), '暂无')

### 广东省分布(top1)

In [16]:
# Guangdong: login records per city, largest first (index keeps the pre-sort positions).
guangdong_login = (
    login[login['省份'] == '广东']
    .groupby('地区')[['user_id']]
    .count()
    .rename(columns={'user_id': '登录次数'})
    .reset_index()
    .sort_values(by='登录次数', ascending=False)
)

In [17]:
guangdong_login

Unnamed: 0,地区,登录次数
4,广州,27626
7,暂无,18509
9,汕头,10146
13,深圳,9098
5,惠州,6557
6,揭阳,6083
15,湛江,5500
0,东莞,4565
3,佛山,4217
8,梅州,4026


In [18]:
from pyecharts import options as opts
from pyecharts.charts import Pie

# Pie chart of Guangdong login records by city. The slices are login counts
# (column 登录次数), not distinct users, so the title says 登录次数 rather than 登录人数.
guangdong_pairs = [list(z) for z in zip(guangdong_login['地区'], guangdong_login['登录次数'])]

c = (
    Pie(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add(
        "",
        guangdong_pairs,
        center=["35%", "50%"],
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="广东省各地区登录次数占比",pos_left="center"),
        legend_opts=opts.LegendOpts(is_show=False,pos_left="15%"),
    )
    # Label format: city name, absolute count, percentage of the pie.
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}:{c} {d}%"))
)
c.render_notebook()

- 广东的各区域分布(120887)：
    - 广州(登陆数量)：27626(22.85%)
    - 汕头(登陆数量)：10146(8.39%)
    - 深圳(登陆数量)：9098(7.53%)
    - 惠州(登陆数量)：6557(5.42%)
    - 缺失数据：18509(15.31%)

### 湖北省分布(top2)

In [19]:
# Hubei: login records per city, largest first (index keeps the pre-sort positions).
hubei_login = (
    login[login['省份'] == '湖北']
    .groupby('地区')[['user_id']]
    .count()
    .rename(columns={'user_id': '登录次数'})
    .reset_index()
    .sort_values(by='登录次数', ascending=False)
)

In [20]:
hubei_login

Unnamed: 0,地区,登录次数
7,暂无,8516
8,武汉,6534
16,黄冈,2935
13,襄阳,2359
11,荆州,2038
4,孝感,1733
12,荆门,1693
15,随州,1203
1,十堰,1005
2,咸宁,942


In [21]:
from pyecharts import options as opts
from pyecharts.charts import Pie

# Pie chart of Hubei login records by city. The slices are login counts
# (column 登录次数), not distinct users, so the title says 登录次数 rather than 登录人数.
hubei_pairs = [list(z) for z in zip(hubei_login['地区'], hubei_login['登录次数'])]

c = (
    Pie(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add(
        "",
        hubei_pairs,
        center=["35%", "50%"],
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="湖北省各地区登录次数占比",pos_left="center"),
        legend_opts=opts.LegendOpts(is_show=False,pos_left="15%"),
    )
    # Label format: city name, absolute count, percentage of the pie.
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}:{c} {d}%"))
)
c.render_notebook()

- 湖北的各区域分布(33149)：
    - 武汉(登录数量)：6534(占全省 19.71%；剔除缺失数据后 26.53%)
    - 黄冈(登录数量)：2935(占全省 8.85%；剔除缺失数据后 11.91%)
    - 襄阳(登录数量)：2359(占全省 7.12%；剔除缺失数据后 9.58%)
    - 荆州(登录数量)：2038(占全省 6.15%；剔除缺失数据后 8.27%)
    - 缺失数据：8516(占全省 25.69%)

## 用户活跃度分析

In [22]:
# Parse the timestamp once, then derive calendar features from it.
login['login_time'] = pd.to_datetime(login['login_time'])
login_clock = login['login_time'].dt
login['date'] = login_clock.date
# weekday: Monday=0, Tuesday=1, Wednesday=2, Thursday=3, Friday=4, Saturday=5, Sunday=6
login['weekday'] = login_clock.weekday
login['hour'] = login_clock.hour
login['time'] = login_clock.timetz
# Convert the python date objects back into midnight timestamps.
login['date'] = pd.to_datetime(login['date'])

In [23]:
login['date'].max()

Timestamp('2020-06-18 00:00:00')

In [24]:
import datetime
from chinese_calendar import is_workday,is_holiday

# Label every login as 工作日 (workday) or 非工作日 (non-workday) using the
# chinese_calendar workday lookup. The lookup is evaluated once per distinct
# calendar date and then mapped back onto the rows, instead of once per login.
day_type_by_date = {
    day: ('工作日' if is_workday(day) else '非工作日')
    for day in login['date'].drop_duplicates()
}
# NOTE: the column is named is_holiday but holds the two text labels above.
login['is_holiday'] = login['date'].map(day_type_by_date)

In [25]:
login

Unnamed: 0,user_id,login_time,login_place,国家,省份,地区,date,weekday,hour,time,is_holiday
0,用户3,2018-09-06 09:32:47,中国广东广州,中国,广东,广州,2018-09-06,3,9,09:32:47,工作日
1,用户3,2018-09-07 09:28:28,中国广东广州,中国,广东,广州,2018-09-07,4,9,09:28:28,工作日
2,用户3,2018-09-07 09:57:44,中国广东广州,中国,广东,广州,2018-09-07,4,9,09:57:44,工作日
3,用户3,2018-09-07 10:55:07,中国广东广州,中国,广东,广州,2018-09-07,4,10,10:55:07,工作日
4,用户3,2018-09-07 12:28:42,中国广东广州,中国,广东,广州,2018-09-07,4,12,12:28:42,工作日
...,...,...,...,...,...,...,...,...,...,...,...
387139,用户44247,2020-06-18 07:41:22,中国湖北武汉,中国,湖北,武汉,2020-06-18,3,7,07:41:22,工作日
387140,用户44247,2020-06-18 08:13:28,中国湖北,中国,湖北,暂无,2020-06-18,3,8,08:13:28,工作日
387141,用户44248,2020-06-18 09:09:07,中国天津,中国,天津,暂无,2020-06-18,3,9,09:09:07,工作日
387142,用户44249,2020-06-18 09:43:15,中国北京,中国,北京,暂无,2020-06-18,3,9,09:43:15,工作日


In [26]:
login['is_holiday'].value_counts()

工作日     280462
非工作日    106682
Name: is_holiday, dtype: int64

### 用户星期活跃度

In [27]:
# Logins that happened on workdays, counted per day of week (0=Monday .. 6=Sunday).
workday_logins = login[login['is_holiday'] == '工作日']
active_day_info = (
    workday_logins.groupby('weekday')[['user_id']]
    .count()
    .reset_index()
    .rename(columns={'user_id': '工作日活跃度'})
)

In [28]:
# Non-workday logins per day of week, attached by matching on the weekday value.
# Assigning the grouped frame directly only lines up when the row positions of
# active_day_info happen to equal the weekday numbers; mapping on the weekday
# column stays correct even if a weekday is missing from either group.
holiday_logins_by_weekday = (
    login[login['is_holiday'] == '非工作日'].groupby('weekday')['user_id'].count()
)
active_day_info['非工作日活跃度'] = (
    active_day_info['weekday'].map(holiday_logins_by_weekday).fillna(0).astype(int)
)

In [29]:
active_day_info

Unnamed: 0,weekday,工作日活跃度,非工作日活跃度
0,0,58384,5788
1,1,57439,2225
2,2,55331,639
3,3,53973,501
4,4,50241,3047
5,5,1894,43969
6,6,3200,50513


In [30]:
# Grouped bar chart: logins per day of week, workdays vs. non-workdays.
active_day = active_day_info['weekday'].values.tolist()
active_workday_day = active_day_info['工作日活跃度'].values.tolist()
active_holiday_day = active_day_info['非工作日活跃度'].values.tolist()

from pyecharts.charts import Bar
from pyecharts.globals import ThemeType

c = (
    Bar({"theme": ThemeType.MACARONS})
    .add_xaxis(active_day)
    .add_yaxis("workday", active_workday_day)
    .add_yaxis("holiday", active_holiday_day)
    .set_global_opts(
        # Title fixed: the original read "用户日常活c度分析" (garbled character).
        title_opts=opts.TitleOpts(title="用户日常活跃度分析", subtitle="工作日与非工作日差异")
        )
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False),
                     markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem(type_="max",name="Max"),
                                                             opts.MarkPointItem(type_="min",name="Min")]
                                                      ),
                     markline_opts=opts.MarkLineOpts(data=[opts.MarkLineItem(type_="average",name="Average")])
                    )
)
c.render_notebook()

### 用户小时活跃度

In [31]:
# Workday logins counted per hour of day.
workday_hour_logins = login[login['is_holiday'] == '工作日']
active_hour_info = (
    workday_hour_logins.groupby('hour')[['user_id']]
    .count()
    .reset_index()
    .rename(columns={'user_id': '工作日小时活跃度'})
)

In [32]:
# Non-workday logins per hour, attached by matching on the hour value rather than
# on row position, so the numbers stay correct if an hour is missing from either group.
holiday_logins_by_hour = (
    login[login['is_holiday'] == '非工作日'].groupby('hour')['user_id'].count()
)
active_hour_info['非工作日小时活跃度'] = (
    active_hour_info['hour'].map(holiday_logins_by_hour).fillna(0).astype(int)
)

In [33]:
active_hour_info

Unnamed: 0,hour,工作日小时活跃度,非工作日小时活跃度
0,0,3520,1538
1,1,1315,628
2,2,612,323
3,3,350,148
4,4,215,96
5,5,242,118
6,6,786,297
7,7,3734,1131
8,8,13963,3854
9,9,20203,6422


In [34]:
from pyecharts.charts import Bar
from pyecharts.globals import ThemeType

# Series for the hourly chart: x axis is the hour, one bar series per day type.
active_hour = active_hour_info['hour'].values.tolist()
active_workday_hour = active_hour_info['工作日小时活跃度'].values.tolist()
active_holiday_hour = active_hour_info['非工作日小时活跃度'].values.tolist()

hourly_mark_points = opts.MarkPointOpts(data=[
    opts.MarkPointItem(type_="max", name="Max"),
    opts.MarkPointItem(type_="min", name="Min"),
])
hourly_mark_line = opts.MarkLineOpts(data=[opts.MarkLineItem(type_="average", name="Average")])

c = (
    Bar({"theme": ThemeType.MACARONS})
    .add_xaxis(active_hour)
    .add_yaxis("workday", active_workday_hour)
    .add_yaxis("holiday", active_holiday_hour)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="用户小时活跃度分析", subtitle="工作日与非工作日差异"),
    )
    .set_series_opts(
        label_opts=opts.LabelOpts(is_show=False),
        markpoint_opts=hourly_mark_points,
        markline_opts=hourly_mark_line,
    )
)
c.render_notebook()

- 不同时期用户活跃度差异：
    - 工作日各个时段的登录量均远高于非工作日同时段的登录量
    - 工作日各时段平均登录量：11685、非工作日各时段平均登录量：4445.08
    - 工作日和非工作日的登录时段分布基本一致，均在上午 8:00-11:00、下午 14:00-17:00、晚上 20:00-21:00 出现三个高峰，波动趋势基本一致。
    - 工作日中上午、下午、夜间的登录表现呈一定程度的递减
    - 非工作日三个时间段的登录表现则较为均衡

#### 利用users表

In [35]:
# Load the user table and preview the first rows.
USERS_CSV = './part01/users.csv'
users = pd.read_csv(USERS_CSV, encoding='utf-8')
users.head()

Unnamed: 0,user_id,register_time,recently_logged,number_of_classes_join,number_of_classes_out,learn_time,school,school_info
0,用户44251,2020/6/18 9:49,2020/6/18 9:49,0,0,41.25,,0
1,用户44250,2020/6/18 9:47,2020/6/18 9:48,0,0,0.0,,0
2,用户44249,2020/6/18 9:43,2020/6/18 9:43,0,0,16.22,,0
3,用户44248,2020/6/18 9:09,2020/6/18 9:09,0,0,0.0,,0
4,用户44247,2020/6/18 7:41,2020/6/18 8:15,0,0,1.8,,0


In [36]:
# Parse both timestamp columns; values that cannot be parsed become NaT.
for time_col in ('register_time', 'recently_logged'):
    users[time_col] = pd.to_datetime(users[time_col], errors='coerce')

In [37]:
# Calendar day of each user's most recent visit, stored as a midnight timestamp.
users['recently_logged_date'] = users['recently_logged'].dt.date
users['recently_logged_date'] = pd.to_datetime(users['recently_logged_date'], errors='coerce')

# Users per last-visit day, restricted to 2020-01-01 onwards.
last_visit_2020 = (
    users[users['recently_logged_date'] >= '2020-01-01']
    .groupby(by='recently_logged_date')
    .user_id.count()
)
recently_logged_date = last_visit_2020.index.tolist()
recently_logged_count = last_visit_2020.values.tolist()

In [38]:
# Visualise activity from the users table: number of users per last-visit day.

from pyecharts import options as opts
from pyecharts.charts import Line
from pyecharts.globals import ThemeType

last_visit_mark_points = opts.MarkPointOpts(data=[
    opts.MarkPointItem(type_='max', name='最多登录次数'),
    opts.MarkPointItem(type_='min', name='最小登录次数'),
])
last_visit_mark_line = opts.MarkLineOpts(data=[
    opts.MarkLineItem(type_='average', name='平均登录次数'),
])

line = (
    Line(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add_xaxis(recently_logged_date)
    .add_yaxis(
        '登录人数',
        recently_logged_count,
        label_opts=opts.LabelOpts(is_show=False),
        markpoint_opts=last_visit_mark_points,
        markline_opts=last_visit_mark_line,
    )
    .set_global_opts(
        legend_opts=opts.LegendOpts(is_show=False),
        datazoom_opts=opts.DataZoomOpts(),
    )
)

line.render_notebook()

In [39]:
def getMaxNumIndex(index_list, index_num):
    """Return the ``(index, value)`` pair with the largest value.

    Args:
        index_list: labels, parallel to ``index_num``.
        index_num: comparable values, one per label.

    Returns:
        A tuple ``(label, value)`` for the largest value; when several values
        tie, the first one wins. For empty input the historical sentinel
        ``(-1, -1)`` is returned.

    The previous implementation seeded the running maximum with -1, so inputs
    whose values were all <= -1 wrongly produced ``(-1, -1)``.
    """
    pairs = list(zip(index_list, index_num))
    if not pairs:
        return -1, -1
    # max() keeps the first maximal element, matching the original strict ">" scan.
    return max(pairs, key=lambda pair: pair[1])

In [40]:
getMaxNumIndex(recently_logged_date, recently_logged_count)

(Timestamp('2020-06-17 00:00:00'), 491)

- 利用users表发现，2020年初至该数据统计截止的那一天，登陆人数在2020-06-17达到顶峰

## 用户流失率分析
结合该平台的行为特征，通过样本采集时间与用户最近登录时间（时间差值）将用户划分为不同群体

In [41]:
# Users whose last-visit timestamp is missing (NaT).
no_last_visit = users['recently_logged'].isna()
miss_recently_logged = users[no_last_visit]

In [42]:
# Course enrolment / progress records, one row per (user, course).
STUDY_INFO_CSV = './part01/study_information.csv'
stu_info = pd.read_csv(STUDY_INFO_CSV, encoding='utf-8')

In [43]:
# Some recently_logged values in the users table are missing: they are the rows
# whose date parsing above did not succeed.
# The latest course-join time of such a user is used as a stand-in; users that
# still have no value afterwards are regarded as churned.
study_info = pd.merge(stu_info, miss_recently_logged, on='user_id', how='inner')
study_info

Unnamed: 0,user_id,course_id,course_join_time,learn_process,price,register_time,recently_logged,number_of_classes_join,number_of_classes_out,learn_time,school,school_info,recently_logged_date
0,用户29,课程12,2018-09-30 19:18:13,width: 52%;,0.0,2018-09-30 19:17:00,NaT,0,0,100.32,,0,NaT
1,用户197,课程196,2018-10-26 19:10:25,width: 0%;,0.0,2018-10-25 19:53:00,NaT,0,0,3.1,,0,NaT
2,用户197,课程194,2018-10-26 19:10:19,width: 0%;,0.0,2018-10-25 19:53:00,NaT,0,0,3.1,,0,NaT
3,用户197,课程193,2018-10-25 19:54:23,width: 0%;,0.0,2018-10-25 19:53:00,NaT,0,0,3.1,,0,NaT
4,用户197,课程33,2018-10-25 19:54:15,width: 0%;,0.0,2018-10-25 19:53:00,NaT,0,0,3.1,,0,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18122,用户44239,课程98,2020-06-17 17:25:57,width: 0%;,99.0,2020-06-17 17:24:00,NaT,1,0,2109.75,,0,NaT
18123,用户44239,课程7,2020-06-17 17:25:57,width: 4%;,159.0,2020-06-17 17:24:00,NaT,1,0,2109.75,,0,NaT
18124,用户44240,课程6,2020-06-17 17:26:08,width: 100%;,179.0,2020-06-17 17:25:00,NaT,1,0,1667.28,,0,NaT
18125,用户44240,课程98,2020-06-17 17:26:08,width: 100%;,99.0,2020-06-17 17:25:00,NaT,1,0,1667.28,,0,NaT


In [44]:
# Latest course-join time per user among the users lacking a last-visit timestamp.
user_max_course_join_time = (
    study_info.groupby('user_id')['course_join_time'].max().reset_index()
)
user_max_course_join_time

Unnamed: 0,user_id,course_join_time
0,用户10266,2019-05-08 15:06:45
1,用户10268,2019-05-08 15:06:45
2,用户10269,2019-05-08 15:06:45
3,用户10270,2019-05-08 15:06:45
4,用户10276,2019-05-08 15:06:45
...,...,...
5178,用户8801,2019-04-15 22:04:11
5179,用户8803,2019-04-15 22:13:54
5180,用户8808,2019-04-15 23:02:54
5181,用户8822,2019-04-16 09:39:59


In [45]:
# Left join keeps every user with a missing last visit; course_join_time stays
# empty for users who never joined a course.
fill_miss_logged = pd.merge(
    miss_recently_logged,
    user_max_course_join_time,
    how='left',
    on='user_id',
)
fill_miss_logged

Unnamed: 0,user_id,register_time,recently_logged,number_of_classes_join,number_of_classes_out,learn_time,school,school_info,recently_logged_date,course_join_time
0,用户44240,2020-06-17 17:25:00,NaT,1,0,1667.28,,0,NaT,2020-06-17 17:26:08
1,用户44239,2020-06-17 17:24:00,NaT,1,0,2109.75,,0,NaT,2020-06-17 17:25:57
2,用户44235,2020-06-17 16:39:00,NaT,1,0,0,,0,NaT,2020-06-17 16:40:57
3,用户44237,2020-06-17 16:39:00,NaT,1,0,10348.62,,0,NaT,2020-06-17 16:40:58
4,用户44232,2020-06-17 16:39:00,NaT,1,0,9054.72,,0,NaT,2020-06-17 16:40:55
...,...,...,...,...,...,...,...,...,...,...
5370,用户214,2018-10-25 20:46:00,NaT,0,0,0,,0,NaT,2018-10-25 20:47:22
5371,用户197,2018-10-25 19:53:00,NaT,0,0,3.1,,0,NaT,2018-10-26 19:10:25
5372,用户151,2018-10-25 18:26:00,NaT,0,0,0,,0,NaT,
5373,用户117,2018-10-25 17:47:00,NaT,0,0,0,,0,NaT,


In [46]:
# Back-fill recently_logged for the users listed in fill_miss_logged with their
# latest course-join time. This replaces a per-user Python loop that rebuilt
# list(fill_miss_logged['user_id']) on every iteration (quadratic in the number
# of users) with a single vectorised lookup.
replacement_time = pd.to_datetime(
    fill_miss_logged
    .drop_duplicates(subset='user_id')   # first row per user, like .values[0]
    .set_index('user_id')['course_join_time']
)
needs_backfill = users['user_id'].isin(replacement_time.index)
# Users without any course record receive NaT, i.e. they stay missing.
users.loc[needs_backfill, 'recently_logged'] = (
    users.loc[needs_backfill, 'user_id'].map(replacement_time)
)

In [47]:
# Re-parse after the back-fill and refresh the derived calendar-date column.
last_visit_ts = pd.to_datetime(users['recently_logged'])
users['recently_logged'] = last_visit_ts
users['recently_logged_date'] = last_visit_ts.dt.date

In [48]:
# Observation-window cutoff (data collection ended on 2020-06-18).
users['acquisition_time'] = '2020-6-18 00:00:00'
users['acquisition_time'] = pd.to_datetime(users['acquisition_time'])
# Whole calendar days between the last visit and the cutoff date.
# Working on dates (time of day stripped) gives 0 for a visit on the cutoff day
# itself and avoids the previous "floor the timedelta, then add 1" shortcut,
# which counted a visit at exactly 00:00:00 as one day older than a visit one
# second later on the same date.
users['time_range'] = users['acquisition_time'] - users['recently_logged'].dt.normalize()
users['time_range'] = users['time_range'].dt.days

In [49]:
users[['time_range']].describe()

Unnamed: 0,time_range
count,43721.0
mean,190.122481
std,170.86054
min,0.0
25%,49.0
50%,115.0
75%,343.0
max,646.0


In [50]:
# Segment users by days since last visit:
#   [0, 60]   -> 活跃用户 (active)
#   (60, 90]  -> 潜在用户 (at risk)
#   (90, 999] -> 流失用户 (churned; matches the "more than 90 days" definition)
# include_lowest=True keeps users with time_range == 0 (visited on the cutoff
# day) in the first bin; without it they fall outside every interval, become
# NaN and drop out of every count computed from user_label.
# NOTE(review): users whose time_range is still NaN (no last visit and no course
# record) remain unlabelled here although the notes above treat them as churned
# - confirm whether they should be counted as 流失用户.
bins = [0,60,90,999]
users['user_label'] = pd.cut(
    x=users['time_range'],
    bins=bins,
    labels=['活跃用户','潜在用户','流失用户'],
    include_lowest=True,
)

In [51]:
from pyecharts import options as opts
from pyecharts.charts import Bar

# Users per segment label, largest segment first.
segment_sizes = users['user_label'].value_counts()

bar = (
    Bar({"theme": ThemeType.MACARONS})
    .add_xaxis(segment_sizes.index.tolist())
    .add_yaxis("", segment_sizes.values.tolist())
    .set_global_opts(
        title_opts=opts.TitleOpts(title="用户群体分类", pos_left="center"),
    )
)

bar.render_notebook()

In [52]:
# Churn rate: share of labelled users that fall into the 流失用户 segment
# (users without a label are excluded from the denominator).
segment_share = users['user_label'].value_counts(normalize=True)
loss_rate = round(segment_share['流失用户'], 4)

In [53]:
from pyecharts import options as opts
from pyecharts.charts import Liquid
from pyecharts.globals import SymbolType

# Liquid-fill gauge showing the churn rate inside a diamond outline.
churn_title = opts.TitleOpts(title="用户流失率", pos_right="center")
c = (
    Liquid()
    .add(
        "流失率",
        [loss_rate],
        is_outline_show=False,
        shape=SymbolType.DIAMOND,
    )
    .set_global_opts(title_opts=churn_title)
)
c.render_notebook()

In [54]:
# Number of users for each "days since last visit" value.
users_per_gap = users.groupby('time_range')['user_id'].count()

gap_mark_points = opts.MarkPointOpts(data=[
    opts.MarkPointItem(type_="max", name="Max"),
    opts.MarkPointItem(type_="min", name="Min"),
])
gap_mark_line = opts.MarkLineOpts(data=[opts.MarkLineItem(type_="average", name="Average")])

line = (
    Line({"theme": ThemeType.MACARONS})
    .add_xaxis(users_per_gap.index.tolist())
    .add_yaxis("", users_per_gap.values.tolist())
    .set_series_opts(
        label_opts=opts.LabelOpts(is_show=False),
        markpoint_opts=gap_mark_points,
        markline_opts=gap_mark_line,
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="用户最近登录时间差值",
            subtitle="与统计的截止时间之差",
            pos_left="center",
        ),
        datazoom_opts=opts.DataZoomOpts(),
    )
)

line.render_notebook()

## 用户流失漏斗模型

In [55]:
# Pull the first run of digits out of learn_process (e.g. "width: 52%;" -> 52).
first_number = r'(\d+)'
progress_digits = stu_info['learn_process'].str.extract(first_number, expand=False)
stu_info['learning_process'] = progress_digits.astype(int)

In [56]:
# Mean / max / min learning progress for each course price.
price_learn_process = (
    stu_info.groupby('price')[['learning_process']]
    .agg(['mean', 'max', 'min'])
    .reset_index()
)
price_learn_process

Unnamed: 0_level_0,price,learning_process,learning_process,learning_process
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,max,min
0,0.0,13.884284,100,0
1,29.0,8.156425,100,0
2,49.0,39.728261,100,0
3,59.0,5.345745,100,0
4,79.0,20.66376,100,0
5,99.0,12.871117,100,0
6,109.0,37.989782,100,0
7,129.0,2.90418,100,0
8,159.0,15.65374,100,0
9,169.0,64.067901,100,0


In [57]:
# Parse the join timestamp and keep its calendar day as a midnight timestamp.
join_ts = pd.to_datetime(stu_info['course_join_time'])
stu_info['course_join_time'] = join_ts
stu_info['date'] = pd.to_datetime(join_ts.dt.date)

In [58]:
# Funnel stage sizes, each counted as distinct user ids.
# Registered users (every id in the users table).
login_users_count = users['user_id'].nunique()
# Users who joined at least one course.
join_course_users_count = stu_info['user_id'].nunique()
# Users with progress above 0 on at least one course.
started_rows = stu_info[stu_info['learning_process'] > 0]
start_course_users_count = started_rows['user_id'].nunique()
# Users with at least one course at 100% progress.
completed_rows = stu_info[stu_info['learning_process'] == 100]
complete_course_users_count = completed_rows['user_id'].nunique()

In [59]:
# Funnel stages in order, paired with their user counts.
funnel_stages = [
    ('注册', login_users_count),
    ('加入课程', join_course_users_count),
    ('开始学习', start_course_users_count),
    ('完成课程', complete_course_users_count),
]
state = [stage_name for stage_name, _ in funnel_stages]
state_count = [stage_users for _, stage_users in funnel_stages]
course_state_info = pd.DataFrame({'环节': state, '人数': state_count})

In [60]:
course_state_info['人数'].shift(1)

0        NaN
1    43908.0
2    40807.0
3    27605.0
Name: 人数, dtype: float64

In [61]:
# Stage conversion rate   = users at this stage / users at the previous stage.
# Overall conversion rate = users at this stage / users at the first stage.
# The first-stage size is read from the table instead of the previously
# hard-coded 43908, so both rates stay correct when the input data changes.
first_stage_users = course_state_info['人数'].iloc[0]
course_state_info['总体转化率'] = round(course_state_info['人数']*100/first_stage_users,2)
# The first stage has no predecessor; it is compared with itself (100%).
previous_stage_users = course_state_info['人数'].shift(1,fill_value=first_stage_users)
course_state_info['环节转化率'] = round(course_state_info['人数']*100/previous_stage_users,2)

In [62]:
course_state_info

Unnamed: 0,环节,人数,总体转化率,环节转化率
0,注册,43908,100.0,100.0
1,加入课程,40807,92.94,92.94
2,开始学习,27605,62.87,67.65
3,完成课程,7792,17.75,28.23


In [63]:
from pyecharts import options as opts
from pyecharts.charts import Funnel

# [stage, overall conversion %] pairs for the funnel.
overall_pairs = [
    [stage, rate]
    for stage, rate in zip(list(course_state_info['环节']), list(course_state_info['总体转化率']))
]

c = (
    Funnel({"theme": ThemeType.MACARONS})
    .add("用户状态", overall_pairs)
    .set_global_opts(title_opts=opts.TitleOpts(title="总体转化率"))
    .set_series_opts(label_opts=opts.LabelOpts(formatter='{b}:{c}%'))
)
c.render_notebook()

In [64]:
# [stage, stage-to-stage conversion %] pairs for the funnel.
stage_pairs = [
    [stage, rate]
    for stage, rate in zip(list(course_state_info['环节']), list(course_state_info['环节转化率']))
]

c = (
    Funnel()
    .add("用户状态", stage_pairs)
    .set_global_opts(title_opts=opts.TitleOpts(title="环节转化率"))
    .set_series_opts(label_opts=opts.LabelOpts(formatter='{b}:{c}%'))
)
c.render_notebook()

## 用户精细化运营(RFM Model)

In [65]:
# RFM building blocks, one frame per dimension.
# R (recency): days since the user's last visit.
user_r = users[['user_id','time_range']]
# F (frequency): number of course records per user.
user_f = stu_info.groupby('user_id').count()[['course_id']].reset_index()
# M (monetary): per-user sums of the numeric columns (price becomes M later).
# numeric_only=True states explicitly that text/datetime columns are skipped;
# relying on them being dropped silently raises on pandas >= 2.0.
user_m = stu_info.groupby('user_id').sum(numeric_only=True).reset_index()

In [66]:
# Join the three dimensions on user_id (right joins keep every user that has
# course records) and rename the value columns to R / F / M.
user_rfm = (
    user_r.merge(user_f, on='user_id', how='right')
    .merge(user_m, on='user_id', how='right')
    .rename(columns={'time_range': 'R', 'course_id': 'F', 'price': 'M'})
)

In [67]:
user_rfm

Unnamed: 0,user_id,R,F,M,learning_process
0,用户44247,0.0,3,998.0,104
1,用户44246,1.0,1,0.0,4
2,用户44245,1.0,1,0.0,0
3,用户44243,1.0,1,0.0,4
4,用户44241,1.0,1,0.0,4
...,...,...,...,...,...
40807,用户9,8.0,16,1825.0,0
40808,用户7,105.0,33,5133.0,0
40809,用户5,3.0,74,11518.0,23
40810,用户4,13.0,91,13522.0,28


In [68]:
user_rfm.R.describe()

count    40812.000000
mean       189.739317
std        171.569653
min          0.000000
25%         47.000000
50%        113.000000
75%        344.000000
max        628.000000
Name: R, dtype: float64

In [69]:
# Recency score: the more recent the last visit, the higher the score
# ([0,30] -> 4, (30,60] -> 3, (60,90] -> 2, (90,999] -> 1).
bins_r = [0, 30, 60, 90, 999]
labels_r = [4, 3, 2, 1]
recency_bucket = pd.cut(user_rfm['R'], bins=bins_r, labels=labels_r, include_lowest=True)
user_rfm['R_score'] = recency_bucket.astype(int)

In [70]:
user_rfm.F.describe()

count    40812.000000
mean         4.778570
std          7.654752
min          1.000000
25%          1.000000
50%          1.000000
75%          4.000000
max        105.000000
Name: F, dtype: float64

In [71]:
# Frequency score: more course records -> higher score
# ([0,1.1] -> 1, (1.1,3.1] -> 2, (3.1,5.1] -> 3, (5.1,999] -> 4).
bins_f = [0, 1.1, 3.1, 5.1, 999]
label_f = [1, 2, 3, 4]
frequency_bucket = pd.cut(user_rfm['F'], bins=bins_f, labels=label_f, include_lowest=True)
user_rfm['F_score'] = frequency_bucket.astype(int)

In [72]:
user_rfm.M.describe()

count    40812.000000
mean       896.838332
std       2269.530349
min          0.000000
25%          0.000000
50%          0.000000
75%        707.000000
max      22310.000000
Name: M, dtype: float64

In [73]:
# Monetary score: larger summed course price -> higher score
# ([0,1.1] -> 1, (1.1,500] -> 2, (500,1000] -> 3, (1000,9999999] -> 4).
bins_m = [0, 1.1, 500, 1000, 9999999]
label_m = [1, 2, 3, 4]
monetary_bucket = pd.cut(user_rfm['M'], bins=bins_m, labels=label_m, include_lowest=True)
user_rfm['M_score'] = monetary_bucket.astype(int)

In [74]:
user_rfm

Unnamed: 0,user_id,R,F,M,learning_process,R_score,F_score,M_score
0,用户44247,0.0,3,998.0,104,4,2,3
1,用户44246,1.0,1,0.0,4,4,1,1
2,用户44245,1.0,1,0.0,0,4,1,1
3,用户44243,1.0,1,0.0,4,4,1,1
4,用户44241,1.0,1,0.0,4,4,1,1
...,...,...,...,...,...,...,...,...
40807,用户9,8.0,16,1825.0,0,4,4,4
40808,用户7,105.0,33,5133.0,0,1,4,4
40809,用户5,3.0,74,11518.0,23,4,4,4
40810,用户4,13.0,91,13522.0,28,4,4,4


In [75]:
# Flag each score as 高 (strictly above the column mean) or 低 (otherwise).
# The comparison is vectorised: each mean is computed once per column instead
# of once per row inside a Python loop, which was quadratic in the row count.
for rfm_dim in ('R', 'F', 'M'):
    dim_score = user_rfm[f'{rfm_dim}_score']
    user_rfm[f'{rfm_dim}_score_type'] = np.where(dim_score > dim_score.mean(), '高', '低')

# Kept as plain lists as well, under the names the previous version defined.
R_score_type = user_rfm['R_score_type'].tolist()
F_score_type = user_rfm['F_score_type'].tolist()
M_score_type = user_rfm['M_score_type'].tolist()

# Three-character segment key in R, F, M order, e.g. '高低高'.
user_rfm['RFM_type'] = user_rfm['R_score_type'] + user_rfm['F_score_type'] + user_rfm['M_score_type']

In [76]:
user_rfm

Unnamed: 0,user_id,R,F,M,learning_process,R_score,F_score,M_score,R_score_type,F_score_type,M_score_type,RFM_type
0,用户44247,0.0,3,998.0,104,4,2,3,高,高,高,高高高
1,用户44246,1.0,1,0.0,4,4,1,1,高,低,低,高低低
2,用户44245,1.0,1,0.0,0,4,1,1,高,低,低,高低低
3,用户44243,1.0,1,0.0,4,4,1,1,高,低,低,高低低
4,用户44241,1.0,1,0.0,4,4,1,1,高,低,低,高低低
...,...,...,...,...,...,...,...,...,...,...,...,...
40807,用户9,8.0,16,1825.0,0,4,4,4,高,高,高,高高高
40808,用户7,105.0,33,5133.0,0,1,4,4,低,高,高,低高高
40809,用户5,3.0,74,11518.0,23,4,4,4,高,高,高,高高高
40810,用户4,13.0,91,13522.0,28,4,4,4,高,高,高,高高高


In [77]:
# Lookup table from the RFM key (R, F, M as 高/低) to the customer segment name.
segment_by_key = [
    ('高高高', '重要价值客户'),
    ('高低高', '重要发展客户'),
    ('低高高', '重要保持客户'),
    ('低低高', '重要挽留客户'),
    ('高高低', '一般价值客户'),
    ('高低低', '一般发展客户'),
    ('低高低', '一般保持客户'),
    ('低低低', '一般挽留客户'),
]
user_type = pd.DataFrame(segment_by_key, columns=['user_score', 'user_star'])

In [78]:
user_type

Unnamed: 0,user_score,user_star
0,高高高,重要价值客户
1,高低高,重要发展客户
2,低高高,重要保持客户
3,低低高,重要挽留客户
4,高高低,一般价值客户
5,高低低,一般发展客户
6,低高低,一般保持客户
7,低低低,一般挽留客户


In [79]:
# Attach the segment name to every user by matching RFM_type against user_score.
user_info_df = pd.merge(
    user_rfm,
    user_type,
    how='left',
    left_on='RFM_type',
    right_on='user_score',
)
user_info_df

Unnamed: 0,user_id,R,F,M,learning_process,R_score,F_score,M_score,R_score_type,F_score_type,M_score_type,RFM_type,user_score,user_star
0,用户44247,0.0,3,998.0,104,4,2,3,高,高,高,高高高,高高高,重要价值客户
1,用户44246,1.0,1,0.0,4,4,1,1,高,低,低,高低低,高低低,一般发展客户
2,用户44245,1.0,1,0.0,0,4,1,1,高,低,低,高低低,高低低,一般发展客户
3,用户44243,1.0,1,0.0,4,4,1,1,高,低,低,高低低,高低低,一般发展客户
4,用户44241,1.0,1,0.0,4,4,1,1,高,低,低,高低低,高低低,一般发展客户
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40807,用户9,8.0,16,1825.0,0,4,4,4,高,高,高,高高高,高高高,重要价值客户
40808,用户7,105.0,33,5133.0,0,1,4,4,低,高,高,低高高,低高高,重要保持客户
40809,用户5,3.0,74,11518.0,23,4,4,4,高,高,高,高高高,高高高,重要价值客户
40810,用户4,13.0,91,13522.0,28,4,4,4,高,高,高,高高高,高高高,重要价值客户


In [80]:
from pyecharts import options as opts  # the chart options below need `opts` in scope
from pyecharts.charts import Bar
from pyecharts.globals import ThemeType

# Number of users per customer segment. Computed once and reused for both
# axes so the category labels and the bar heights cannot get out of step.
segment_counts = user_info_df.groupby('user_star')['user_id'].count()

c = (
    Bar(init_opts=opts.InitOpts(theme=ThemeType.MACARONS))
    .add_xaxis(segment_counts.index.tolist())
    .add_yaxis("用户类型", segment_counts.tolist())
    .set_global_opts(
        # Tilt the category labels so the six-character segment names do not overlap.
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
        title_opts=opts.TitleOpts(title="用户类型数量对比"),
    )
)
c.render_notebook()

In [81]:
# Share of "important" users (segments whose name starts with '重要') among
# all users that received a segment, rounded to two decimals.
# The important segments are selected by label rather than by position: a
# positional slice such as values[4:] is only right when all eight segments
# are present and the '重要…' ones happen to sort last.
segment_counts = user_info_df.groupby('user_star')['user_id'].count()
important_total = int(segment_counts[segment_counts.index.str.startswith('重要')].sum())
segmented_total = int(segment_counts.sum())
important_user_rate = round(important_total / segmented_total, 2)
important_user_rate

0.43

In [82]:
from pyecharts import options as opts
from pyecharts.charts import Liquid

# Liquid-fill gauge showing the single value important_user_rate.
liquid_title = opts.TitleOpts(title="重要用户占比", pos_right="center")
c = Liquid().add("lq", [important_user_rate]).set_global_opts(title_opts=liquid_title)
c.render_notebook()

In [83]:
# Keep only users in one of the four "important" segments, with just the
# id and segment-name columns.
important_segments = ['重要保持客户', '重要发展客户', '重要挽留客户', '重要价值客户']
is_important = user_info_df['user_star'].isin(important_segments)
important_user_info = user_info_df.loc[is_important, ['user_id', 'user_star']]

In [84]:
# One place record per user: drop_duplicates keeps each user's first row in
# `login`, so that row's country / province / region is what gets used.
place_columns = ['user_id', '国家', '省份', '地区']
user_place_info = login[place_columns].drop_duplicates(subset='user_id')

In [85]:
# Inner join on user_id: important users without a row in user_place_info are dropped.
important_user = pd.merge(important_user_info, user_place_info, how='inner', on='user_id')

In [86]:
# Show the important users together with their country / province / region.
important_user

Unnamed: 0,user_id,user_star,国家,省份,地区
0,用户44247,重要价值客户,中国,湖北,武汉
1,用户44198,重要价值客户,中国,湖南,暂无
2,用户44190,重要价值客户,中国,河北,保定
3,用户44187,重要价值客户,中国,陕西,渭南
4,用户44179,重要价值客户,中国,广东,广州
...,...,...,...,...,...
15936,用户10,重要保持客户,中国,广东,广州
15937,用户9,重要价值客户,中国,广东,广州
15938,用户7,重要保持客户,中国,广东,广州
15939,用户5,重要价值客户,中国,广东,广州


In [87]:
# Important users per province, ten largest first. The value is the number of
# non-null '地区' entries in each province group, kept under the column name '地区'.
province_counts = important_user.groupby('省份')[['地区']].count()
important_user_province_top10 = province_counts.sort_values(by='地区', ascending=False).head(10)
important_user_province_top10

Unnamed: 0_level_0,地区
省份,Unnamed: 1_level_1
广东,3939
湖北,1481
河北,804
河南,758
贵州,696
山东,672
湖南,671
四川,632
浙江,616
广西,614


In [88]:
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.commons.utils import JsCode

# Bar fill: an ECharts LinearGradient, passed as raw JavaScript, going from
# rgba(0, 244, 255, 1) at offset 0 to rgba(0, 77, 167, 1) at offset 1.
province_bar_fill = JsCode(
                    """new echarts.graphic.LinearGradient(0, 0, 0, 1, [{
                offset: 0,
                color: 'rgba(0, 244, 255, 1)'
            }, {
                offset: 1,
                color: 'rgba(0, 77, 167, 1)'
            }], false)"""
)
province_bar_style = {
    "normal": {
        "color": province_bar_fill,
        "barBorderRadius": [30, 30, 30, 30],
        "shadowColor": "rgb(0, 160, 221)",
    }
}

# Axis data: province names from the index, user counts from the '地区' column.
province_names = important_user_province_top10.index.tolist()
province_user_counts = important_user_province_top10['地区'].values.tolist()

c = (
    Bar()
    .add_xaxis(province_names)
    .add_yaxis("", province_user_counts, category_gap="60%")
    .set_series_opts(itemstyle_opts=province_bar_style)
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="各省份重要用户数量对比",
            subtitle="排名top10",
            pos_left="center",
        ),
        datazoom_opts=opts.DataZoomOpts(type_="inside"),
    )
)
c.render_notebook()

In [89]:
# Important users per region, ten largest first. The value is the number of
# non-null '省份' entries in each region group, kept under the column name '省份'.
# NOTE(review): the placeholder '暂无' is an ordinary group key here and ranks
# first in the stored output; confirm whether it should appear in a region ranking.
region_counts = important_user.groupby("地区")[["省份"]].count()
important_user_region_top10 = region_counts.sort_values(by="省份", ascending=False).head(10)
important_user_region_top10

Unnamed: 0_level_0,省份
地区,Unnamed: 1_level_1
暂无,4067
广州,826
汕头,296
武汉,291
深圳,254
保定,200
湛江,194
成都,191
揭阳,184
东莞,181


In [90]:
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.commons.utils import JsCode

# Bar fill: an ECharts LinearGradient, passed as raw JavaScript, going from
# rgba(0, 244, 255, 1) at offset 0 to rgba(0, 77, 167, 1) at offset 1.
region_bar_fill = JsCode(
                    """new echarts.graphic.LinearGradient(0, 0, 0, 1, [{
                offset: 0,
                color: 'rgba(0, 244, 255, 1)'
            }, {
                offset: 1,
                color: 'rgba(0, 77, 167, 1)'
            }], false)"""
)
region_bar_style = {
    "normal": {
        "color": region_bar_fill,
        "barBorderRadius": [30, 30, 30, 30],
        "shadowColor": "rgb(0, 160, 221)",
    }
}

# Axis data: region names from the index, user counts from the '省份' column.
region_names = important_user_region_top10.index.tolist()
region_user_counts = important_user_region_top10['省份'].values.tolist()

c = (
    Bar()
    .add_xaxis(region_names)
    .add_yaxis("", region_user_counts, category_gap="60%")
    .set_series_opts(itemstyle_opts=region_bar_style)
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="各地区重要用户数量对比",
            subtitle="排名top10",
            pos_left="center",
        ),
        datazoom_opts=opts.DataZoomOpts(type_="inside"),
    )
)
c.render_notebook()

# 任务三：线上课程推荐
- **任务 3.1** ：根据用户参与学习的记录，统计每门课程的参与人数，计算每门课程的受欢迎程度，列出最受欢迎的前 10 门课程，并绘制相应的柱状图。受欢迎程度定义如下：$\gamma_i = \dfrac{Q_i - Q_{\min}}{Q_{\max} - Q_{\min}}$。其中，$\gamma_i$ 为第 i 门课程的受欢迎程度，$Q_i$ 为参与第 i 门课程学习的人数，$Q_{\max}$ 和 $Q_{\min}$ 分别为所有课程中参与人数最多和最少的课程所对应的人数。
- **任务 3.2** ：根据用户选择课程情况，构建用户和课程的关系表（二元矩阵），使用基于物品的协同过滤算法计算课程之间的相似度，并结合用户已选课程的记录，为总学习进度最高的 5 名用户推荐 3 门课程。
- **任务 3.3** ：在任务 3.1 和任务 3.2 的基础上，结合用户学习进度数据，分析付费课程和免费课程的差异，给出线上课程的综合推荐策略。

## 用户课程选择分析

### 选课人数最多课程

In [91]:
# Top 30 courses by number of enrolment rows in stu_info.
from pyecharts import options as opts
from pyecharts.charts import Bar

# value_counts() orders courses from most to least frequent; keep the first 30.
top30_enrolments = stu_info.course_id.value_counts().head(30)

# Series decorations: mark the highest and lowest bar, draw a line at the average.
top30_extremes = opts.MarkPointOpts(data=[opts.MarkPointItem(type_="max", name="Max"),
                                          opts.MarkPointItem(type_="min", name="Min")])
top30_average = opts.MarkLineOpts(data=[opts.MarkLineItem(type_="average", name="Average")])

c = (
    Bar()
    .add_xaxis(top30_enrolments.index.tolist())
    .add_yaxis("course_id", top30_enrolments.values.tolist())
    .set_global_opts(
        title_opts=opts.TitleOpts(title="选课人数最多课程top30"),
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-45)),
        datazoom_opts=opts.DataZoomOpts(type_="inside"),
    )
    .set_series_opts(
        label_opts=opts.LabelOpts(is_show=False),
        markpoint_opts=top30_extremes,
        markline_opts=top30_average,
    )
)
c.render_notebook()

### 最受欢迎的免费课程

In [92]:
# Top 30 free courses (price == 0) by number of enrolment rows.
free_enrolments = stu_info.loc[stu_info['price'] == 0, 'course_id'].value_counts().head(30)

# Series decorations: mark the highest and lowest bar, draw a line at the average.
free_extremes = opts.MarkPointOpts(data=[opts.MarkPointItem(type_="max", name="Max"),
                                         opts.MarkPointItem(type_="min", name="Min")])
free_average = opts.MarkLineOpts(data=[opts.MarkLineItem(type_="average", name="Average")])

c = (
    Bar()
    .add_xaxis(free_enrolments.index.tolist())
    .add_yaxis("free_course_id", free_enrolments.values.tolist())
    .set_global_opts(
        title_opts=opts.TitleOpts(title="最受欢迎的免费课程top30"),
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-45)),
        datazoom_opts=opts.DataZoomOpts(type_="inside"),
    )
    .set_series_opts(
        label_opts=opts.LabelOpts(is_show=False),
        markpoint_opts=free_extremes,
        markline_opts=free_average,
    )
)
c.render_notebook()

### 最受欢迎收费课程

In [93]:
# Top 30 paid courses by number of enrolment rows.
# "Paid" is price > 0. The comparison `price != 0` is also True for NaN, which
# would count enrolments with an unknown price as paid; `> 0` is False for NaN
# and so leaves them out.
paid_enrolments = stu_info.loc[stu_info['price'] > 0, 'course_id'].value_counts().head(30)

c = (
    Bar()
    .add_xaxis(paid_enrolments.index.tolist())
    .add_yaxis("fee_course_id", paid_enrolments.values.tolist())
    .set_global_opts(
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-45)),
        title_opts=opts.TitleOpts(title="最受欢迎的收费课程top30"),
        datazoom_opts=opts.DataZoomOpts(type_="inside"),
    )
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False),
                     # Mark the highest and lowest bar, and draw a line at the average.
                     markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem(type_="max",name="Max"),
                                                             opts.MarkPointItem(type_="min",name="Min")]),
                     markline_opts=opts.MarkLineOpts(data=[opts.MarkLineItem(type_="average",name="Average")])
                    )
)
c.render_notebook()

### 按区间分析选择课程人数

In [94]:
# Enrolment rows per course (non-null user_id count), largest first.
course_info = (
    stu_info.groupby('course_id')[['user_id']]
    .count()
    .reset_index()
    .rename(columns={'user_id': '课程选择人数'})
    .sort_values(by='课程选择人数', ascending=False)
)

# Bucket the courses by enrolment count. pd.cut uses right-closed intervals by
# default, so a course with exactly 100 enrolments falls in "<100", and so on.
bins = [0, 100, 500, 1000, 5000, 10000, 999999]
labels = ["<100", "100-500", "500-1000", "1000-5000", "5000-10000", ">10000"]
course_info["选择人数类型"] = pd.cut(course_info['课程选择人数'], bins=bins, labels=labels)
course_info

Unnamed: 0,course_id,课程选择人数,选择人数类型
215,课程76,13265,>10000
166,课程31,9521,5000-10000
79,课程17,8505,5000-10000
103,课程191,7126,5000-10000
91,课程180,6223,5000-10000
...,...,...,...
162,课程28,2,<100
232,课程91,1,<100
233,课程92,1,<100
234,课程93,1,<100


In [95]:
from pyecharts import options as opts
from pyecharts.charts import Pie

# Number of courses in each enrolment bucket. Computed once so the bucket
# names and the counts come from the same grouping.
bucket_counts = course_info.groupby("选择人数类型")['course_id'].count()
bucket_pairs = [list(pair) for pair in zip(bucket_counts.index.tolist(), bucket_counts.tolist())]

c = (Pie(init_opts=opts.InitOpts())
    .add(
        series_name="选择课程人数",
        data_pair=bucket_pairs,
        radius=["50%", "70%"],  # inner / outer radius -> ring (donut) shape
        label_opts=opts.LabelOpts(is_show=False, position="center"),
        )
    # Vertical legend on the left edge. pos_left must be a valid position
    # keyword ("left"); the previous value "legft" was a typo.
    .set_global_opts(legend_opts=opts.LegendOpts(pos_left="left",
                                                 orient="vertical",
                                                 pos_top="center"),
                     title_opts=opts.TitleOpts(title="选择各类课程人数占比",
                                               pos_left="center")
                    )
    # Tooltip template: series name, then "bucket: count (percent%)".
    .set_series_opts(tooltip_opts=opts.TooltipOpts(trigger="item",
                                                   formatter="{a} <br/>{b}: {c} ({d}%)")
                    )
)
c.render_notebook()

In [96]:
# Per price point: total and mean learning_process, and number of enrolment rows.
price_summary_spec = {'learning_process': ['sum', 'mean'], 'user_id': ['count']}
stu_info.groupby('price').agg(price_summary_spec)

Unnamed: 0_level_0,learning_process,learning_process,user_id
Unnamed: 0_level_1,sum,mean,count
price,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0.0,987006,13.884284,71088
29.0,1460,8.156425,179
49.0,7310,39.728261,184
59.0,1005,5.345745,188
79.0,21325,20.66376,1032
99.0,19474,12.871117,1513
109.0,431260,37.989782,11352
129.0,57465,2.90418,19787
159.0,11302,15.65374,722
169.0,10379,64.067901,162


- 可以看出课程价格为109时，用户的平均学习进度和选课人数都相对较优；但价格上升到129时，虽然选课人数最多，平均学习进度却骤降（约2.9）

In [97]:
# Courses priced at 109: mean learning_process and number of rows per course.
priced_109 = stu_info.loc[stu_info['price'].eq(109)]
priced_109.groupby('course_id').agg({'learning_process': ['mean', 'count']})

Unnamed: 0_level_0,learning_process,learning_process
Unnamed: 0_level_1,mean,count
course_id,Unnamed: 1_level_2,Unnamed: 2_level_2
课程11,81.209302,43
课程30,0.0,2
课程31,38.663061,9521
课程66,34.009085,1541
课程85,29.587755,245


- 可以看出在价格为109的课程中，选课人数主要集中在课程31和课程66，且这两门课程的平均学习进度均较高

### 学习时长最久的课程
- 可以将没有人学习的，或者是相对较少人学习的课程进行删除，而学习时长较长的课程为用户认为较为优质的课程，可以考虑后续重点推广

In [98]:
# Courses with at least one row at learning_process == 100, with their mean price.
# NOTE(review): after the == 100 filter the learning_process mean is 100 for
# every course (see the stored output); a count of such rows per course would
# probably say more about which courses get finished.
fully_completed = stu_info.loc[stu_info['learning_process'].eq(100)]
fully_completed.groupby('course_id').agg({'learning_process': ['mean'], 'price': ['mean']})

Unnamed: 0_level_0,learning_process,price
Unnamed: 0_level_1,mean,mean
course_id,Unnamed: 1_level_2,Unnamed: 2_level_2
课程101,100,0.0
课程106,100,0.0
课程107,100,0.0
课程108,100,0.0
课程109,100,0.0
...,...,...
课程95,100,499.0
课程96,100,
课程97,100,29.0
课程98,100,99.0


## 课程受欢迎度分析

In [99]:
# Popularity score: min-max normalisation of the enrolment count, scaled to 0-100.
enrol_counts = course_info['课程选择人数']
min_x = enrol_counts.min()
max_x = enrol_counts.max()

if max_x == min_x:
    # Every course has the same enrolment count, so the normalisation would
    # divide by zero; score all courses as 0 instead.
    course_info['受欢迎程度'] = 0.0
else:
    # Vectorised over the column. No sort is applied to the result: column
    # assignment aligns on the index, so sorting here would change nothing.
    course_info['受欢迎程度'] = (enrol_counts - min_x) / (max_x - min_x) * 100

In [100]:
# Show the course table with the popularity score column added.
course_info

Unnamed: 0,course_id,课程选择人数,选择人数类型,受欢迎程度
215,课程76,13265,>10000,100.000000
166,课程31,9521,5000-10000,71.773221
79,课程17,8505,5000-10000,64.113390
103,课程191,7126,5000-10000,53.716828
91,课程180,6223,5000-10000,46.908926
...,...,...,...,...
162,课程28,2,<100,0.007539
232,课程91,1,<100,0.000000
233,课程92,1,<100,0.000000
234,课程93,1,<100,0.000000


In [101]:
from pyecharts import options as opts
from pyecharts.charts import Bar, Line  # imported here so the cell does not depend on earlier imports

# First ten rows of course_info, which is sorted by enrolment count (largest
# first) where it is built. Columns are selected by name rather than by
# position so that adding or reordering columns cannot silently change what
# is plotted.
top10_courses = course_info.head(10)
top10_course_ids = top10_courses['course_id'].tolist()
top10_enrol_counts = top10_courses['课程选择人数'].tolist()
top10_popularity = top10_courses['受欢迎程度'].tolist()

# Bars: enrolment count on the primary y axis; a second y axis is added for the line.
bar = (
        Bar(init_opts=opts.InitOpts())
        .add_xaxis(top10_course_ids)
        .add_yaxis("", top10_enrol_counts)
        .extend_axis(
            yaxis=opts.AxisOpts(axislabel_opts=opts.LabelOpts(formatter="{value}"), interval=10)
        )
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
        .set_global_opts(title_opts=opts.TitleOpts(title="最受欢迎课程选择人数与受欢迎程度",pos_left="center"),
                         yaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(formatter="{value}")),
                        )
    )

# Line: popularity score, bound to the secondary y axis (yaxis_index=1).
line = (Line()
        .add_xaxis(top10_course_ids)
        .add_yaxis("", top10_popularity, yaxis_index=1)
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
       )

bar.overlap(line).render_notebook()