In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as colors
%matplotlib inline
import plotly.express as px

In [2]:
# 将同一个文件夹中的多个csv文件中的数据合并到同一个dataframe之中
def get_merged_data(folder_path):
    # 创建一个空的 DataFrame 来存储数据
    merged_df = pd.DataFrame()
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.csv'):
            file_path = os.path.join(folder_path, file_name)
            df = pd.read_csv(file_path)
            cate_id = file_name.split('_')[2]
            df['cate_id'] = cate_id
            # 将每个文件的数据追加到合并的 DataFrame 中
            merged_df = pd.concat([merged_df, df], ignore_index=True)
    return merged_df

#播放数量在10k-100k之间同时视频时长在5分钟以上
def get_play_between_10k_100k_long_videos(df):
    return df[(df['play'] > 10000) & (df['play'] > 100000) & (df['duration'] > 300)]

# 播放数量在100k以上同时视频时长在5分钟以上
def get_play_over_100k_long_videos(df):
    return df[(df['play'] > 100000) & (df['duration'] > 300)]

# 播放数量在1M以上同时视频时长在5分钟以上
def get_play_over_1M_long_videos(df):
    return df[(df['play'] > 1000000) & (df['duration'] > 300)]

def plot_scatter(x, y, x_label, y_label, title):
    """
    绘制散点图

    参数：
    x (list or array-like): X 轴数据
    y (list or array-like): Y 轴数据
    x_label (str): X 轴标签
    y_label (str): Y 轴标签
    title (str): 图表标题
    """
    fig = px.scatter(x=x, y=y)
    fig.show()
    
def plot_video_views_distribution_chart(df):
    categories = ['< 10,000', '10,000 - 100,000', '100k - 1M', '> 1M']
    counts = [
        df[df['play'] < 10000].shape[0],
        df[(df['play'] >= 10000) & (df['play'] <= 100000)].shape[0],
        df[(df['play'] >= 100000) & (df['play'] <= 1000000)].shape[0],
        df[df['play'] > 1000000].shape[0]
    ]
    
    # 绘制柱状图
    plt.bar(categories, counts)
    plt.xlabel('Views')
    plt.ylabel('Count')
    plt.title('Video Views Distribution')
    plt.show()
    
def calculate_mean_views_by_author(df):
    mean_play_by_authors = df.groupby('author')['play'].mean()
    return mean_play_by_authors

def search_videos_by_author(df, name):
    return df[df['author'].str.contains(name)]

# 去掉哔哩哔哩官方账号
def remove_bili_official_videos(df):
    bili_official_videos = df['author'].str.contains("哔哩哔哩")
    data = df[~bili_official_videos]
    return data
    
def sort_df_with_keyword(df, keyword):
    df.sort_values(keyword, ascending=False).head(100)

In [3]:
bili_hot_videos_path = 'bili-hot-videos'
df_all_hot_videos = get_merged_data(bili_hot_videos_path)

In [4]:
# 添加ratios
def calculate_ratios(df):
    df = df[df['play'] != 0]
    df['ratio_favorites'] = df['favorites'] / df['play']
    df['ratio_review'] = df['review'] / df['play']
    df['ratio_danmu'] = df['video_review'] / df['play']
    return df

In [5]:
# 进行转换
def convert_data_types(df):
    df['duration'] = df['duration'].astype(int)
    df['play'] = df['play'].astype(int)
    df['review'] = df['review'].astype(int)
    df['favorites'] = df['favorites'].astype(int)
    df['pubdate'] = pd.to_datetime(df['pubdate'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
    return df

In [6]:
df_all_cc = df_all_hot_videos.copy()
df_all_cc.head(10)

Unnamed: 0,bvid,arcurl,title,pubdate,rank_offset,author,mid,play,review,video_review,favorites,duration,tag,description,cate_id
0,BV1rS4y157TW,http://www.bilibili.com/video/av680812294,刚买车都很心疼，尤其是第一台车，等过了一段时间就没那么在乎了，你们也是这样吗？,2022-01-19 22:53:54,1,厦门二手车海同学,480011256,7735491,374,142,2736,63,用车,-,227
1,BV1BT4y127ex,http://www.bilibili.com/video/av935381316,我评测过的所有中国车，油管网友竟最喜欢它,2022-01-06 17:23:43,2,车轮哥Wheelsboy,343937521,780287,986,1671,1202,267,"遇见热爱汽车的你,VLOG,日常,五菱,MINIEV,电动汽车,中国车,小车",从2020年到2021年，我拍过数十辆中国车。我已经知道中国网友都喜欢哪款，但是外国网友呢？...,227
2,BV1vZ4y1f7MU,http://www.bilibili.com/video/av380730964,缸垫渗油严重？防撞梁覆盖率低？理想ONE拆解报告汇总！横评系列之《真十万公里长测》,2022-01-14 10:00:11,3,易车横评,702014294,776740,3248,4220,6766,965,"遇见热爱汽车的你,谁是你心中汽车评测N0.1,理想ONE,汽车,拆车,评测,拆解",在上一集的10万公里测试中，理想ONE的表现还是比较理想的，没想到拆解后，却变得非常不理想，...,227
3,BV1iq4y1c7rH,http://www.bilibili.com/video/av550847490,1年卖上千台车的车贩子告诉你，哪些车千万不能买！,2022-01-18 17:48:50,4,小胡说车,456869112,656350,536,1118,651,325,"2021哔哩哔哩汽车春晚,2021我和爱车的这一年,奥迪,记录,知识,汽车,二手车",今天又是大家喜闻乐见的车贩子亏钱专场，快来看下哪些二手车最不值得买！,227
4,BV1yq4y117L6,http://www.bilibili.com/video/av592788930,详细测评魏牌拿铁DHT,2022-01-05 12:00:17,5,38号车评中心官方账号,39736779,617919,4073,17406,2195,1546,"遇见热爱汽车的你,原创,长城,魏派,魏牌,魏,拿铁,suv,测评",这次38号测评了魏牌拿铁DHT，这款车在许多方面相比前代产品有了巨大的改进和提高。具体内容敬请观看,227
5,BV1h3411e7JJ,http://www.bilibili.com/video/av422976170,别克终于“火”了！新车比帕拉梅拉还漂亮，25万奥迪A7都靠边站,2022-01-06 12:54:16,6,水墨车事,1807235269,566713,317,244,498,86,"遇见热爱汽车的你,大众,领克,丰田",,227
6,BV1H5411Z7JT,http://www.bilibili.com/video/av466103346,国民SUV强强对决 哈弗H6对拆长安CS75 PLUS,2022-01-28 11:38:08,7,易车横评,702014294,550549,2102,2583,3392,824,"汽车,汽车测评,哈弗H6,长安CS75 PLUS,拆解,自制",在本集节目里我们将对两部完成拆解的车进行关键位置的比对，看看玄冥二老中谁的结构设计更安全，技...,227
7,BV1XF411n7tG,http://www.bilibili.com/video/av296027309,花“1套房”钱买了一辆大兰博SUV，司机试过说摸着像奥迪？,2022-01-29 22:34:10,8,V哥聊车,1398983218,464385,548,364,515,274,URUS,裸车售价接近小300个W的兰博基尼URUS，真香吗？听听三位在车里是怎么聊的，到最后实在太让...,227
8,BV1oq4y1w7N5,http://www.bilibili.com/video/av550587655,国外社交平台最有争议的中国车，为什么是红旗H9？,2022-01-10 19:15:41,9,车轮哥Wheelsboy,343937521,450797,1008,667,741,129,"遇见热爱汽车的你,VLOG,豪车,红旗,中国车,红旗H9,H9",上一期我说到了五菱mini EV是外国网友最喜欢的中国车，这一集我们来聊聊最有争议的中国车。,227
9,BV1SR4y1K7xu,http://www.bilibili.com/video/av338451912,看到这样的国产车，真好【吉利 星越L】,2022-01-26 21:20:12,10,大家车言论,36044181,449744,1755,2929,1609,794,"2021哔哩哔哩汽车春晚,2021我和爱车的这一年,汽车,吉利,星越L,SUV,买车,新车,...",如果没有亲自开上这部车，你大概率会怀疑我们对它的赞誉：这不仅是吉利最高水准的作品，说它是同级...,227


In [7]:
'''
# 将DataFrameGroupBy转换为DataFrame
all_50k_authors = all_50k_cc.groupby('author').apply(lambda x: x.reset_index(drop=True)).reset_index()

# 每1000行进行分割并导出为JSON文件
chunk_size = 1000
num_chunks = len(df) // chunk_size + 1
for i in range(num_chunks):
    start = i * chunk_size
    end = (i + 1) * chunk_size
    chunk = df[start:end]
    filename = f"output_{i}.json"
    chunk.to_json(filename, orient='records', lines=True)

    print(f"Saved {len(chunk)} rows to {filename}")
'''


'\n# 将DataFrameGroupBy转换为DataFrame\nall_50k_authors = all_50k_cc.groupby(\'author\').apply(lambda x: x.reset_index(drop=True)).reset_index()\n\n# 每1000行进行分割并导出为JSON文件\nchunk_size = 1000\nnum_chunks = len(df) // chunk_size + 1\nfor i in range(num_chunks):\n    start = i * chunk_size\n    end = (i + 1) * chunk_size\n    chunk = df[start:end]\n    filename = f"output_{i}.json"\n    chunk.to_json(filename, orient=\'records\', lines=True)\n\n    print(f"Saved {len(chunk)} rows to {filename}")\n'

In [8]:
df_all_cc = calculate_ratios(df_all_cc)
df_all_cc = convert_data_types(df_all_cc)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ratio_favorites'] = df['favorites'] / df['play']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ratio_review'] = df['review'] / df['play']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ratio_danmu'] = df['video_review'] / df['play']


In [12]:
# 查看播放量超过一百万视频中B站官方号数量
# play_over_1M_2023['author'].str.contains("哔哩").value_counts()

In [9]:
linyi_videos = search_videos_by_author(df_all_cc, '林亦LYi')

In [10]:
he_videos = search_videos_by_author(df_all_cc, '老师好我叫何同学')
he_videos

Unnamed: 0,bvid,arcurl,title,pubdate,rank_offset,author,mid,play,review,video_review,favorites,duration,tag,description,cate_id,ratio_favorites,ratio_review,ratio_danmu
293372,BV1764y167Lp,http://www.bilibili.com/video/av759395091,【何同学】永远是同学,2021-07-25 20:30:16,2,老师好我叫何同学,163637592,12132478,37898,106377,477051,456,"vlog夏日挑战季,VLOG,生活,学习,记录,校园,毕业,你眼里的光,何同学,大学",制片：九老师\n翻页书：Flashcat闪电猫\n翻页书动画：超级小树\n动画：Joytee...,21,0.03932,0.003124,0.008768
380393,BV1AP411d7Qa,http://www.bilibili.com/video/av313690196,【何同学】为了找到流量密码，我们做了个假B站...,2023-05-16 19:30:00,1,老师好我叫何同学,163637592,7242598,6397,17898,144576,797,"封面,标题,B站,何同学,老师好我叫何同学,科技猎手2023,何同学的投票",在测试的过程中，由于频繁添加好友账号被限制、部分通讯账号搜索不到或无法添加、少量的重复添加或...,230,0.019962,0.000883,0.002471
1310192,BV19v411M7Rs,http://www.bilibili.com/video/av251187473,【何同学】我做了苹果放弃的产品...,2021-10-17 20:55:10,1,老师好我叫何同学,163637592,24433201,53318,79220,920263,468,"DIY,何同学,苹果,AirPower",非常感谢装机猿老师的帮助\n如果你想做一个AirPower，单纯的像视频里一样把线圈堆积起来...,233,0.037664,0.002182,0.003242
1403523,BV13v411v7Zo,http://www.bilibili.com/video/av246654776,【何同学】整理自己的生活（P2附库克采访）,2021-02-17 21:04:40,1,老师好我叫何同学,163637592,11345039,31092,122060,346521,1663,"VLOG,生活记录,生活,自制,VLOG日常,创意,寒假不咕咕,roomtour",制片：九老师\n剪辑：贰_33、小纯\n采访包装：小梦\n动画：joyteeth\n手绘海报...,239,0.030544,0.002741,0.010759
1450270,BV1244y1p7kt,http://www.bilibili.com/video/av978650098,【何同学】我用108天开了个灯......,2022-02-02 19:40:13,1,老师好我叫何同学,163637592,12995202,54392,72247,266035,500,"DIY,黑科技,发明创造,自制,VLOG",这个视频真的让我学习到很多 \n大家看完这个视频可能会有一种“就这？”的疑问，但是考虑到我在...,233,0.020472,0.004186,0.00556
1456270,BV1W14y1b7Mq,http://www.bilibili.com/video/av771908203,【何同学】我做了一个自己打字的键盘...,2022-08-12 20:00:00,1,老师好我叫何同学,163637592,26183964,10604,111716,869764,537,"何同学,老师好我叫何同学",本视频中的键盘为了视觉效果，选择了一个行程较长但是铜线圈数很少的电磁铁，并且设计为通电时升起...,233,0.033217,0.000405,0.004267
1496270,BV1Sk4y1471G,http://www.bilibili.com/video/av739233567,【何同学】我们做了一台中文打字机...,2023-03-26 19:00:00,1,老师好我叫何同学,163637592,12409626,14358,37859,475112,570,"科技猎手2023,何同学,老师好我叫何同学,中文打字机,打字机,明快打字机",非常高兴做了这期视频，感谢SmallRig的朋友帮我们加工了字杆，感谢差评君做的中文打字机相...,233,0.038286,0.001157,0.003051
2066395,BV1ir4y1H74w,http://www.bilibili.com/video/av768280609,【何同学】我找到了我最喜欢的数码产品，但是...,2022-04-17 21:21:58,1,老师好我叫何同学,163637592,14624356,25911,44652,507230,730,"科技猎手,手机,电脑,数码,3D打印机",因为疫情原因很多零件没有到位，视频中的Voron 0.1在拍摄时并没有安装完成。\n大部分打...,95,0.034684,0.001772,0.003053
2072395,BV1X8411e7EJ,http://www.bilibili.com/video/av219404688,【何同学】快充伤电池？40部手机两年实验，告诉你最佳充电方式,2022-10-27 19:30:00,1,老师好我叫何同学,163637592,13320697,18095,41487,505110,567,"科技猎手计划·2022第四期,科技猎手,充电,快充,何同学,电池健康,苹果,安卓,续航,科普",非常感谢充电头网的朋友们对本视频的帮助！\n感谢王也老师帮助我们找到配乐的朋友。 \n感谢时...,95,0.037919,0.001358,0.003114
2193205,BV1ev411x7en,http://www.bilibili.com/video/av246051041,【何同学】这视频能让你戒手机,2021-01-06 19:33:26,1,老师好我叫何同学,163637592,13381506,34351,99872,667916,738,"我和数码的日常,手机,平板,数码,安卓,苹果",制片：九老师\n动画：Joyteeth 无聊的阿祝\n特效：flashyami\n声效：Ji...,95,0.049913,0.002567,0.007463


In [11]:
def plot_basic_scatter_charts(df):
    fig = px.scatter(df, x='play', y='review', color='cate_id', hover_data=['bvid'])
    fig.show()
    fig = px.scatter(df, x='play', y='favorites', color='cate_id', hover_data=['bvid'])
    fig.show()
    fig = px.scatter(df, x='play', y='ratio_favorites', color='cate_id', size='duration', hover_data=['bvid', 'pubdate'])
    fig.show()
    fig = px.scatter(df, x='play', y='ratio_danmu', color='cate_id', size='duration', hover_data=['bvid','pubdate'])
    fig.show()
    fig = px.scatter(df, x='play', y='ratio_review', color='cate_id', size='duration', hover_data=['bvid', 'pubdate'])
    fig.show()
    fig = px.scatter(df, x='pubdate', y='ratio_review', color='cate_id', size='play', hover_data=['bvid'])
    fig.show()
    fig = px.scatter(df, x='pubdate', y='ratio_danmu', color='cate_id', size='play', hover_data=['bvid'])
    fig.show()
    fig = px.scatter(df, x='pubdate', y='ratio_favorites', color='cate_id', size='play', hover_data=['bvid'])
    fig.show()


In [12]:
# Filter rows where the publish year is 'year'
def get_year_data(df, year):
    df_filtered = df[df['pubdate'].dt.year == year]
    return df_filtered

In [13]:
df_2021 = get_year_data(df_all_cc, 2021)
df_2021.shape

(811649, 18)

In [14]:
# selected_columns = ['bvid', 'title', 'pubdate', 'rank_offset', 'author', 'mid', 'play']
# selected_sorted_data = all_100k_videos[selected_columns].sort_values('play', ascending=False)
# selected_sorted_data

In [15]:
# def generate_json_files_from_df(df, chunk_size):
#     num_chunks = len(selected_sorted_data) // chunk_size + 1
#     for i in range(num_chunks):
#         start_index = i * chunk_size
#         end_index = (i + 1) * chunk_size
#         chunk_df = selected_sorted_data[start_index:end_index]
#         chunk_df.to_json(f'{i}.json', orient='records')

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
df_2021_cc = df_2021.copy()

In [18]:
df_2021_cc

Unnamed: 0,bvid,arcurl,title,pubdate,rank_offset,author,mid,play,review,video_review,favorites,duration,tag,description,cate_id,ratio_favorites,ratio_review,ratio_danmu
22691,BV1da4y1n7Lc,http://www.bilibili.com/video/av671047860,【小雪】不看后悔的竖屏OL姐姐舞蹈-up down,2021-01-06 17:45:13,1,x羊小雪X,23400436,5596665,1597,4364,134894,87,"出发吧2021,舞蹈,翻跳,热舞,韩舞",大家新年快乐！,199,0.024103,0.000285,0.00078
22692,BV1zX4y1K7Df,http://www.bilibili.com/video/av713717232,【舞小喵】no.9竖屏 你想要的全都有！,2021-01-13 18:00:07,2,舞小喵,2223018,4354973,1640,3579,88256,458,"C位终结战,韩舞,KPOP,明星舞蹈,翻跳,no.9,T-ara,蹦迪",黑丝跟白丝不相上下..所以两个竖屏满足一下大家~！,199,0.020266,0.000377,0.000822
22693,BV1i5411p7b2,http://www.bilibili.com/video/av458468634,【徐珺大哥】Rollin 椅子舞,2021-01-03 19:30:03,3,徐珺大哥,34579852,3603009,760,1587,55147,85,"C位终结战,徐珺大哥,韩舞,翻跳,性感,爵士舞,Rollin,椅子舞,Brave Girls",注意事项：高跟鞋站小板凳不要学习噢，小心崴脚。\n创作类型：翻跳\n摄影：徐翔\n剪辑：徐珺,199,0.015306,0.000211,0.00044
22694,BV1wy4y1m7XB,http://www.bilibili.com/video/av798638816,【女团永动机】经典女团性感【Wiggle Wiggle】女团被禁的舞再现 | 练习室运动系列...,2021-01-13 19:30:07,4,小马儿66,212946109,3045775,1399,2721,88897,157,"C位终结战,韩舞翻跳,女团舞,KPOP,韩舞,舞蹈,翻跳,性感,跳舞,wiggle wiggle",指路：\n蓝裤 @戚予珠珠\n黑裤 @导弹头是KIKI啊\n黄裤 @Nicole阿纽扣\n粉...,199,0.029187,0.000459,0.000893
22695,BV14T4y1T7Wk,http://www.bilibili.com/video/av928422964,高二学生元旦晚会psycho翻跳,2021-01-02 14:45:04,5,歪歪啵你大脑门子,351364093,2579592,3717,2590,33643,202,"PSYCHO,REDVELVET,韩舞翻跳,高中生,元旦晚会,现场",没有韩舞基础 中间有一些地方因为太紧张跳得不好 希望大家见谅,199,0.013042,0.001441,0.001004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2284279,BV19f4y187ND,http://www.bilibili.com/video/av291185600,火星撞地球！英超“法德”世界波哪家强？,2021-06-15 19:57:38,996,英超联赛,1488532268,18662,63,256,84,375,"运动剪辑星,英超,足球,体育,英超联赛,欧洲杯",欧洲杯即将迎来法德大战\n英超法国球员、德国球员精彩进球奉上！\n\n关注英超联赛PL，英超...,249,0.004501,0.003376,0.013718
2284280,BV1Y64y1r7WW,http://www.bilibili.com/video/av761220102,葡萄牙3-0完胜，穆里尼奥眼里的MVP竟不是C罗？,2021-06-16 12:59:49,997,虎扑足球社区,521212790,18585,93,23,37,270,"足球,欧洲杯,穆里尼奥,C罗,葡萄牙",穆帅复盘葡萄牙3-0完胜匈牙利之战，全场最佳竟然不是C罗？,249,0.001991,0.005004,0.001238
2284281,BV1do4y1C7GV,http://www.bilibili.com/video/av375957606,"绯闻归化球员莱昂-琼斯比赛集锦-身强体壮,卡位准确,可谓新蒋光太",2021-06-04 15:15:47,998,数据体坛,592156933,18570,197,51,10,332,"运动剪辑星,足球,打卡挑战,体育,中超,中国队,归化球员,琼斯,中国足协",,249,0.000539,0.010609,0.002746
2284282,BV1qV41147Y8,http://www.bilibili.com/video/av418604950,[三语字幕]意大利国家队出征欧洲杯超燃官方宣传片,2021-06-12 13:55:39,999,劉傑瑞,542933823,18536,83,22,607,105,"体育,意大利,运动,欧洲杯,足球,意大利语,欧洲,意甲,国家队",https://www.youtube.com/watch?v=1oPLwEFU1fI,249,0.032747,0.004478,0.001187


In [21]:
def get_play_between_10k_100k_long_videos(df):
    return df[(df['play'] > 100000) & (df['duration'] > 300)]

In [22]:
def filter_df(df, year, num_views, count, duration):
    df_year = get_year_data(df, year)
    df_play_over_num_views = df_year.loc[(df_year['play'] > num_views) & (df_year['duration'] > duration)]
    author_counts = df_play_over_num_views['author'].value_counts()
    filtered_authors = author_counts[author_counts > count]
    res_df = df_year[df_year['author'].isin(filtered_authors.index.values)]
    return res_df

In [23]:
# def filter_df_with_rank(df, year, num_views, count, duration, rank):
#     df_year = get_year_data(df, year)
#     df_play_over_num_views = df_year.loc[(df_year['play'] > num_views) & (df_year['duration'] > duration)]
#     author_counts = df_play_over_num_views['author'].value_counts()
#     rank_offset = df.groupby('author')['rank_offset'].median()
#     # 筛选出同时满足条件的author
#     filtered_authors = rank_offset[(rank_offset < rank) & (author_counts > count)].index.tolist()
#     res_df = df_year[df_year['author'].isin(filtered_authors.index.values)]
#     return res_df

In [25]:
def filter_df_with_rank_in_a_year(df, year, num_views, count, duration, rank):
    
    # 算总体的rank_offset 排名 Calculate the median rank offset and author counts
    rank_offset = df.groupby('author')['rank_offset'].median()
    
    # Filter the dataframe based on the year, number of views, and duration all at once
    df_filtered = df[(df['pubdate'].dt.year == year) & (df['play'] > num_views) & (df['duration'] > duration)]
    
    author_counts = df_filtered['author'].value_counts()
    
    # Filter authors based on rank offset and author counts
    filtered_authors = rank_offset[(rank_offset < rank) & (author_counts > count)].index.tolist()
    
    # Filter the dataframe based on the filtered authors
    res_df = df_filtered[df_filtered['author'].isin(filtered_authors)]
    
    return res_df


In [26]:
def filter_df_with_rank_all_years(df, num_views, count, duration, rank):
    
    # 算总体的rank_offset 排名 Calculate the median rank offset and author counts
    rank_offset = df.groupby('author')['rank_offset'].median()
    
    # Filter the dataframe based on the year, number of views, and duration all at once
    df_filtered = df[(df['play'] > num_views) & (df['duration'] > duration)]
    
    author_counts = df_filtered['author'].value_counts()
    
    # Filter authors based on rank offset and author counts
    filtered_authors = rank_offset[(rank_offset < rank) & (author_counts > count)].index.tolist()
    
    # Filter the dataframe based on the filtered authors
    res_df = df_filtered[df_filtered['author'].isin(filtered_authors)]
    
    return res_df


In [28]:
df_rank_700 = filter_df_with_rank_all_years(df_all_cc, 50000, 2, 180, 700)

In [29]:
len(df_rank_700['author'].unique())

28610

In [30]:
type(df_rank_700['mid'].unique())

numpy.ndarray

In [32]:
import json

# 将ndarray转换为Python原生的数据类型
data = df_rank_700['mid'].unique().tolist()

# 将数据转换为JSON字符串
json_data = json.dumps(data)

# 将JSON字符串写入文件
with open('rank_700_mids.json', 'w') as file:
    file.write(json_data)

In [272]:
play_100k_2_2022_authors_r400 = filter_df_with_rank(df_all_cc, 2022, 100000, 2, 300, 400)

In [313]:
play_100k_2_authors_r500 = filter_df_with_rank_all_years(df_all_cc, 100000, 2, 300, 500)

In [309]:
play_100k_2_2022_authors_r500 = filter_df_with_rank(df_all_cc, 2022, 100000, 2, 300, 500)

In [314]:
len(play_100k_2_authors_r500['author'].unique())

8062

In [358]:
def run_all_steps(df, filter_tags_dict, res_file):
    # 添加cate_ids
    df = add_unique_columns(df)
    # 将df中author归类
    tag_group_df = combine_tags_with_author(df)
    # 高频tag去重
    remove_top_tags(tag_group_df, filter_tags_dict)
    # 找相似并保存结果
    find_similar_authors(tag_group_df, res_file)

In [311]:
run_all_steps(play_100k_2_2022_authors_r500, filter_tags_dict_top_3, 'res-100k_2_2022_authors_r500.json')

结果已保存到 res-100k_2_2022_authors_r500.json 文件中


In [317]:
run_all_steps(play_100k_2_authors_r500, filter_tags_dict_top_3, 'res-100k_2_all_authors_r500.json')

结果已保存到 res-100k_2_all_authors_r500.json 文件中


In [187]:
play_100k_2_2022_authors = filter_df(df_all_cc, 2022, 100000, 2, 300)

In [188]:
play_100k_2_2022_authors

Unnamed: 0,bvid,arcurl,title,pubdate,rank_offset,author,mid,play,review,video_review,favorites,duration,tag,description,cate_id,ratio_favorites,ratio_review,ratio_danmu
1,BV1BT4y127ex,http://www.bilibili.com/video/av935381316,我评测过的所有中国车，油管网友竟最喜欢它,2022-01-06 17:23:43,2,车轮哥Wheelsboy,343937521,780287,986,1671,1202,267,"遇见热爱汽车的你,VLOG,日常,五菱,MINIEV,电动汽车,中国车,小车",从2020年到2021年，我拍过数十辆中国车。我已经知道中国网友都喜欢哪款，但是外国网友呢？...,227,0.00154,0.001264,0.002142
2,BV1vZ4y1f7MU,http://www.bilibili.com/video/av380730964,缸垫渗油严重？防撞梁覆盖率低？理想ONE拆解报告汇总！横评系列之《真十万公里长测》,2022-01-14 10:00:11,3,易车横评,702014294,776740,3248,4220,6766,965,"遇见热爱汽车的你,谁是你心中汽车评测N0.1,理想ONE,汽车,拆车,评测,拆解",在上一集的10万公里测试中，理想ONE的表现还是比较理想的，没想到拆解后，却变得非常不理想，...,227,0.008711,0.004182,0.005433
3,BV1iq4y1c7rH,http://www.bilibili.com/video/av550847490,1年卖上千台车的车贩子告诉你，哪些车千万不能买！,2022-01-18 17:48:50,4,小胡说车,456869112,656350,536,1118,651,325,"2021哔哩哔哩汽车春晚,2021我和爱车的这一年,奥迪,记录,知识,汽车,二手车",今天又是大家喜闻乐见的车贩子亏钱专场，快来看下哪些二手车最不值得买！,227,0.000992,0.000817,0.001703
4,BV1yq4y117L6,http://www.bilibili.com/video/av592788930,详细测评魏牌拿铁DHT,2022-01-05 12:00:17,5,38号车评中心官方账号,39736779,617919,4073,17406,2195,1546,"遇见热爱汽车的你,原创,长城,魏派,魏牌,魏,拿铁,suv,测评",这次38号测评了魏牌拿铁DHT，这款车在许多方面相比前代产品有了巨大的改进和提高。具体内容敬请观看,227,0.003552,0.006591,0.028169
6,BV1H5411Z7JT,http://www.bilibili.com/video/av466103346,国民SUV强强对决 哈弗H6对拆长安CS75 PLUS,2022-01-28 11:38:08,7,易车横评,702014294,550549,2102,2583,3392,824,"汽车,汽车测评,哈弗H6,长安CS75 PLUS,拆解,自制",在本集节目里我们将对两部完成拆解的车进行关键位置的比对，看看玄冥二老中谁的结构设计更安全，技...,227,0.006161,0.003818,0.004692
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2276469,BV1fG411K76X,http://www.bilibili.com/video/av433341542,早上起床后必须要做的一件事情【Mandell博士】,2022-12-04 17:17:59,975,Mandell博士,1473247194,101034,169,90,1183,281,"记录我的健康生活,健康,健身,技巧,科普,医学,训练,医生,健身爆款创作营",我来到这里来帮助你和你们的家人，我的视频内容是关于临床营养学、神经科学、减肥、缓解疼痛等等。...,164,0.011709,0.001673,0.000891
2276473,BV1TW4y1T7xN,http://www.bilibili.com/video/av946398280,公认的运动之王，深蹲做不了的，你就这样做，一个月后整个人都变牛了,2022-12-18 19:51:42,979,MRs李的瑜伽,2132792873,100900,148,74,3586,332,"2022健身日记,健身,深蹲,训练,减肥,臀腿",-,164,0.03554,0.001467,0.000733
2276476,BV1y8411V7t7,http://www.bilibili.com/video/av221245384,【空腹有氧】超简单全身暴汗尊巴！连跳5遍不瘦不行 跟练：力克体育,2022-12-11 23:59:44,982,屁屁今天有氧了没,3461583300724787,100709,61,20,12306,870,"2022健身日记,坚持自律的每一天,尊巴,暴汗,减脂舞,全身",自用 侵删,164,0.122194,0.000606,0.000199
2276482,BV1gY411U7wb,http://www.bilibili.com/video/av264331403,kpop阳康人进！恢复一二阶段20min散步走！极度温和！玩着恢复！kpop韩团世界名曲！2...,2022-12-26 18:14:03,988,一粒尤初,488575971,100314,216,70,7441,2796,"运动,健身,减肥,新冠,疫情,kpop,exo,blackpink,阳康,newjeans",新年减脂营 报名进入倒数日了！想参加的快冲 尤带你们脱胎换骨\n阳康后2周内不要剧烈运动！ ...,164,0.074177,0.002153,0.000698


In [189]:
len(play_100k_2_2022_authors['author'].unique())

7434

In [186]:
len(filter_df(df_all_cc, 2022, 100000, 2, 300)['author'].unique())

7434

In [67]:
def combine_tags_with_author(df):
    df['tag'] = df['tag'].fillna("empty")
    df_author_group_tag = df.groupby('author').apply(lambda x: pd.Series({'author': x['author'].iloc[0], 'cate_ids': x['cate_ids'].iloc[0], 'mid': x['mid'].iloc[0], 'tag': ','.join(x['tag'])})).reset_index(drop=True)
    return df_author_group_tag

In [57]:
import pandas as pd
from collections import Counter

def tags_count(df):
    df_cc = df.copy()
    # 将tag列中的字符串按照","分割，得到一个包含所有tag的列表
    df_cc['tag'] = df_cc['tag'].str.split(',')

    # 使用groupby函数将相同的cate_id的tag放到一行，并使用Counter函数统计每个tag的频率
    df_grouped = df_cc.groupby('cate_id')['tag'].sum().apply(Counter)
    df_grouped_sort = df_grouped.apply(lambda x: sorted(x.items(), key=lambda item: item[1], reverse=True))
    print('df_grouped_sort:', df_grouped_sort)
    return df_grouped_sort


In [61]:
# 找到每个 cate_id 的前三个元素的 key
def top_n_tags(df, n):
    top_keys = df.apply(lambda x: [key for key, _ in x[:n]])
    filter_dict = top_keys.to_dict()
    return filter_dict

In [190]:
df_sorted_tag_counts = tags_count(play_100k_2_2022_authors)

df_grouped_sort: cate_id
121    [(GMV, 17), (古风, 13), (网络游戏, 12), (剑网3, 10), (...
122    [(技能提升营, 590), (经验分享, 446), (野生技术协会, 419), (技能...
124    [(社会, 1882), (社会观察局, 1238), (热点, 1204), (案件, 9...
126    [(搞笑, 51), (鬼畜调教, 43), (人力VOCALOID, 26), (芜湖大司...
130    [(说唱, 748), (音乐分享官, 738), (音乐, 734), (嘻哈, 681)...
                             ...                        
75     [(动物观察局, 603), (动物圈, 507), (萌宠, 376), (动物圈达人创作...
76     [(美食, 1417), (料理制作, 1137), (厨艺, 1052), (美食vlog...
85     [(创意广告, 7), (创意, 5), (短片, 5), (沙雕, 4), (搞笑, 4)...
86     [(特摄, 675), (假面骑士, 279), (奥特曼, 250), (动漫, 224)...
95     [(科技猎手, 3698), (数码, 1944), (手机, 1772), (电脑, 16...
Name: tag, Length: 109, dtype: object


In [192]:
filter_tags_dict_top_3 = top_n_tags(df_sorted_tag_counts, 3)

In [297]:
filter_tags_dict_top_3

{'121': ['GMV', '古风', '网络游戏'],
 '122': ['技能提升营', '经验分享', '野生技术协会'],
 '124': ['社会', '社会观察局', '热点'],
 '126': ['搞笑', '鬼畜调教', '人力VOCALOID'],
 '130': ['说唱', '音乐分享官', '音乐'],
 '136': ['音游', 'FNF', '周五夜放克'],
 '137': ['明星', '娱乐', '主播'],
 '138': ['搞笑', '搞笑研究所', '沙雕'],
 '152': ['LOVE LIVE!', 'Liella!', '日本'],
 '153': ['中国大陆', '普通话', '热血'],
 '154': ['跳舞', '舞蹈', '搞笑'],
 '156': ['舞蹈教程', '舞蹈教学', '分解教学'],
 '157': ['美妆', '化妆', '时尚'],
 '158': ['服饰', '时尚', '种草'],
 '159': ['时尚', '潮流', '服饰'],
 '161': ['手工', '一起做手工吧！', '手工制作'],
 '162': ['绘画', '画画', '手绘'],
 '164': ['健身', '减肥', '减脂'],
 '168': ['漫画解说', '漫画', '热血'],
 '17': ['单机游戏', '万物皆可游戏', '我的世界'],
 '170': ['中国大陆', '普通话', '战斗'],
 '171': ['电子竞技', '英雄联盟', '王者荣耀'],
 '172': ['原神UP主激励计划', '原神', '手游'],
 '173': ['桌游棋牌', '三国杀', '卡牌游戏'],
 '174': ['助眠', '放松', '触发音'],
 '176': ['汽车', '交通事故', '交通安全'],
 '178': ['英语', '自然', '美国'],
 '179': ['军事', '星海计划', '俄罗斯'],
 '182': ['UP影剧综指南', '电影解说', '影评杂谈'],
 '183': ['影视剪辑', '黑色幽默', '沈腾'],
 '184': ['预告片', '泰版花样男子', '泰版流星花园'],
 '187': 

In [204]:
filter_tags_dict_top_10 = top_n_tags(df_sorted_tag_counts, 10)

In [547]:
def add_unique_columns(df):
    # 创建一个新的 DataFrame，其中包含每个作者的唯一 cate_id 列表
    unique_cate_ids = df.groupby('author')['cate_id'].apply(lambda x: x.unique().tolist()).reset_index()
    unique_cate_ids.columns = ['author', 'cate_ids']
    # 创建一个新的 DataFrame，其中包含每个作者的唯一 title 列表
    unique_titles = df.groupby('author')['title'].apply(lambda x: x.unique().tolist()).reset_index()
    unique_titles.columns = ['author', 'all_titles']
    unique_tags = df.groupby('author')['tag'].apply(lambda x: x.unique().tolist()).reset_index()
    unique_tags.columns = ['author', 'all_tags']
    merge_df = df.merge(unique_cate_ids, on='author', how='left').merge(unique_tags, on='author', how='left').merge(unique_titles, on='author', how='left')
    return merge_df

In [548]:
play_100k_2_2022_authors = add_unique_columns(play_100k_2_2022_authors)

In [551]:
play_100k_2_2022_authors_tag_group

Unnamed: 0,author,cate_ids,mid,tag
0,--圈圈--,"[176, 17, 210, 95]",8784855,"自驾,跑车,开箱,自制,盲盒,汽车,生活,游戏,地平线,单机,自制,MINECRAFT,游戏..."
1,--小卢同学--,"[21, 212, 138, 157, 76]",403366542,"生活万花筒·闪闪发光的日常,我好爱我的生活,烧烤,生活记录,吃货,美食,操作,睡前,常规,情..."
2,--無幻--,[130],108856872,"华语乐坛,精选歌单,经典歌曲,流行音乐,MV,怀旧,听歌,青春,音乐选集,经典,精选歌单,经..."
3,-Emma-Z-,"[198, 199, 158, 29]",472409848,"alienz,kiss me more,doja cat,街头女战士,ygx,Emma,ki..."
4,-LKs-,"[218, 176, 250, 21, 230, 246, 30, 209, 124, 23...",125526,"跟着UP主创作吧（第三期）,今天你云吸猫了吗？,英短,生活,自制,学习,VLOG,宠物,猫,..."
...,...,...,...,...
7429,龙跃文,[157],170292658,"颜值逆袭挑战,跟着B站UP主学化妆,美妆,化妆,时尚,化妆教程,寻找全国最牛Tony老师,染..."
7430,龙飞律师,[124],527145352,"社会观察局,说说心理话,心理,同性,爱情,情感,伴侣,情感,谈恋爱,社会观察局,说说心理话,..."
7431,龙馍馍,[220],11164088,"萌宠星探官·百人百宠大会,动物圈,可爱,搞笑,喵星人,汪星人,沙雕动物,动物迷惑行为,打卡挑..."
7432,龟神小鹏,[238],1959855103,"钓鱼人的快乐时光,野钓,钓鱼,钓鱼技巧,鱼获满满,我在B站做UP主,野钓,钓鱼,垂钓,清道夫..."


In [91]:

for idx, row in sample.iterrows():
    print('author:  ', sample.iloc[idx]['author'])
    current_tags = str(row['tag']).split(',')
    for cate_id in row['cate_ids']:
        # If the cate_id is in the filter_tag dictionary
        if cate_id in filter_tags_dict:
            # Filter out the tags based on the filter_tags_dict
            filtered_tags = [tag for tag in current_tags if tag not in filter_tags_dict_top_5[cate_id]]
            # Update current_tags with the filtered tags for the next iteration
            print('len before filter: ', len(current_tags), 'len after filter: ', len(filtered_tags))
            current_tags = filtered_tags
            
            print('\n')
    # Update the filtered_tags column with the final filtered tags
    sample.at[idx, 'filtered_tags'] = ','.join(current_tags)


author:   --圈圈--
len before filter:  131 len after filter:  129


len before filter:  129 len after filter:  115


len before filter:  115 len after filter:  115


len before filter:  115 len after filter:  103


author:   --小卢同学--
len before filter:  224 len after filter:  196


len before filter:  196 len after filter:  169


len before filter:  169 len after filter:  166


len before filter:  166 len after filter:  153


len before filter:  153 len after filter:  152


author:   -A级英雄提督君-
len before filter:  218 len after filter:  191


author:   -Emma-Z-
len before filter:  546 len after filter:  544


len before filter:  544 len after filter:  472


len before filter:  472 len after filter:  472


len before filter:  472 len after filter:  472


author:   -LKs-
len before filter:  333 len after filter:  333


len before filter:  333 len after filter:  330


len before filter:  330 len after filter:  328


len before filter:  328 len after filter:  321


len before filter:  321 len

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample.at[idx, 'filtered_tags'] = ','.join(current_tags)


In [92]:
sample = play_300k_2022_authors_tag_groups.head(10)
for idx, row in sample.iterrows():
    print('author:  ', sample.iloc[idx]['author'])
    current_tags = str(row['tag']).split(',')
    for cate_id in row['cate_ids']:
        # If the cate_id is in the filter_tag dictionary
        if cate_id in filter_tags_dict:
            # Filter out the tags based on the filter_tags_dict
            filtered_tags = [tag for tag in current_tags if tag not in filter_tags_dict_top_3[cate_id]]
            # Update current_tags with the filtered tags for the next iteration
            print('len before filter: ', len(current_tags), 'len after filter: ', len(filtered_tags))
            current_tags = filtered_tags
            
            print('\n')
    # Update the filtered_tags column with the final filtered tags
    sample.at[idx, 'filtered_tags'] = ','.join(current_tags)


author:   --圈圈--
len before filter:  131 len after filter:  129


len before filter:  129 len after filter:  121


len before filter:  121 len after filter:  121


len before filter:  121 len after filter:  112


author:   --小卢同学--
len before filter:  224 len after filter:  202


len before filter:  202 len after filter:  179


len before filter:  179 len after filter:  177


len before filter:  177 len after filter:  176


len before filter:  176 len after filter:  176


author:   -A级英雄提督君-
len before filter:  218 len after filter:  217


author:   -Emma-Z-
len before filter:  546 len after filter:  544


len before filter:  544 len after filter:  513


len before filter:  513 len after filter:  513


len before filter:  513 len after filter:  513


author:   -LKs-
len before filter:  333 len after filter:  333


len before filter:  333 len after filter:  330


len before filter:  330 len after filter:  328


len before filter:  328 len after filter:  324


len before filter:  324 len

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample.at[idx, 'filtered_tags'] = ','.join(current_tags)


In [100]:
def remove_top_tags(df, filter_tags_dict):
    for idx, row in df.iterrows():
        current_tags = str(row['tag']).split(',')
        for cate_id in row['cate_ids']:
            # If the cate_id is in the filter_tag dictionary
            if cate_id in filter_tags_dict:
                # Filter out the tags based on the filter_tags_dict
                filtered_tags = [tag for tag in current_tags if tag not in filter_tags_dict[cate_id]]
                # Update current_tags with the filtered tags for the next iteration
                current_tags = filtered_tags
                # Update the filtered_tags column with the final filtered tags
                df.at[idx, 'filtered_tags'] = ','.join(current_tags)

In [198]:
play_100k_2_2022_authors_tag_group_cc = play_100k_2_2022_authors_tag_group.copy()

In [205]:
play_100k_2_2022_authors_tag_group_10 = play_100k_2_2022_authors_tag_group.copy()

In [210]:
remove_top_tags(play_100k_2_2022_authors_tag_group_10, filter_tags_dict_top_10)

In [199]:
remove_top_tags(play_100k_2_2022_authors_tag_group_cc, filter_tags_dict_top_3)

In [207]:
play_100k_2_2022_authors_tag_group_10

Unnamed: 0,author,cate_ids,mid,tag,filtered_tags
0,--圈圈--,"[176, 17, 210, 95]",8784855,"自驾,跑车,开箱,自制,盲盒,汽车,生活,游戏,地平线,单机,自制,MINECRAFT,游戏...","自驾,跑车,开箱,自制,盲盒,生活,游戏,地平线,单机,自制,MINECRAFT,游戏,MC..."
1,--小卢同学--,"[21, 212, 138, 157, 76]",403366542,"生活万花筒·闪闪发光的日常,我好爱我的生活,烧烤,生活记录,吃货,美食,操作,睡前,常规,情...","生活万花筒·闪闪发光的日常,我好爱我的生活,烧烤,生活记录,吃货,操作,睡前,常规,情侣,2..."
2,--無幻--,[130],108856872,"华语乐坛,精选歌单,经典歌曲,流行音乐,MV,怀旧,听歌,青春,音乐选集,经典,精选歌单,经...","华语乐坛,精选歌单,经典歌曲,流行音乐,MV,怀旧,听歌,青春,音乐选集,经典,精选歌单,经..."
3,-Emma-Z-,"[198, 199, 158, 29]",472409848,"alienz,kiss me more,doja cat,街头女战士,ygx,Emma,ki...","alienz,kiss me more,doja cat,街头女战士,ygx,Emma,ki..."
4,-LKs-,"[218, 176, 250, 21, 230, 246, 30, 209, 124, 23...",125526,"跟着UP主创作吧（第三期）,今天你云吸猫了吗？,英短,生活,自制,学习,VLOG,宠物,猫,...","跟着UP主创作吧（第三期）,今天你云吸猫了吗？,英短,学习,VLOG,宠物,猫,遇见热爱汽车..."
...,...,...,...,...,...
7429,龙跃文,[157],170292658,"颜值逆袭挑战,跟着B站UP主学化妆,美妆,化妆,时尚,化妆教程,寻找全国最牛Tony老师,染...","颜值逆袭挑战,跟着B站UP主学化妆,化妆教程,寻找全国最牛Tony老师,染发,发色,亢奋,美..."
7430,龙飞律师,[124],527145352,"社会观察局,说说心理话,心理,同性,爱情,情感,伴侣,情感,谈恋爱,社会观察局,说说心理话,...","说说心理话,心理,同性,爱情,情感,伴侣,情感,谈恋爱,说说心理话,婚姻,夫妻,情感,两性,..."
7431,龙馍馍,[220],11164088,"萌宠星探官·百人百宠大会,动物圈,可爱,搞笑,喵星人,汪星人,沙雕动物,动物迷惑行为,打卡挑...","萌宠星探官·百人百宠大会,动物圈,可爱,搞笑,喵星人,汪星人,沙雕动物,动物迷惑行为,打卡挑..."
7432,龟神小鹏,[238],1959855103,"钓鱼人的快乐时光,野钓,钓鱼,钓鱼技巧,鱼获满满,我在B站做UP主,野钓,钓鱼,垂钓,清道夫...","钓鱼人的快乐时光,野钓,钓鱼技巧,鱼获满满,我在B站做UP主,野钓,垂钓,清道夫,目标,我在..."


In [211]:
mean_length = play_100k_2_2022_authors_tag_group_10['filtered_tags'].apply(lambda x: len(x)).mean()
print("字符数量平均值:", mean_length)

字符数量平均值: 761.7175141242938


In [208]:

mean_tag_length = play_100k_2_2022_authors_tag_group_10['tag'].apply(lambda x: len(x)).mean()
print("字符数量平均值:", mean_tag_length)

字符数量平均值: 1051.4035512510088


In [357]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json

def find_similar_authors(df, res_file):
    
    # 将逗号分隔的标签转换为空格分隔的标签
    df['filtered_tags'] = df['filtered_tags'].apply(lambda x: ' '.join(str(x).split(',')))

    # 使用TF-IDF计算标签的权重
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['filtered_tags'])

    # 计算标签之间的余弦相似度
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # 为每个作者找到最相似的其他作者
    results = {}
    for i in range(len(df)):
        similar_indices = cosine_sim[i].argsort()[:-33:-1]  # 获取最相似的10个作者的索引
        similar_items = [(cosine_sim[i][j], df['author'][j], int(df['mid'][j])) for j in similar_indices]  # 获取相似度、作者名称和mid
        similar_items = sorted(similar_items, key=lambda x: x[0], reverse=True)  # 根据相似度排序
        results[df['author'][i]] = [{'author': author, 'mid': mid, 'similarity': similarity} for similarity, author, mid in similar_items[1:]]

    # 将结果保存到 JSON 文件
    with open(res_file, 'w') as f:
        json.dump(results, f, indent=4)

    # 打印保存成功信息
    print('结果已保存到', res_file, '文件中')


In [212]:
find_similar_authors(play_100k_2_2022_authors_tag_group_10, 'play_100k_2_2022_filter_top_10.json')

结果已保存到 play_100k_2_2022_filter_top_10.json 文件中


In [251]:
play_100k_2_2022_authors_tag_group_10

Unnamed: 0,author,cate_ids,mid,tag,filtered_tags
0,--圈圈--,"[176, 17, 210, 95]",8784855,"自驾,跑车,开箱,自制,盲盒,汽车,生活,游戏,地平线,单机,自制,MINECRAFT,游戏...",自驾 跑车 自制 盲盒 生活 游戏 地平线 自制 MINECRAFT 游戏 建筑 MINEC...
1,--小卢同学--,"[21, 212, 138, 157, 76]",403366542,"生活万花筒·闪闪发光的日常,我好爱我的生活,烧烤,生活记录,吃货,美食,操作,睡前,常规,情...",生活万花筒·闪闪发光的日常 我好爱我的生活 烧烤 操作 睡前 常规 情侣 2021美食年度大...
2,--無幻--,[130],108856872,"华语乐坛,精选歌单,经典歌曲,流行音乐,MV,怀旧,听歌,青春,音乐选集,经典,精选歌单,经...",华语乐坛 经典歌曲 流行音乐 MV 怀旧 青春 经典 经典 怀旧 影视音乐 循环 华语音乐 ...
3,-Emma-Z-,"[198, 199, 158, 29]",472409848,"alienz,kiss me more,doja cat,街头女战士,ygx,Emma,ki...",alienz kiss me more doja cat 街头女战士 ygx Emma ki...
4,-LKs-,"[218, 176, 250, 21, 230, 246, 30, 209, 124, 23...",125526,"跟着UP主创作吧（第三期）,今天你云吸猫了吗？,英短,生活,自制,学习,VLOG,宠物,猫,...",跟着UP主创作吧（第三期） 今天你云吸猫了吗？ 英短 宠物 遇见热爱汽车的你3.0 我的车日...
...,...,...,...,...,...
7429,龙跃文,[157],170292658,"颜值逆袭挑战,跟着B站UP主学化妆,美妆,化妆,时尚,化妆教程,寻找全国最牛Tony老师,染...",颜值逆袭挑战 跟着B站UP主学化妆 寻找全国最牛Tony老师 染发 发色 亢奋 美发 遮瑕 ...
7430,龙飞律师,[124],527145352,"社会观察局,说说心理话,心理,同性,爱情,情感,伴侣,情感,谈恋爱,社会观察局,说说心理话,...",说说心理话 同性 爱情 伴侣 谈恋爱 说说心理话 婚姻 夫妻 两性 说说心理话 爱情 婚姻 ...
7431,龙馍馍,[220],11164088,"萌宠星探官·百人百宠大会,动物圈,可爱,搞笑,喵星人,汪星人,沙雕动物,动物迷惑行为,打卡挑...",萌宠星探官·百人百宠大会 动物圈 喵星人 汪星人 沙雕动物 动物迷惑行为 打卡挑战 抖肩舞 ...
7432,龟神小鹏,[238],1959855103,"钓鱼人的快乐时光,野钓,钓鱼,钓鱼技巧,鱼获满满,我在B站做UP主,野钓,钓鱼,垂钓,清道夫...",钓鱼人的快乐时光 钓鱼技巧 鱼获满满 垂钓 清道夫 目标 一起记录2022 巨物 刺激 钓鱼...


In [158]:
play_500k_2_300k_5_2022_authors_tag_groups['author'].unique()

array(['胖虎老刘'], dtype=object)

In [157]:
play_500k_2_300k_5_2022_authors_tag_groups

Unnamed: 0,author,cate_ids,mid,tag,filtered_tags
0,胖虎老刘,"[176, 17, 210, 95]",8784855,"自驾,跑车,开箱,自制,盲盒,汽车,生活,游戏,地平线,单机,自制,MINECRAFT,游戏...",自驾 跑车 开箱 自制 盲盒 生活 游戏 地平线 单机 自制 MINECRAFT 游戏 MC...
1,胖虎老刘,"[21, 212, 138, 157, 76]",403366542,"生活万花筒·闪闪发光的日常,我好爱我的生活,烧烤,生活记录,吃货,美食,操作,睡前,常规,情...",生活万花筒·闪闪发光的日常 我好爱我的生活 烧烤 吃货 操作 睡前 常规 情侣 2021美食...
2,胖虎老刘,[47],339602307,"明日方舟创作者应援计划,脑洞,明日方舟,夕,自在,沙雕,明日方舟创作者应援计划第二期,脑洞,...",明日方舟创作者应援计划 明日方舟 夕 自在 沙雕 明日方舟创作者应援计划第二期 刀客塔 要素...
3,胖虎老刘,"[198, 199, 158, 29]",472409848,"alienz,kiss me more,doja cat,街头女战士,ygx,Emma,ki...",alienz kiss me more doja cat 街头女战士 ygx Emma ki...
4,胖虎老刘,"[218, 176, 250, 21, 230, 246, 30, 209, 124, 23...",125526,"跟着UP主创作吧（第三期）,今天你云吸猫了吗？,英短,生活,自制,学习,VLOG,宠物,猫,...",跟着UP主创作吧（第三期） 今天你云吸猫了吗？ 英短 自制 学习 VLOG 宠物 猫 遇见热...
...,...,...,...,...,...
9004,胖虎老刘,[124],527145352,"社会观察局,说说心理话,心理,同性,爱情,情感,伴侣,情感,谈恋爱,社会观察局,说说心理话,...",说说心理话 心理 同性 爱情 伴侣 谈恋爱 说说心理话 婚姻 夫妻 两性 说说心理话 爱情 ...
9005,胖虎老刘,[220],11164088,"萌宠星探官·百人百宠大会,动物圈,可爱,搞笑,喵星人,汪星人,沙雕动物,动物迷惑行为,打卡挑...",萌宠星探官·百人百宠大会 动物圈 可爱 喵星人 汪星人 沙雕动物 动物迷惑行为 打卡挑战 抖...
9006,胖虎老刘,"[220, 138]",1276875126,"搞笑配音,搞笑视频,搞笑动物配音,鳄鱼,豹子,反杀,搞笑视频,搞笑配音,打卡挑战,搞笑配音,...",搞笑配音 搞笑视频 搞笑动物配音 鳄鱼 豹子 反杀 搞笑视频 搞笑配音 打卡挑战 搞笑配音 ...
9007,胖虎老刘,[171],384419683,"KDA,英雄联盟,精彩集锦,电子竞技,阿狸,MOBA,MOBA,精彩集锦,搞笑,英雄联盟,电...",KDA 精彩集锦 阿狸 MOBA MOBA 精彩集锦 娱乐 MOBA 精彩集锦 娱乐 MOB...


In [142]:
filtered_tags = play_500k_2_300k_5_2022_authors_tag_groups.loc[play_500k_2_300k_5_2022_authors_tag_groups['author'] == 'aa', 'filtered_tags'].values[0]
print("filtered_tags:", filtered_tags)

IndexError: index 0 is out of bounds for axis 0 with size 0

In [322]:
find_similar_authors(play_300k_2022_authors_tag_groups, 'res-300k-3-2022-with-mids.json')

结果已保存到 res-300k-3-2022-with-mids.json 文件中


In [255]:
find_similar_authors(play_300k_2022_authors_tag_groups, 'res-300k-3-2022.json')

结果已保存到 res-300k-3-2022.json 文件中


In [297]:
def print_author_tag_frequency(df, author):
    author_tags = df[df['author'] == author]['tag'].str.split(' ')
    tag_count = {}
    for tags in bb:
        for tag in tags:
            if tag in tag_count:
                tag_count[tag] += 1
            else:
                tag_count[tag] = 1
    # 打印每个标签及其出现次数

    import operator
    sorted_tags = sorted(tag_count.items(), key=operator.itemgetter(1), reverse=True)
    for tag, count in sorted_tags:
        print(tag, count)

In [336]:
play_300k_2022_authors_tag_groups.head()

Unnamed: 0,author,mid,tag
0,--圈圈--,8784855,自驾 跑车 开箱 自制 盲盒 汽车 生活 游戏 地平线 单机 自制 MINECRAFT 游戏...
1,--小卢同学--,403366542,生活万花筒·闪闪发光的日常 我好爱我的生活 烧烤 生活记录 吃货 美食 操作 睡前 常规 情...
2,-A级英雄提督君-,339602307,明日方舟创作者应援计划 脑洞 明日方舟 夕 自在 沙雕 明日方舟创作者应援计划第二期 脑洞 ...
3,-Emma-Z-,472409848,alienz kiss me more doja cat 街头女战士 ygx Emma ki...
4,-LKs-,125526,跟着UP主创作吧（第三期） 今天你云吸猫了吗？ 英短 生活 自制 学习 VLOG 宠物 猫 ...


In [93]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json

def find_similar_authors_by_tags(df, res_file):
    
    # 将逗号分隔的标签转换为空格分隔的标签
    df['filtered_tags'] = df['filtered_tags'].apply(lambda x: ' '.join(x.split(',')))

    # 使用TF-IDF计算标签的权重
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['filtered_tags'])

    # 计算标签之间的余弦相似度
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # 为每个作者找到最相似的其他作者
    results = {}
    for i in range(len(df)):
        similar_indices = cosine_sim[i].argsort()[:-13:-1]  # 获取最相似的10个作者的索引
        similar_items = [(cosine_sim[i][j], df['author'][j], int(df['mid'][j])) for j in similar_indices]  # 获取相似度、作者名称和mid
        similar_items = sorted(similar_items, key=lambda x: x[0], reverse=True)  # 根据相似度排序
        results[df['author'][i]] = [{'author': author, 'mid': mid, 'similarity': similarity} for similarity, author, mid in similar_items[1:]]

    # 将结果保存到 JSON 文件
    with open(res_file, 'w') as f:
        json.dump(results, f, indent=4)

    # 打印保存成功信息
    print('结果已保存到', res_file, '文件中')

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import json

def find_similar_authors_with_bert(df, res_file):

    # 加载预训练的BERT模型和分词器
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    model = AutoModel.from_pretrained('bert-base-uncased')

    # 获取每个作者标签文本的BERT嵌入表示
    embeddings = []
    for tag in df['tag']:
        inputs = tokenizer(tag, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())  # 将嵌入向量转换为一维数组

    # 计算嵌入表示的余弦相似度
    similarity_matrix = cosine_similarity(embeddings)

    results = {}
    for i in range(len(df)):
        similar_indices = similarity_matrix[i].argsort()[:-13:-1]  # 获取最相似的10个作者的索引
        similar_items = [(similarity_matrix[i][j], df['author'][j]) for j in similar_indices]  # 获取相似度、作者名称
        similar_items = sorted(similar_items, key=lambda x: x[0], reverse=True)  # 根据相似度排序
        results[df['author'][i]] = [{'author': author, 'similarity': similarity} for similarity, author in similar_items[1:]]

    # 将结果保存到 JSON 文件
    with open(res_file, 'w') as f:
        json.dump(results, f, indent=4)

    # 打印保存成功信息
    print('结果已保存到', res_file, '文件中')

# 读取数据
df = pd.read_csv('your_data.csv')



In [360]:
pang = search_videos_by_author(df_all_cc, '丸子Tina')

In [361]:
pang.head(3)

Unnamed: 0,bvid,arcurl,title,pubdate,rank_offset,author,mid,play,review,video_review,favorites,duration,tag,description,cate_id,ratio_favorites,ratio_review,ratio_danmu
278866,BV1NM4y1L7dT,http://www.bilibili.com/video/av932318964,【vlog】8月酷暑下爬山恐怕是疯了吧 | 野外空宅 古藤溪流 香港小岚山,2021-08-15 18:00:14,396,丸子Tina,27694399,159071,219,367,1150,847,"我的夏日活动记录,爬山,VLOG日常,周末,粤语,旅拍,家里有矿,喝有矿更有矿,情侣,香港旅行",最近运动细胞旺盛，我跟笙哥不知天高地厚地在酷暑下去爬山，幸好最后活着回来，发个vlog纪念一...,250,0.007229,0.001377,0.002307
280416,BV1JU4y1A7NU,http://www.bilibili.com/video/av675667486,【vlog】香港秘境探索 有机有氧的生活,2021-09-23 18:38:56,946,丸子Tina,27694399,62799,106,161,381,664,"我的夏日活动记录,VLOG,生活记录,生活,香港,旅游,浮潜,开心,情侣,情侣小日常",之前在动态剧透的vlog终于来啦！香港的自然生态氧吧，让身体皮肤和精神都得到放松！趁着假期大...,250,0.006067,0.001688,0.002564
643670,BV1Aq4y1C7ze,http://www.bilibili.com/video/av593311874,OOTD|英语老师的职场半裙穿搭！好看的衣服总能找到季节穿哈哈哈哈,2022-01-20 18:02:39,438,丸子Tina,27694399,139384,140,101,1586,207,"时尚穿搭,通勤穿搭,职场,裙装,OOTD,春季穿搭",听了大家反馈，用英语拍穿搭。你们喜欢吗！,158,0.011379,0.001004,0.000725


In [153]:
pang['ratio_review'].mean()

0.004331346777589264

In [149]:
ly = search_videos_by_author(play_500k_2_300k_5_2022_authors, '林亦LYi')

In [321]:
df_all_cc.groupby('author')['rank_offset'].median()

In [322]:
max_pubdate

author
--圈圈--      2022-12-08 19:10:00
--小卢同学--    2022-12-31 17:25:00
-A级英雄提督君-   2022-12-31 10:51:09
-Emma-Z-    2022-12-30 14:48:04
-LKs-       2022-11-25 18:00:00
                    ...        
龙飞律师        2022-09-28 20:49:13
龙馍馍         2022-12-31 08:00:00
龙龙哥搞笑配音     2022-12-31 18:22:55
龙龙特烦恼       2022-09-23 09:42:07
누을NuEul     2022-10-24 15:30:00
Name: pubdate, Length: 9009, dtype: datetime64[ns]

In [323]:
pd.Timestamp.now().normalize()

Timestamp('2023-07-06 00:00:00')

In [319]:
play_500k_2_300k_5_2022_authors

Unnamed: 0,bvid,arcurl,title,pubdate,rank_offset,author,mid,play,review,video_review,...,duration,tag,description,cate_id,ratio_favorites,ratio_review,ratio_danmu,cate_ids,all_tags,all_titles
0,BV1vZ4y1f7MU,http://www.bilibili.com/video/av380730964,缸垫渗油严重？防撞梁覆盖率低？理想ONE拆解报告汇总！横评系列之《真十万公里长测》,2022-01-14 10:00:11,3,易车横评,702014294,776740,3248,4220,...,965,"遇见热爱汽车的你,谁是你心中汽车评测N0.1,理想ONE,汽车,拆车,评测,拆解",在上一集的10万公里测试中，理想ONE的表现还是比较理想的，没想到拆解后，却变得非常不理想，...,227,0.008711,0.004182,0.005433,"[227, 176, 247, 246, 240]","[遇见热爱汽车的你,谁是你心中汽车评测N0.1,理想ONE,汽车,拆车,评测,拆解, 汽车,...","[缸垫渗油严重？防撞梁覆盖率低？理想ONE拆解报告汇总！横评系列之《真十万公里长测》, 国民..."
1,BV1iq4y1c7rH,http://www.bilibili.com/video/av550847490,1年卖上千台车的车贩子告诉你，哪些车千万不能买！,2022-01-18 17:48:50,4,小胡说车,456869112,656350,536,1118,...,325,"2021哔哩哔哩汽车春晚,2021我和爱车的这一年,奥迪,记录,知识,汽车,二手车",今天又是大家喜闻乐见的车贩子亏钱专场，快来看下哪些二手车最不值得买！,227,0.000992,0.000817,0.001703,"[227, 176, 251, 246, 239, 238]","[2021哔哩哔哩汽车春晚,2021我和爱车的这一年,奥迪,记录,知识,汽车,二手车, 20...","[1年卖上千台车的车贩子告诉你，哪些车千万不能买！, 【年终总结】一年卖873台车！哪些二手..."
2,BV1yq4y117L6,http://www.bilibili.com/video/av592788930,详细测评魏牌拿铁DHT,2022-01-05 12:00:17,5,38号车评中心官方账号,39736779,617919,4073,17406,...,1546,"遇见热爱汽车的你,原创,长城,魏派,魏牌,魏,拿铁,suv,测评",这次38号测评了魏牌拿铁DHT，这款车在许多方面相比前代产品有了巨大的改进和提高。具体内容敬请观看,227,0.003552,0.006591,0.028169,"[227, 176, 246, 124]","[遇见热爱汽车的你,原创,长城,魏派,魏牌,魏,拿铁,suv,测评, 匠人大会云市集,原创,...","[详细测评魏牌拿铁DHT, 详细测评岚图FREE, 测评比亚迪唐DM-i, 详细测试全新汉兰..."
3,BV1H5411Z7JT,http://www.bilibili.com/video/av466103346,国民SUV强强对决 哈弗H6对拆长安CS75 PLUS,2022-01-28 11:38:08,7,易车横评,702014294,550549,2102,2583,...,824,"汽车,汽车测评,哈弗H6,长安CS75 PLUS,拆解,自制",在本集节目里我们将对两部完成拆解的车进行关键位置的比对，看看玄冥二老中谁的结构设计更安全，技...,227,0.006161,0.003818,0.004692,"[227, 176, 247, 246, 240]","[遇见热爱汽车的你,谁是你心中汽车评测N0.1,理想ONE,汽车,拆车,评测,拆解, 汽车,...","[缸垫渗油严重？防撞梁覆盖率低？理想ONE拆解报告汇总！横评系列之《真十万公里长测》, 国民..."
5,BV1CL4y1t75g,http://www.bilibili.com/video/av850450557,全网横评首测丰田赛那，5款热门七座MPV大比拼，看完还会加价买它吗？,2022-01-03 09:36:39,12,易车横评,702014294,364072,1451,7509,...,2706,"本田,丰田,奥德赛,别克GL8,起亚嘉华,赛那,传祺M8,MPV横评",本期横评车型：丰田赛那、传祺M8、起亚嘉华、本田奥德赛、别克GL8\n往期精彩内容： \nB...,227,0.010517,0.003985,0.020625,"[227, 176, 247, 246, 240]","[遇见热爱汽车的你,谁是你心中汽车评测N0.1,理想ONE,汽车,拆车,评测,拆解, 汽车,...","[缸垫渗油严重？防撞梁覆盖率低？理想ONE拆解报告汇总！横评系列之《真十万公里长测》, 国民..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344517,BV1fG411K76X,http://www.bilibili.com/video/av433341542,早上起床后必须要做的一件事情【Mandell博士】,2022-12-04 17:17:59,975,Mandell博士,1473247194,101034,169,90,...,281,"记录我的健康生活,健康,健身,技巧,科普,医学,训练,医生,健身爆款创作营",我来到这里来帮助你和你们的家人，我的视频内容是关于临床营养学、神经科学、减肥、缓解疼痛等等。...,164,0.011709,0.001673,0.000891,"[201, 164]","[2022健身FLAG,健康,科普,医生,医学,技巧,健身爆款创作营, 打造我的好身材,健身...","[一个使用双手就有效的简单动作，轻松缓解颈部疼痛和颈部活动受限问题【Mandell】, 尝试..."
344518,BV1fR4y1C7Pp,http://www.bilibili.com/video/av348650649,哇是真的帅,2022-12-12 18:58:59,976,410林杏光KiKi健身,569670486,100981,96,18,...,11,"肌肉,健身,腹肌,肌肉男",-,164,0.004506,0.000951,0.000178,[164],"[肌肉,健身,运动,减脂,塑形, 塑形,肌肉,训练,健身,健美, 健身,减肥,塑形,健美,肌...","[腹部训练，亲妈级别教学，好好听，几个关键点要记住, 这是肩膀训练日收尾动作，重量不用很重,..."
344519,BV17K411i7dP,http://www.bilibili.com/video/av476842663,“我真的很努力了，实在不行，就算了吧”,2022-12-27 16:43:27,978,郭里个Nina,521720756,100920,155,36,...,74,"2022健身日记,坚持自律的每一天,努力,励志,正能量,自律",-,164,0.016072,0.001536,0.000357,[164],"[健身,塑形,运动,健康,日常,生活, 2022健身FLAG,健身,减肥,运动,健康,励志,...","[积极一点，大胆一点，你的人生没那么糟！, 【130斤到100斤】任何事情，只要你想做就可以..."
344520,BV1gY411U7wb,http://www.bilibili.com/video/av264331403,kpop阳康人进！恢复一二阶段20min散步走！极度温和！玩着恢复！kpop韩团世界名曲！2...,2022-12-26 18:14:03,988,一粒尤初,488575971,100314,216,70,...,2796,"运动,健身,减肥,新冠,疫情,kpop,exo,blackpink,阳康,newjeans",新年减脂营 报名进入倒数日了！想参加的快冲 尤带你们脱胎换骨\n阳康后2周内不要剧烈运动！ ...,164,0.074177,0.002153,0.000698,[164],"[减肥操,有氧运动,健身操,塑形瘦身, 打卡挑战,减肥,塑形瘦身,舞蹈,帕梅拉,女团, 健身...",[自用21天瘦10斤！｜初级活力kpop+帕梅拉35min有氧操 跳到tomboy你就赢了！...


In [285]:
search_videos_by_author(df_all_cc, '恐龙帝皇')

Unnamed: 0,bvid,arcurl,title,pubdate,rank_offset,author,mid,play,review,video_review,favorites,duration,tag,description,cate_id,ratio_favorites,ratio_review,ratio_danmu
68312,BV1Yr4y1h7ff,http://www.bilibili.com/video/av766360808,【猎豹/美洲狮】这么大只的猛兽却只会喵喵叫！,2022-02-07 21:02:35,731,恐龙帝皇,71351302,207260,567,714,1313,951,"知识分享官,自然,生物,古生物,猫科,猎豹,美洲狮,大猫,动物,细腰猫",本期介绍一支起源于北美的、却走向了大型化的道路猫亚科，甚至一路返攻回旧大路，与一众强敌同台竞...,218,0.006335,0.002736,0.003445
200741,BV14N411R7NR,http://www.bilibili.com/video/av501602124,【拜年祭正品】逆浪千秋——辐鳍鱼类,2021-02-11 17:00:15,268,恐龙帝皇,71351302,74522,205,139,977,244,"新春音乐颂,MV,音乐,歌曲,鱼,自然,古生物,鱼类,混剪,高燃",祝大家新年快乐！,193,0.01311,0.002751,0.001865
404402,BV1si4y1c7fb,http://www.bilibili.com/video/av543654853,是可爱的威德尔豹豹，它过来了,2021-01-17 11:47:06,516,恐龙帝皇,71351302,12170,41,8,112,8,小视频,-\n是可爱的威德尔豹豹，它过来了,75,0.009203,0.003369,0.000657
837977,BV1Wv411h7ht,http://www.bilibili.com/video/av246994472,【言和】拜年祭重制 《逆浪千秋》——辐鳍鱼类 亿万填词,2021-02-27 07:00:16,176,恐龙帝皇,71351302,32265,65,97,485,244,"拜年祭,填词,中文填词,中文翻唱,2020拜年祭,古风,百万填词,古生物,鱼类,动物",《逆浪千秋》重制版。,30,0.015032,0.002015,0.003006
838853,BV1uy4y1E7rS,http://www.bilibili.com/video/av802113471,【言和】《冠世一战》——旧日支配者 致头足纲,2021-03-09 17:00:18,52,恐龙帝皇,71351302,97583,249,176,1441,285,"VOCALOIDCHINA,填词,中文填词,冠世一战,古风,填词翻唱,动物,古生物,头足纲,...",《冠世一战》献给头足纲。,30,0.014767,0.002552,0.001804
2061835,BV1Lz4y1q7UW,http://www.bilibili.com/video/av571858402,风神翼龙赶走霸王龙合理吗？《史前星球2》,2023-06-02 11:00:00,841,恐龙帝皇,71351302,100750,922,345,839,407,"科普,纪录片,自然,恐龙,霸王龙,古生物,风神翼龙,科学很可爱,史前星球,史前星球2,202...",很多人都在吐槽风神翼龙赶走霸王龙不合理，真的不合理吗？,201,0.008328,0.009151,0.003424
2126842,BV1B34y1z7Mk,http://www.bilibili.com/video/av807801334,两大巅峰食肉恐龙——霸王龙和南方巨兽龙硬件分析,2022-01-01 17:48:51,857,恐龙帝皇,71351302,257461,1007,690,1589,782,"知识分享官,自然,古生物,科学,恐龙,霸王龙,南方巨兽龙,鲨齿龙,牙齿,暴龙",本期对比分析了霸王龙和鲨齿龙亚科的硬件。,201,0.006172,0.003911,0.00268
2127906,BV1Ru411X7rQ,http://www.bilibili.com/video/av509315813,海象能秒杀北极熊？吐槽猛兽大对决,2022-02-26 11:30:10,921,恐龙帝皇,71351302,140570,679,501,932,629,"知识分享官,生物,科普,动物,纪录片,吐槽,猛兽大对决,北极熊,海象,猛兽",本期继续吐槽猛兽大对决。,201,0.00663,0.00483,0.003564
2130798,BV11R4y1P7YF,http://www.bilibili.com/video/av341157147,湾鳄打得过大白鲨？吐槽猛兽大对决,2022-05-03 09:00:00,813,恐龙帝皇,71351302,304197,1071,719,2151,833,"万物研究所,生物,吐槽,纪录片,大白鲨,湾鳄,鲨鱼,海洋生物,鳄鱼,猛兽",本期吐槽猛兽大对决的大白鲨vs湾鳄,201,0.007071,0.003521,0.002364
2136947,BV1xP4y1U7Pn,http://www.bilibili.com/video/av902267555,棘龙能打跑鲨齿龙原来是鲨齿龙太弱，吐槽恐龙星球,2022-11-05 11:00:00,965,恐龙帝皇,71351302,272885,351,430,3402,668,"万物研究所·2022第四期,万物研究所,恐龙,科普,古生物,地球,吐槽,鲨齿龙,棘龙,纪录片...",本期吐槽恐龙星球,201,0.012467,0.001286,0.001576


In [529]:
search_videos_by_author(df_filtered_2021_hot, '果果yo饿了')['rank_offset'].median()

nan

In [525]:
grouped = df_all_cc.groupby('author')

# 有3个热门 + 视频时长 + 最新视频 + 视频播放 + 综合排序
df_filtered_2021_hot = grouped.filter(lambda x: (x['author'].value_counts() > 2).all() and (x['pubdate'].max().year > 2021) and (x['play'].median() > 50000) and (x['duration'].median() > 300) and (x['rank_offset'].median() < 700))
df_filtered_2022_hot = grouped.filter(lambda x: (x['author'].value_counts() > 2).all() and (x['pubdate'].max().year > 2022) and (x['play'].median() > 50000) and (x['duration'].median() > 300) and (x['rank_offset'].median() < 700))



In [528]:
run_all_steps(df_filtered_2021_hot, filter_tags_dict_top_3, 'res-df_filtered_2021_hot_r700.json')
run_all_steps(df_filtered_2022_hot, filter_tags_dict_top_3, 'res-df_filtered_2022_hot_r700.json')

结果已保存到 res-df_filtered_2021_hot_r700.json 文件中
结果已保存到 res-df_filtered_2022_hot_r700.json 文件中


In [530]:
len(df_filtered_2021_hot.groupby('author'))

10714

In [534]:
type(df_filtered_2021_hot.groupby('author')['mid'].unique())

pandas.core.series.Series

In [536]:
import pandas as pd

# 将Series转换为DataFrame
df = df_filtered_2022_hot.groupby('author')['mid'].unique().reset_index(name='mid')

# 将DataFrame保存为JSON文件
df.to_json('new-after-2023-rank-700-7-6.json', orient='records')

In [531]:
search_videos_by_author(df_filtered_2021_hot, '胖虎老刘')['play'].median()

nan

In [523]:
search_videos_by_author(df_filtered, '胖虎老刘')

Unnamed: 0,bvid,arcurl,title,pubdate,rank_offset,author,mid,play,review,video_review,favorites,duration,tag,description,cate_id,ratio_favorites,ratio_review,ratio_danmu
373617,BV1S24y1w7rU,http://www.bilibili.com/video/av782405540,重磅！微软宣布开源Deep Speed Chat，人人拥有ChatGPT,2023-04-13 14:26:57,222,门罗币公主,70581488,89695,740,110,3437,335,"开源,微软,编程,人工智能,AI,Microsoft,ChatGPT来啦,科技猎手2023",,231,0.038319,0.00825,0.001226


In [520]:
search_videos_by_author(df_all_cc, '门罗币公主')['duration'].median()

335.0

In [None]:
# 我认为优质的Up 没有被加进来 大沛沛沛吖 丸子Tina 胖虎老刘


In [293]:
filter_tags_dict_top_5.get('30')

['VOCALOID', '洛天依', '虚拟歌手', '中文VOCALOID', 'ACE虚拟歌姬']

In [245]:
df_all_cc['rank_offset'].quantile(0.)

392.0

In [None]:
#果果yo饿了 林亦LYi 胖虎老刘 奇妙国国 绵羊料理 左一弗利 恐龙帝皇 图灵的猫 

In [326]:
# def filter_df_with_rank(df, year, num_views, count, duration, rank):
    
#     # 算总体的rank_offset 排名 Calculate the median rank offset and author counts
#     rank_offset = df.groupby('author')['rank_offset'].median()
    
#     # Filter the dataframe based on the year, number of views, and duration all at once
#     df_filtered = df[(df['pubdate'].dt.year == year) & (df['play'] > num_views) & (df['duration'] > duration)]
    
#     author_counts = df_filtered['author'].value_counts()
    
#     # Filter authors based on rank offset and author counts
#     filtered_authors = rank_offset[(rank_offset < rank) & (author_counts > count) ].index.tolist()
    
#     # Filter the dataframe based on the filtered authors
#     res_df = df_filtered[df_filtered['author'].isin(filtered_authors)]
    
#     return res_df

def filter_df_with_rank_all_years_and_pubdate(df, num_views, count, duration, rank, num_days):
    
    # 算总体的rank_offset 排名 Calculate the median rank offset and author counts
    rank_offset = df.groupby('author')['rank_offset'].median()
    
    # Filter the dataframe based on the year, number of views, and duration all at once
    df_filtered = df[(df['play'] > num_views) & (df['duration'] > duration)]
    
    latest_pubdate = pd.Timestamp.now() - pd.DateOffset(days=num_days)
    author_counts = df_filtered['author'].value_counts()
    
    # Filter authors based on rank offset and author counts
    filtered_authors = rank_offset[(rank_offset < rank) & (author_counts > count) & (df_filtered['pubdate'] >= latest_pubdate)].index.tolist()
    
    # Filter the dataframe based on the filtered authors
    res_df = df_filtered[df_filtered['author'].isin(filtered_authors)]
    
    return res_df


In [341]:
def filter_df_with_rank_all_years(df, num_views, count, duration, rank):
    
    # 算总体的rank_offset 排名 Calculate the median rank offset and author counts
    rank_offset = df.groupby('author')['rank_offset'].median()
    
    # Filter the dataframe based on the year, number of views, and duration all at once
    df_filtered = df[(df['play'] > num_views) & (df['duration'] > duration)]
    
    author_counts = df_filtered['author'].value_counts()
    
    # Filter authors based on rank offset and author counts
    filtered_authors = rank_offset[(rank_offset < rank) & (author_counts > count)].index.tolist()
    
    # Filter the dataframe based on the filtered authors
    res_df = df_filtered[df_filtered['author'].isin(filtered_authors)]
    
    return res_df

In [445]:
play_100k_2_authors_r500_tt = filter_df_with_rank_all_years(df_all_cc, 100000, 2, 300, 500)

In [449]:
play_100k_2_authors_r600_tt = filter_df_with_rank_all_years(df_all_cc, 100000, 2, 300, 600)

In [461]:
# 计算每个作者的视频中位数时长
median_durations = play_100k_2_authors_r700_tt.groupby('author')['duration'].median()

# 找出中位数时长大于300的作者
authors_with_median_duration_over_300 = median_durations[median_durations > 300].index

# 过滤原始DataFrame，只保留这些作者的视频
play_100k_2_authors_r700_tt_filtered = play_100k_2_authors_r700_tt[play_100k_2_authors_r700_tt['author'].isin(authors_with_median_duration_over_300)]


In [192]:
filter_tags_dict_top_3 = top_n_tags(df_sorted_tag_counts, 3)

In [463]:
len(play_100k_2_authors_r700_tt['author'].unique())

13087

In [424]:
play_100k_2_authors_r500_365

Unnamed: 0,bvid,arcurl,title,pubdate,rank_offset,author,mid,play,review,video_review,favorites,duration,tag,description,cate_id,ratio_favorites,ratio_review,ratio_danmu


In [330]:
play_100k_2_authors_r500_new

Unnamed: 0,bvid,arcurl,title,pubdate,rank_offset,author,mid,play,review,video_review,favorites,duration,tag,description,cate_id,ratio_favorites,ratio_review,ratio_danmu


In [333]:
play_100k_2_authors_r500

Unnamed: 0,bvid,arcurl,title,pubdate,rank_offset,author,mid,play,review,video_review,favorites,duration,tag,description,cate_id,ratio_favorites,ratio_review,ratio_danmu
2,BV1vZ4y1f7MU,http://www.bilibili.com/video/av380730964,缸垫渗油严重？防撞梁覆盖率低？理想ONE拆解报告汇总！横评系列之《真十万公里长测》,2022-01-14 10:00:11,3,易车横评,702014294,776740,3248,4220,6766,965,"遇见热爱汽车的你,谁是你心中汽车评测N0.1,理想ONE,汽车,拆车,评测,拆解",在上一集的10万公里测试中，理想ONE的表现还是比较理想的，没想到拆解后，却变得非常不理想，...,227,0.008711,0.004182,0.005433
3,BV1iq4y1c7rH,http://www.bilibili.com/video/av550847490,1年卖上千台车的车贩子告诉你，哪些车千万不能买！,2022-01-18 17:48:50,4,小胡说车,456869112,656350,536,1118,651,325,"2021哔哩哔哩汽车春晚,2021我和爱车的这一年,奥迪,记录,知识,汽车,二手车",今天又是大家喜闻乐见的车贩子亏钱专场，快来看下哪些二手车最不值得买！,227,0.000992,0.000817,0.001703
4,BV1yq4y117L6,http://www.bilibili.com/video/av592788930,详细测评魏牌拿铁DHT,2022-01-05 12:00:17,5,38号车评中心官方账号,39736779,617919,4073,17406,2195,1546,"遇见热爱汽车的你,原创,长城,魏派,魏牌,魏,拿铁,suv,测评",这次38号测评了魏牌拿铁DHT，这款车在许多方面相比前代产品有了巨大的改进和提高。具体内容敬请观看,227,0.003552,0.006591,0.028169
6,BV1H5411Z7JT,http://www.bilibili.com/video/av466103346,国民SUV强强对决 哈弗H6对拆长安CS75 PLUS,2022-01-28 11:38:08,7,易车横评,702014294,550549,2102,2583,3392,824,"汽车,汽车测评,哈弗H6,长安CS75 PLUS,拆解,自制",在本集节目里我们将对两部完成拆解的车进行关键位置的比对，看看玄冥二老中谁的结构设计更安全，技...,227,0.006161,0.003818,0.004692
9,BV1SR4y1K7xu,http://www.bilibili.com/video/av338451912,看到这样的国产车，真好【吉利 星越L】,2022-01-26 21:20:12,10,大家车言论,36044181,449744,1755,2929,1609,794,"2021哔哩哔哩汽车春晚,2021我和爱车的这一年,汽车,吉利,星越L,SUV,买车,新车,...",如果没有亲自开上这部车，你大概率会怀疑我们对它的赞誉：这不仅是吉利最高水准的作品，说它是同级...,227,0.003578,0.003902,0.006513
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2283518,BV1P5411T7WR,http://www.bilibili.com/video/av461237288,欧洲杯大圣归来！贝尔还能再强一次吗？贝尔中文纪录片《大圣降临》,2021-06-18 19:26:21,235,足球记忆,643958569,119409,251,778,1250,853,"运动点评官,足球,Hi欧洲杯,运动,贝尔,纪录片,热刺,皇马,西甲,英超",更多精彩内容请关注足球记忆！,249,0.010468,0.002102,0.006515
2283520,BV1J64y197VR,http://www.bilibili.com/video/av758792472,每周足坛电子厂！莫拉塔气笑恩里克，塞梅多坑死桑托斯,2021-06-22 10:00:02,237,爱穿小脚裤的花轮,436036217,117121,124,310,303,598,"足球,C罗,欧洲杯,电子厂,莫拉塔,塞梅多,马绍尔,拉莫斯,博格巴,可口可乐",本期BGM：\nTangled Up In Me\nNo Matter\nUndress R...,249,0.002587,0.001059,0.002647
2283529,BV1V44y1z7tD,http://www.bilibili.com/video/av973801160,皇马梦寐以求的男人！欧洲杯天才能否爆发？姆巴佩纪录片《火箭少年》,2021-06-25 19:30:07,246,足球记忆,643958569,111816,344,561,572,550,"运动点评官,足球,欧洲杯,hi欧洲杯,姆巴佩,巴黎,巴黎圣日耳曼,法甲,世界杯",更多精彩内容请关注足球记忆！,249,0.005116,0.003076,0.005017
2283533,BV14y4y1379A,http://www.bilibili.com/video/av803429088,瓜迪奥拉为何使用无腰阵？欧冠决赛 切尔西v曼城比赛分析,2021-06-05 17:32:34,250,愤怒保罗,506831611,108823,508,924,973,1239,"运动点评官,切尔西,图赫尔,欧冠决赛,曼城,瓜迪奥拉,足球,梅西,C罗,巴萨",本影片是2021赛季欧冠决赛的比赛分析\n\n一共分为两部分，第一部分是比赛过程的回顾。第二...,249,0.008941,0.004668,0.008491


In [None]:
import pandas as pd
from datetime import datetime, timedelta

# Assuming df is your dataframe
df['pubdate'] = pd.to_datetime(df['pubdate'])

# Get the date 180 days ago from today
date_180_days_ago = datetime.now() - timedelta(days=180)

# Filter the dataframe
df_filtered = df[df['pubdate'] > date_180_days_ago]

df_filtered

In [352]:
from datetime import datetime, timedelta


date_180_days_ago = datetime.now() - timedelta(days=365)
# Filter the authors whose most recent video is within the last 180 days
recent_authors = play_100k_2_authors_r500[play_100k_2_authors_r500['pubdate'] > date_180_days_ago]['author']

# Finally, select all videos from these authors in the original dataframe
play_100k_2_authors_r500_new = play_100k_2_authors_r500[play_100k_2_authors_r500['author'].isin(recent_authors)]

play_100k_2_authors_r500_new


Unnamed: 0,bvid,arcurl,title,pubdate,rank_offset,author,mid,play,review,video_review,favorites,duration,tag,description,cate_id,ratio_favorites,ratio_review,ratio_danmu
2,BV1vZ4y1f7MU,http://www.bilibili.com/video/av380730964,缸垫渗油严重？防撞梁覆盖率低？理想ONE拆解报告汇总！横评系列之《真十万公里长测》,2022-01-14 10:00:11,3,易车横评,702014294,776740,3248,4220,6766,965,"遇见热爱汽车的你,谁是你心中汽车评测N0.1,理想ONE,汽车,拆车,评测,拆解",在上一集的10万公里测试中，理想ONE的表现还是比较理想的，没想到拆解后，却变得非常不理想，...,227,0.008711,0.004182,0.005433
3,BV1iq4y1c7rH,http://www.bilibili.com/video/av550847490,1年卖上千台车的车贩子告诉你，哪些车千万不能买！,2022-01-18 17:48:50,4,小胡说车,456869112,656350,536,1118,651,325,"2021哔哩哔哩汽车春晚,2021我和爱车的这一年,奥迪,记录,知识,汽车,二手车",今天又是大家喜闻乐见的车贩子亏钱专场，快来看下哪些二手车最不值得买！,227,0.000992,0.000817,0.001703
4,BV1yq4y117L6,http://www.bilibili.com/video/av592788930,详细测评魏牌拿铁DHT,2022-01-05 12:00:17,5,38号车评中心官方账号,39736779,617919,4073,17406,2195,1546,"遇见热爱汽车的你,原创,长城,魏派,魏牌,魏,拿铁,suv,测评",这次38号测评了魏牌拿铁DHT，这款车在许多方面相比前代产品有了巨大的改进和提高。具体内容敬请观看,227,0.003552,0.006591,0.028169
6,BV1H5411Z7JT,http://www.bilibili.com/video/av466103346,国民SUV强强对决 哈弗H6对拆长安CS75 PLUS,2022-01-28 11:38:08,7,易车横评,702014294,550549,2102,2583,3392,824,"汽车,汽车测评,哈弗H6,长安CS75 PLUS,拆解,自制",在本集节目里我们将对两部完成拆解的车进行关键位置的比对，看看玄冥二老中谁的结构设计更安全，技...,227,0.006161,0.003818,0.004692
9,BV1SR4y1K7xu,http://www.bilibili.com/video/av338451912,看到这样的国产车，真好【吉利 星越L】,2022-01-26 21:20:12,10,大家车言论,36044181,449744,1755,2929,1609,794,"2021哔哩哔哩汽车春晚,2021我和爱车的这一年,汽车,吉利,星越L,SUV,买车,新车,...",如果没有亲自开上这部车，你大概率会怀疑我们对它的赞誉：这不仅是吉利最高水准的作品，说它是同级...,227,0.003578,0.003902,0.006513
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2283518,BV1P5411T7WR,http://www.bilibili.com/video/av461237288,欧洲杯大圣归来！贝尔还能再强一次吗？贝尔中文纪录片《大圣降临》,2021-06-18 19:26:21,235,足球记忆,643958569,119409,251,778,1250,853,"运动点评官,足球,Hi欧洲杯,运动,贝尔,纪录片,热刺,皇马,西甲,英超",更多精彩内容请关注足球记忆！,249,0.010468,0.002102,0.006515
2283520,BV1J64y197VR,http://www.bilibili.com/video/av758792472,每周足坛电子厂！莫拉塔气笑恩里克，塞梅多坑死桑托斯,2021-06-22 10:00:02,237,爱穿小脚裤的花轮,436036217,117121,124,310,303,598,"足球,C罗,欧洲杯,电子厂,莫拉塔,塞梅多,马绍尔,拉莫斯,博格巴,可口可乐",本期BGM：\nTangled Up In Me\nNo Matter\nUndress R...,249,0.002587,0.001059,0.002647
2283529,BV1V44y1z7tD,http://www.bilibili.com/video/av973801160,皇马梦寐以求的男人！欧洲杯天才能否爆发？姆巴佩纪录片《火箭少年》,2021-06-25 19:30:07,246,足球记忆,643958569,111816,344,561,572,550,"运动点评官,足球,欧洲杯,hi欧洲杯,姆巴佩,巴黎,巴黎圣日耳曼,法甲,世界杯",更多精彩内容请关注足球记忆！,249,0.005116,0.003076,0.005017
2283533,BV14y4y1379A,http://www.bilibili.com/video/av803429088,瓜迪奥拉为何使用无腰阵？欧冠决赛 切尔西v曼城比赛分析,2021-06-05 17:32:34,250,愤怒保罗,506831611,108823,508,924,973,1239,"运动点评官,切尔西,图赫尔,欧冠决赛,曼城,瓜迪奥拉,足球,梅西,C罗,巴萨",本影片是2021赛季欧冠决赛的比赛分析\n\n一共分为两部分，第一部分是比赛过程的回顾。第二...,249,0.008941,0.004668,0.008491


In [353]:
from datetime import datetime, timedelta
date_180_days_ago = datetime.now() - timedelta(days=270)
# Filter the authors whose most recent video is within the last 180 days
recent_authors = play_100k_2_authors_r500[play_100k_2_authors_r500['pubdate'] > date_180_days_ago]['author']

In [354]:
len(play_100k_2_authors_r500['author'].unique())

8062

In [355]:
len(play_100k_2_authors_r500_new['author'].unique())

5611

In [356]:
run_all_steps(play_100k_2_authors_r500_new, filter_tags_dict_top_3, 'res-play_100k_2_authors_r500_new.json')

结果已保存到 res-play_100k_2_authors_r500_new.json 文件中


In [359]:
run_all_steps(play_100k_2_authors_r500_new, filter_tags_dict_top_3, 'res-play_100k_2_authors_r500_new_30.json')

结果已保存到 res-play_100k_2_authors_r500_new_30.json 文件中


In [408]:
play_100k_2_authors_r500['mid'].unique()

array([702014294, 456869112, 39736779, ..., 680200136, 325301815,
       328351154], dtype=object)

In [409]:
type(play_100k_2_authors_r500['mid'].unique())

numpy.ndarray

In [537]:
type(play_100k_2_authors_r500_new)

pandas.core.frame.DataFrame

In [538]:
play_100k_2_authors_r500_new.head()

Unnamed: 0,bvid,arcurl,title,pubdate,rank_offset,author,mid,play,review,video_review,favorites,duration,tag,description,cate_id,ratio_favorites,ratio_review,ratio_danmu
2,BV1vZ4y1f7MU,http://www.bilibili.com/video/av380730964,缸垫渗油严重？防撞梁覆盖率低？理想ONE拆解报告汇总！横评系列之《真十万公里长测》,2022-01-14 10:00:11,3,易车横评,702014294,776740,3248,4220,6766,965,"遇见热爱汽车的你,谁是你心中汽车评测N0.1,理想ONE,汽车,拆车,评测,拆解",在上一集的10万公里测试中，理想ONE的表现还是比较理想的，没想到拆解后，却变得非常不理想，...,227,0.008711,0.004182,0.005433
3,BV1iq4y1c7rH,http://www.bilibili.com/video/av550847490,1年卖上千台车的车贩子告诉你，哪些车千万不能买！,2022-01-18 17:48:50,4,小胡说车,456869112,656350,536,1118,651,325,"2021哔哩哔哩汽车春晚,2021我和爱车的这一年,奥迪,记录,知识,汽车,二手车",今天又是大家喜闻乐见的车贩子亏钱专场，快来看下哪些二手车最不值得买！,227,0.000992,0.000817,0.001703
4,BV1yq4y117L6,http://www.bilibili.com/video/av592788930,详细测评魏牌拿铁DHT,2022-01-05 12:00:17,5,38号车评中心官方账号,39736779,617919,4073,17406,2195,1546,"遇见热爱汽车的你,原创,长城,魏派,魏牌,魏,拿铁,suv,测评",这次38号测评了魏牌拿铁DHT，这款车在许多方面相比前代产品有了巨大的改进和提高。具体内容敬请观看,227,0.003552,0.006591,0.028169
6,BV1H5411Z7JT,http://www.bilibili.com/video/av466103346,国民SUV强强对决 哈弗H6对拆长安CS75 PLUS,2022-01-28 11:38:08,7,易车横评,702014294,550549,2102,2583,3392,824,"汽车,汽车测评,哈弗H6,长安CS75 PLUS,拆解,自制",在本集节目里我们将对两部完成拆解的车进行关键位置的比对，看看玄冥二老中谁的结构设计更安全，技...,227,0.006161,0.003818,0.004692
9,BV1SR4y1K7xu,http://www.bilibili.com/video/av338451912,看到这样的国产车，真好【吉利 星越L】,2022-01-26 21:20:12,10,大家车言论,36044181,449744,1755,2929,1609,794,"2021哔哩哔哩汽车春晚,2021我和爱车的这一年,汽车,吉利,星越L,SUV,买车,新车,...",如果没有亲自开上这部车，你大概率会怀疑我们对它的赞誉：这不仅是吉利最高水准的作品，说它是同级...,227,0.003578,0.003902,0.006513


In [539]:
df_user_info = pd.read_json("merged.json")

In [554]:
df_user_info['top_photo']

0       http://i0.hdslb.com/bfs/space/cb1c3ef50e22b609...
1       http://i0.hdslb.com/bfs/space/cb1c3ef50e22b609...
2       http://i2.hdslb.com/bfs/space/cb1c3ef50e22b609...
3       http://i2.hdslb.com/bfs/space/cb1c3ef50e22b609...
4       http://i2.hdslb.com/bfs/space/cb1c3ef50e22b609...
                              ...                        
9523                                                  NaN
9524                                                  NaN
9525                                                  NaN
9526                                                  NaN
9527                                                  NaN
Name: top_photo, Length: 9528, dtype: object

In [None]:
run_all_steps(play_100k_2_authors_r500, filter_tags_dict_top_3, 'res-100k_2_all_authors_r500.json')

In [542]:
play_100k_2_authors_r500.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142724 entries, 2 to 2283557
Data columns (total 18 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   bvid             142724 non-null  object        
 1   arcurl           142724 non-null  object        
 2   title            142724 non-null  object        
 3   pubdate          142581 non-null  datetime64[ns]
 4   rank_offset      142724 non-null  object        
 5   author           142724 non-null  object        
 6   mid              142724 non-null  object        
 7   play             142724 non-null  int64         
 8   review           142724 non-null  int64         
 9   video_review     142724 non-null  object        
 10  favorites        142724 non-null  int64         
 11  duration         142724 non-null  int64         
 12  tag              142714 non-null  object        
 13  description      124787 non-null  object        
 14  cate_id          14

In [None]:
def run_all_steps(df, filter_tags_dict, res_file):
    # 添加cate_ids
    df = add_unique_columns(df)
    # 将df中author归类
    tag_group_df = combine_tags_with_author(df)
    # 高频tag去重
    remove_top_tags(tag_group_df, filter_tags_dict)
    # 找相似并保存结果
    find_similar_authors(tag_group_df, res_file)

In [553]:
filter_tags_dict_top_3

{'121': ['GMV', '古风', '网络游戏'],
 '122': ['技能提升营', '经验分享', '野生技术协会'],
 '124': ['社会', '社会观察局', '热点'],
 '126': ['搞笑', '鬼畜调教', '人力VOCALOID'],
 '130': ['说唱', '音乐分享官', '音乐'],
 '136': ['音游', 'FNF', '周五夜放克'],
 '137': ['明星', '娱乐', '主播'],
 '138': ['搞笑', '搞笑研究所', '沙雕'],
 '152': ['LOVE LIVE!', 'Liella!', '日本'],
 '153': ['中国大陆', '普通话', '热血'],
 '154': ['跳舞', '舞蹈', '搞笑'],
 '156': ['舞蹈教程', '舞蹈教学', '分解教学'],
 '157': ['美妆', '化妆', '时尚'],
 '158': ['服饰', '时尚', '种草'],
 '159': ['时尚', '潮流', '服饰'],
 '161': ['手工', '一起做手工吧！', '手工制作'],
 '162': ['绘画', '画画', '手绘'],
 '164': ['健身', '减肥', '减脂'],
 '168': ['漫画解说', '漫画', '热血'],
 '17': ['单机游戏', '万物皆可游戏', '我的世界'],
 '170': ['中国大陆', '普通话', '战斗'],
 '171': ['电子竞技', '英雄联盟', '王者荣耀'],
 '172': ['原神UP主激励计划', '原神', '手游'],
 '173': ['桌游棋牌', '三国杀', '卡牌游戏'],
 '174': ['助眠', '放松', '触发音'],
 '176': ['汽车', '交通事故', '交通安全'],
 '178': ['英语', '自然', '美国'],
 '179': ['军事', '星海计划', '俄罗斯'],
 '182': ['UP影剧综指南', '电影解说', '影评杂谈'],
 '183': ['影视剪辑', '黑色幽默', '沈腾'],
 '184': ['预告片', '泰版花样男子', '泰版流星花园'],
 '187': 

In [552]:
play_100k_2_2022_authors_tag_group

Unnamed: 0,author,cate_ids,mid,tag
0,--圈圈--,"[176, 17, 210, 95]",8784855,"自驾,跑车,开箱,自制,盲盒,汽车,生活,游戏,地平线,单机,自制,MINECRAFT,游戏..."
1,--小卢同学--,"[21, 212, 138, 157, 76]",403366542,"生活万花筒·闪闪发光的日常,我好爱我的生活,烧烤,生活记录,吃货,美食,操作,睡前,常规,情..."
2,--無幻--,[130],108856872,"华语乐坛,精选歌单,经典歌曲,流行音乐,MV,怀旧,听歌,青春,音乐选集,经典,精选歌单,经..."
3,-Emma-Z-,"[198, 199, 158, 29]",472409848,"alienz,kiss me more,doja cat,街头女战士,ygx,Emma,ki..."
4,-LKs-,"[218, 176, 250, 21, 230, 246, 30, 209, 124, 23...",125526,"跟着UP主创作吧（第三期）,今天你云吸猫了吗？,英短,生活,自制,学习,VLOG,宠物,猫,..."
...,...,...,...,...
7429,龙跃文,[157],170292658,"颜值逆袭挑战,跟着B站UP主学化妆,美妆,化妆,时尚,化妆教程,寻找全国最牛Tony老师,染..."
7430,龙飞律师,[124],527145352,"社会观察局,说说心理话,心理,同性,爱情,情感,伴侣,情感,谈恋爱,社会观察局,说说心理话,..."
7431,龙馍馍,[220],11164088,"萌宠星探官·百人百宠大会,动物圈,可爱,搞笑,喵星人,汪星人,沙雕动物,动物迷惑行为,打卡挑..."
7432,龟神小鹏,[238],1959855103,"钓鱼人的快乐时光,野钓,钓鱼,钓鱼技巧,鱼获满满,我在B站做UP主,野钓,钓鱼,垂钓,清道夫..."


In [555]:
sub_df_user_info = df_user_info[['mid','face','sign','top_photo']]

In [556]:
sub_df_user_info.head()

Unnamed: 0,mid,face,sign,top_photo
0,320204881.0,https://i1.hdslb.com/bfs/face/686650b60e708374...,我是11 谢谢关注！课程咨询eleven3003,http://i0.hdslb.com/bfs/space/cb1c3ef50e22b609...
1,480670986.0,https://i0.hdslb.com/bfs/face/ec3d904a31068a27...,热爱就是在没必要的地方下功夫,http://i0.hdslb.com/bfs/space/cb1c3ef50e22b609...
2,70093.0,https://i1.hdslb.com/bfs/face/d0654b294fe1006b...,weibo@12dora,http://i2.hdslb.com/bfs/space/cb1c3ef50e22b609...
3,84503351.0,https://i2.hdslb.com/bfs/face/ab258a21dcf93ec8...,不定时更新折纸教程，wx: zhezhide12，QQ交流群:2群：868109749,http://i2.hdslb.com/bfs/space/cb1c3ef50e22b609...
4,252996500.0,https://i2.hdslb.com/bfs/face/5c35668c1aff9481...,搞笑整活娱乐游戏up主，希望各位看的开心哦!,http://i2.hdslb.com/bfs/space/cb1c3ef50e22b609...


In [560]:
tag_group_with_user_info = pd.merge(play_100k_2_2022_authors_tag_group, sub_df_user_info, on='mid', how='left')

In [562]:
remove_top_tags(tag_group_with_user_info, filter_tags_dict_top_3)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json

def find_similar_authors(df, res_file):
    
    # 将逗号分隔的标签转换为空格分隔的标签
    df['filtered_tags'] = df['filtered_tags'].apply(lambda x: ' '.join(str(x).split(',')))

    # 使用TF-IDF计算标签的权重
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['filtered_tags'])

    # 计算标签之间的余弦相似度
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # 为每个作者找到最相似的其他作者
    results = {}
    for i in range(len(df)):
        similar_indices = cosine_sim[i].argsort()[:-33:-1]  # 获取最相似的10个作者的索引
        similar_items = [(cosine_sim[i][j], df['author'][j], int(df['mid'][j])) for j in similar_indices]  # 获取相似度、作者名称和mid
        similar_items = sorted(similar_items, key=lambda x: x[0], reverse=True)  # 根据相似度排序
        results[df['author'][i]] = [{'author': author, 'mid': mid, 'similarity': similarity} for similarity, author, mid in similar_items[1:]]

    # 将结果保存到 JSON 文件
    with open(res_file, 'w') as f:
        json.dump(results, f, indent=4)

    # 打印保存成功信息
    print('结果已保存到', res_file, '文件中')