In [7]:
import datetime
import os
import warnings

import numpy as np
import pandas as pd
import requests
from IPython.display import clear_output
from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100)

In [8]:
# First value sent as the `since` parameter when paging through the beatmap API.
start_date = '2007-10-06'
# Upper bound of the crawl. It is compared as a plain string against
# 'YYYY-MM-DD HH:MM:SS' approval timestamps, so maps approved on this calendar day
# (other than an exact match of the bare date) are filtered out.
end_date = '2021-02-17'

### API抓資料

In [19]:
# Download every osu!standard (m=0) beatmap approved between start_date and end_date.
# Each request returns one page of maps approved since `since_date`; the next page
# starts from the calendar day of the last map received. Maps from that day are
# fetched twice and de-duplicated afterwards.
# NOTE(review): this relies on the API returning maps in ascending approval order.
OSU_BEATMAPS_URL = 'https://osu.ppy.sh/api/get_beatmaps'
# FIXME(security): this key is committed in the notebook. Rotate it, export the new
# one as OSU_API_KEY and delete the literal fallback.
OSU_API_KEY = os.environ.get('OSU_API_KEY', '13a36d70fd32e2f87fd2a7a89e4f52d54ab337a1')

beatmap_records = []
paging_stalled_at = None
since_date = start_date
while since_date < end_date:
    response = requests.get(OSU_BEATMAPS_URL,
                            params={'k': OSU_API_KEY, 'm': 0, 'since': since_date},
                            timeout=30)
    response.raise_for_status()
    get_beatmaps = response.json()
    if not isinstance(get_beatmaps, list):
        # Error payloads are not lists of beatmaps; fail loudly instead of iterating them.
        raise RuntimeError('Unexpected get_beatmaps response: ' + repr(get_beatmaps))
    if not get_beatmaps:
        # Nothing newer than since_date exists: the crawl is complete.
        break
    beatmap_records.extend(get_beatmaps)

    last_date = get_beatmaps[-1]['approved_date']
    next_since_date = datetime.datetime.strptime(last_date, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
    if next_since_date <= since_date:
        # The page did not move the cursor forward (end of data, or a whole page
        # approved on a single day). Asking again would return the same page
        # forever, so stop here and report it below.
        paging_stalled_at = since_date
        break
    since_date = next_since_date
    clear_output(wait=True)
    print(since_date)

if not beatmap_records:
    raise RuntimeError('The osu! API returned no beatmaps since ' + start_date)

df_api = pd.DataFrame(beatmap_records)
df_api = df_api.drop_duplicates(['beatmapset_id','beatmap_id'], keep='first')  # overlapping pages repeat maps
# String comparison against 'YYYY-MM-DD HH:MM:SS' values: keeps maps approved before end_date's day.
df_api = df_api.loc[df_api.approved_date <= end_date]
clear_output()

beatmapset_counts = df_api.beatmapset_id.nunique()
first_ranked_date = df_api.approved_date.iloc[0]
last_ranked_date = df_api.approved_date.iloc[-1]

print('Done!')
print(str(beatmapset_counts)+' mapsets ')
print(str(first_ranked_date) + ' to ' + str(last_ranked_date))
if paging_stalled_at is not None:
    print('Note: paging stopped at ' + paging_stalled_at + ' (the API returned no later approval date)')

Done!
21388 mapsets 
2007-10-06 17:46:31 to 2021-02-16 23:02:02


### 整理

In [52]:
# Ranked ('1') and loved ('4') beatmaps only. `.copy()` makes an independent frame so
# the column assignments below never write into a slice of df_api.
df_rank_map = df_api.loc[df_api.approved.isin(['1', '4'])].copy()
df_rank_map['status'] = df_rank_map.approved.map({'1':'Rank','4':'Love'})

print('clean data....')


def star_rating_to_difficulty(star_rating):
    """Translate a star rating (number or numeric string) into a difficulty name.

    Bands are half-open: [0, 2) Easy, [2, 2.7) Normal, [2.7, 4) Hard,
    [4, 5.3) Insane, [5.3, 6.5) Expert, and everything else Expert+.
    """
    stars = float(star_rating)
    if stars < 2:
        return 'Easy'
    if stars < 2.7:
        return 'Normal'
    if stars < 4:
        return 'Hard'
    if stars < 5.3:
        return 'Insane'
    if stars < 6.5:
        return 'Expert'
    return 'Expert+'


# Difficulty band
df_rank_map['difficulty_rating'] = df_rank_map['difficultyrating'].apply(star_rating_to_difficulty)

# genre_id -> name, following the osu! API v1 enumeration (there is no genre 8).
GENRE_NAMES = {'1':'Unspecified',
               '2':'Video Game',
               '3':'Anime',
               '4':'Rock',
               '5':'Pop',
               '6':'Other',
               '7':'Novelty',
               '9':'Hip Hop',
               '10':'Electronic',
               '11':'Metal',
               '12':'Classical',
               '13':'Folk',
               '14':'Jazz'}

# language_id -> name, following the osu! API v1 enumeration.
LANGUAGE_NAMES = {'1':'Unspecified',
                  '2':'English',
                  '3':'Japanese',
                  '4':'Chinese',
                  '5':'Instrumental',
                  '6':'Korean',
                  '7':'French',
                  '8':'German',
                  '9':'Swedish',
                  '10':'Spanish',
                  '11':'Italian',
                  '12':'Russian',
                  '13':'Polish',
                  '14':'Other'}

# Unknown or missing ids become 'Unspecified'.
df_rank_map['genre_id'] = df_rank_map['genre_id'].map(GENRE_NAMES).fillna('Unspecified')
df_rank_map['language_id'] = df_rank_map['language_id'].map(LANGUAGE_NAMES).fillna('Unspecified')

# Fall back to the romanised artist/title when the unicode variant is missing.
df_rank_map['artist_unicode'] = df_rank_map['artist_unicode'].fillna(df_rank_map['artist'])
df_rank_map['title_unicode'] = df_rank_map['title_unicode'].fillna(df_rank_map['title'])

# Columns needed for the overview
df_rank_map_overview = df_rank_map[['beatmapset_id','beatmap_id',
                                    'genre_id','language_id','status',
                                    'title_unicode','artist_unicode',
                                    'approved_date',
                                    'version','difficulty_rating',
                                    'creator_id',
                                    'favourite_count','playcount']]
# Column dtypes (the API delivers these values as strings)
df_rank_map_overview = df_rank_map_overview.astype({'beatmapset_id':'int64',
                                                    'favourite_count':'int64',
                                                    'playcount':'int64'})
df_rank_map_overview['approved_date'] = pd.to_datetime(df_rank_map_overview['approved_date'], format='%Y-%m-%d %H:%M:%S')

# One row per mapset: count the difficulties, sum the play counts, take the minimum of the rest.
df_rank_mapset_overview = df_rank_map_overview.groupby('beatmapset_id').agg({'beatmap_id':'count',
                                                                             'status':'min',
                                                                             'genre_id':'min',
                                                                             'language_id':'min',
                                                                             'title_unicode':'min',
                                                                             'artist_unicode':'min',
                                                                             'creator_id':'min',
                                                                             'approved_date':'min',
                                                                             'favourite_count':'min',
                                                                             'playcount':'sum'}).reset_index(drop=False)

clean data....


In [None]:
# Look up each mapper's country through the API and normalise the username
def get_country_username(creator_id):
    """Fetch a mapper's country code and current username from the osu! API v1.

    Parameters
    ----------
    creator_id : str or int
        osu! user id of the mapper. It is sent with ``type=id`` so that an
        all-digit username can never be mistaken for the id.

    Returns
    -------
    list
        ``[country, username]`` on success, ``[np.nan, np.nan]`` when the request
        fails or the API has no record for the id (best-effort lookup).
    """
    try:
        response = requests.get(
            'https://osu.ppy.sh/api/get_user',
            params={
                # FIXME(security): this key is committed in the notebook. Rotate it,
                # export the new one as OSU_API_KEY and delete the literal fallback.
                'k': os.environ.get('OSU_API_KEY', '13a36d70fd32e2f87fd2a7a89e4f52d54ab337a1'),
                'u': creator_id,
                'type': 'id',
            },
            timeout=30,
        )
        response.raise_for_status()
        q = response.json()[0]
        return [q.get('country'), q.get('username')]
    except (requests.RequestException, ValueError, LookupError, AttributeError):
        # Network/HTTP failure, undecodable JSON, empty result list or an unexpected
        # payload shape: treat the mapper as unknown. Anything else (KeyboardInterrupt,
        # programming errors) is deliberately allowed to propagate.
        return [np.nan, np.nan]


print('add country and username ....')
# One API lookup per distinct mapper, in order of first appearance.
creator_id_list = df_rank_mapset_overview['creator_id'].drop_duplicates(keep='first').to_list()
country_username_list = [get_country_username(mapper_id) for mapper_id in tqdm(creator_id_list)]

# Lookup table: creator_id | country | mapper
df_country_username = pd.DataFrame(country_username_list, columns=['country','mapper'])
df_country = pd.concat([pd.DataFrame({'creator_id': creator_id_list}), df_country_username], axis=1)

# Attach country and username to every mapset of that mapper.
df_rank_mapset_overview = df_rank_mapset_overview.merge(df_country, how='left', on='creator_id')

### Done

In [57]:
# Final, reader-friendly column names
final_column_names = {
    'beatmap_id': 'beatmap_count',
    'genre_id': 'genre',
    'language_id': 'language',
    'title_unicode': 'title',
    'artist_unicode': 'artist',
    'creator_id': 'mapper_id',
}
df_rank_mapset_overview = df_rank_mapset_overview.rename(columns=final_column_names)

# Thumbnail URL columns (currently disabled)
#df_rank_mapset_overview['beatmap_thumbnail'] = df_rank_mapset_overview['beatmapset_id'].apply(lambda x: 'https://b.ppy.sh/thumb/'+str(x)+'l.jpg') 
#df_rank_mapset_overview['mapper_thumbnail'] = df_rank_mapset_overview['mapper_id'].apply(lambda x: 'http://s.ppy.sh/a/'+str(x))

# Column order of the exported table ('beatmap_thumbnail' and 'mapper_thumbnail' are left out)
export_columns = [
    'beatmapset_id', 'beatmap_count', 'title', 'artist',
    'mapper', 'mapper_id', 'approved_date', 'favourite_count', 'playcount',
    'genre', 'language', 'country', 'status',
]
df_rank_mapset_overview = df_rank_mapset_overview[export_columns]
df_rank_mapset_overview

Unnamed: 0,beatmapset_id,beatmap_count,title,artist,mapper,mapper_id,approved_date,favourite_count,playcount,genre,language,country,status
0,1,1,DISCO PRINCE,Kenji Ninuma,peppy,2,2007-10-06 17:46:31,801,477176,Video Game,Japanese,AU,Rank
1,3,4,"1,2,3,4, 007 [Wipeout Series]",Ni-Ni,141,141,2007-10-06 19:32:02,147,509880,Video Game,English,FR,Rank
2,16,1,Love Fighter,Brandy,FFFanatic,677,2007-10-07 13:19:10,69,111375,Video Game,Korean,SG,Rank
3,18,1,Scatman,Scatman John,Extor,555,2007-10-07 22:16:55,698,2048207,Pop,English,AR,Rank
4,23,1,Pop Star,Ken Hirai,chan,94,2007-10-07 22:37:19,81,196928,Pop,Japanese,AU,Rank
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21074,1358581,1,Chasing Cars,Snow Patrol,- Heatwave -,4166621,2021-02-11 05:05:31,28,8335,Rock,English,AU,Rank
21075,1362482,3,大好きだよ大好きだよ 生まれてきてありがとう (TV Size),栗原 雪 (CV: 加隈亜衣) & 桃月 心也 (CV: 岡本信彦),Fia,1269067,2021-02-10 10:46:08,31,16500,Anime,Japanese,CN,Rank
21076,1365122,4,夏の終わり (Cut Ver.),SECONDWALL,Seolv,8067876,2021-02-15 22:44:38,31,19498,Rock,Japanese,RS,Rank
21077,1365314,5,麻痺 (TV Size),yama,Smoke,10726630,2021-02-12 22:06:31,61,64013,Rock,Japanese,US,Rank


In [58]:
# Write the overview table; 'utf_8_sig' prepends a BOM to the UTF-8 output.
overview_csv_path = 'Mapping_Overview_20210218.csv'
df_rank_mapset_overview.to_csv(overview_csv_path, encoding='utf_8_sig', index=False)

### 其他統計

In [None]:
# Per-difficulty columns used by the extra statistics, cast from the API's strings.
other_columns = ['beatmapset_id', 'beatmap_id', 'status', 'title_unicode', 'artist_unicode',
                 'version', 'creator_id', 'creator', 'hit_length', 'bpm', 'storyboard',
                 'diff_size', 'count_normal', 'count_slider', 'count_spinner']
other_dtypes = {'bpm': 'float64',
                'hit_length': 'int64',
                'storyboard': 'int64',
                'diff_size': 'float64',
                'count_normal': 'int64',
                'count_slider': 'int64',
                'count_spinner': 'int64'}
df_other = df_rank_map[other_columns].astype(other_dtypes)

# Stored as max_combo: the sum of the three object counts.
df_other['max_combo'] = df_other['count_normal'].add(df_other['count_slider']).add(df_other['count_spinner'])


def _length_class(seconds):
    """Bucket a hit_length in seconds: under 99, under 209, under 300, or longer."""
    if seconds < 99:
        return '< 1:39'
    if seconds < 209:
        return '1:39 ~ 3:29'
    if seconds < 300:
        return '3:29 ~ 5:00'
    return '> 5:00'


# Both length columns are derived from hit_length.
df_other['total_length_class'] = df_other['hit_length'].map(_length_class)
df_other['total_length'] = df_other['hit_length'].map(lambda seconds: "%02d:%02d" % divmod(seconds, 60))

In [None]:
# One row per mapset: count the difficulties, sum the object counts, take the maximum of the rest.
df_other_mapset = df_other.groupby('beatmapset_id').agg({'beatmap_id':'count',
                                       'status':'max',
                                       'title_unicode':'max',
                                       'artist_unicode':'max',
                                       'creator_id':'max',
                                       'creator':'max',
                                       'bpm':'max',
                                       'storyboard':'max',
                                       'count_normal':'sum',
                                       'count_slider':'sum',
                                       'count_spinner':'sum'}).reset_index(drop=False)

# The length labels are strings, so a plain 'max' would compare them alphabetically
# ('< 1:39' sorts above '3:29 ~ 5:00', and "100:00" below "99:59"). Take both labels
# from the difficulty with the largest hit_length in each mapset instead.
df_longest_diff = (df_other
                   .sort_values('hit_length', ascending=False, kind='mergesort')
                   .drop_duplicates('beatmapset_id', keep='first')
                   [['beatmapset_id', 'total_length_class', 'total_length']])
df_other_mapset = df_other_mapset.merge(df_longest_diff, how='left', on='beatmapset_id')
df_other_mapset

In [None]:
# Difficulties whose circle size (diff_size) is exactly 10
cs10_columns = ['beatmapset_id', 'beatmap_id', 'title_unicode', 'artist_unicode',
                'version', 'creator_id', 'creator']
df_other.loc[df_other['diff_size'] == 10, cs10_columns]

In [None]:
# Difficulty with the highest BPM
bpm_columns = ['beatmapset_id', 'title_unicode', 'artist_unicode', 'creator_id', 'creator', 'bpm']
df_other.sort_values(by='bpm', ascending=False)[bpm_columns].head(1)

In [None]:
# Difficulty with the highest max_combo value
max_combo_columns = ['beatmapset_id', 'beatmap_id', 'title_unicode', 'artist_unicode', 'version',
                     'creator_id', 'creator', 'total_length_class', 'hit_length', 'max_combo']
df_other.sort_values(by='max_combo', ascending=False)[max_combo_columns].head(1)