In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
import requests
from IPython.display import clear_output
from tqdm.notebook import tqdm
import datetime

In [2]:
start_date = '2020-01-01'
end_date = '2021-01-01'

### API抓資料

In [3]:
df_api = pd.DataFrame()
since_date = start_date
while since_date <= end_date:
    
    get_beatmaps = requests.get('https://osu.ppy.sh/api/get_beatmaps?k=13a36d70fd32e2f87fd2a7a89e4f52d54ab337a1&m=0&since='+since_date).json()
    clear_output()
    for i in get_beatmaps:
        beatmap = pd.DataFrame.from_dict(i, orient='index').T
        df_api = df_api.append(beatmap)
    last_date = df_api.tail(1).approved_date.values[0]
    since_date = datetime.datetime.strptime(last_date, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
    print(since_date)
    
df_api = df_api.drop_duplicates(['beatmapset_id','beatmap_id'], keep='first') # 去重複
df_api = df_api.loc[df_api.approved_date <= end_date]  # 固定時間
clear_output()

beatmapset_counts = len(df_api.beatmapset_id.value_counts())
first_ranked_date = df_api.head(1).approved_date.values[0]
last_ranked_date = df_api.tail(1).approved_date.values[0]

print('Done!')
print(str(beatmapset_counts)+' mapsets ')
print(str(first_ranked_date) + ' to ' + str(last_ranked_date))

Done!
2339 mapsets 
2020-01-01 00:20:45 to 2020-12-31 20:02:17


### 整理

In [4]:
# Rank & Love
df_rank_map = df_api.loc[(df_api.approved=='1')|(df_api.approved=='4')]
df_rank_map['status'] = df_rank_map.approved.map({'1':'Rank','4':'Love'})
print(df_rank_map.status.value_counts())


print('clean data....')
# 難度
df_rank_map['difficulty_rating'] = df_rank_map['difficultyrating'].apply(lambda x:'Easy'   if float(x)<2              else(
                                                                                  'Normal' if float(x)>=2   and float(x)<2.7 else(
                                                                                  'Hard'   if float(x)>=2.7 and float(x)<4   else(
                                                                                  'Insane' if float(x)>=4   and float(x)<5.3 else(
                                                                                  'Expert' if float(x)>=5.3 and float(x)<6.5 else'Expert+')))))

# 類別ID轉名稱
df_rank_map['genre_id'] = df_rank_map.genre_id.map({'1':'Unspecified',
                                                    '2':'Video Game',
                                                    '3':'Anime',
                                                    '4':'Rock',
                                                    '5':'Pop',
                                                    '6':'Other',
                                                    '7':'Novelty',
                                                    '8':'Hip Hop',
                                                    '9':'Electronic',
                                                    '10':'Metal',
                                                    '11':'Classical',
                                                    '12':'Folk',
                                                    '13':'Jazz'})

# 語言ID轉名稱     #語言缺少 7、9、11、13 ('FrenchItalian'、'Swedish'、'Polish'、'Unspecified')
df_rank_map['language_id'] = df_rank_map.language_id.map({'2':'English',
                                                          '3':'Japanese',
                                                          '4':'Chinese',
                                                          '5':'Instrumental',
                                                          '6':'Korean',
                                                          '8':'German',
                                                          '10':'Spanish',
                                                          '12':'Russian',
                                                          '14':'Other'})



# 指定所需欄位
df_rank_map_overview = df_rank_map[['beatmapset_id','beatmap_id',
                                    'genre_id','language_id','status',
                                    'title_unicode','artist_unicode',
                                    'approved_date',
                                    'version','difficulty_rating',
                                    'creator_id',
                                    'favourite_count','playcount']] 
# 欄位資料型態
df_rank_map_overview = df_rank_map_overview.astype({'beatmapset_id':'int64',
                                                    'favourite_count':'int64',
                                                    'playcount':'int64'})
df_rank_map_overview['approved_date'] = pd.to_datetime(df_rank_map_overview['approved_date'], format='%Y-%m-%d %H:%M:%S')

# groupby
df_rank_mapset_overview = df_rank_map_overview.groupby('beatmapset_id').agg({'beatmap_id':'count',
                                                                             'status':'min',
                                                                             'genre_id':'min',
                                                                             'language_id':'min',
                                                                             'title_unicode':'min', 
                                                                             'artist_unicode':'min',
                                                                             'creator_id':'min', 
                                                                             'approved_date':'min', 
                                                                             'favourite_count':'min',
                                                                             'playcount':'sum'}).reset_index(drop=False)


# API找出麻婆國籍  (待優化)
tqdm.pandas()

def get_country(creator_id):
    try:
        q = requests.get('https://osu.ppy.sh/api/get_user?k=13a36d70fd32e2f87fd2a7a89e4f52d54ab337a1&u='+creator_id).json()[0].get('country')
        return q
    except:
        return
    
def get_username(creator_id):
    try:
        q = requests.get('https://osu.ppy.sh/api/get_user?k=13a36d70fd32e2f87fd2a7a89e4f52d54ab337a1&u='+creator_id).json()[0].get('username') 
        return q
    except:
        return

df_country = df_rank_mapset_overview[['creator_id']].drop_duplicates('creator_id', keep='first')
print('add country....')
df_country['country'] = df_country['creator_id'].progress_apply(lambda x: get_country(x)) 

print('add mapper name....')
df_country['mapper'] = df_country['creator_id'].progress_apply(lambda x: get_username(x)) 


df_rank_mapset_overview = pd.merge(df_rank_mapset_overview, df_country, how='left' ,on='creator_id')

Rank    8548
Love     172
Name: status, dtype: int64
clean data....
add country....


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=713.0), HTML(value='')))


add mapper name....


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=713.0), HTML(value='')))




### Done

In [5]:
# 欄位名稱取名
df_rank_mapset_overview = df_rank_mapset_overview.rename(columns={'beatmap_id': 'beatmap_count',
                                                                  'genre_id':'genre',
                                                                  'language_id':'language',
                                                                  'title_unicode':'title', 
                                                                  'artist_unicode':'artist',
                                                                  'creator_id':'mapper_id'
                                                                 }) 
# 縮圖網址
df_rank_mapset_overview['beatmap_thumbnail'] = df_rank_mapset_overview['beatmapset_id'].apply(lambda x: 'https://b.ppy.sh/thumb/'+str(x)+'l.jpg') 
df_rank_mapset_overview['mapper_thumbnail'] = df_rank_mapset_overview['mapper_id'].apply(lambda x: 'http://s.ppy.sh/a/'+str(x))

df_rank_mapset_overview = df_rank_mapset_overview[['beatmapset_id','beatmap_count','title','artist',
                                                     'mapper','mapper_id','approved_date','favourite_count','playcount',
                                                     'genre','language','country','beatmap_thumbnail','mapper_thumbnail','status']]
df_rank_mapset_overview

Unnamed: 0,beatmapset_id,beatmap_count,title,artist,mapper,mapper_id,approved_date,favourite_count,playcount,genre,language,country,beatmap_thumbnail,mapper_thumbnail,status
0,27996,6,All The Small Things,blink-182,Krisom,99269,2020-12-21 07:46:01,72,42063,Rock,English,CL,https://b.ppy.sh/thumb/27996l.jpg,http://s.ppy.sh/a/99269,Rank
1,43237,2,Remotely Combat,IRON ATTACK!,C R E A M,697649,2020-12-25 14:04:29,43,25724,Classical,Instrumental,KR,https://b.ppy.sh/thumb/43237l.jpg,http://s.ppy.sh/a/697649,Rank
2,45031,1,neu,wac,Nakagawa-Kanon,87065,2020-05-29 11:12:05,110,12762,Video Game,English,KR,https://b.ppy.sh/thumb/45031l.jpg,http://s.ppy.sh/a/87065,Love
3,49351,5,Celebrity Status,Marianas Trench,Krisom,99269,2020-08-06 12:25:14,67,67352,Rock,English,CL,https://b.ppy.sh/thumb/49351l.jpg,http://s.ppy.sh/a/99269,Rank
4,51292,1,こわれかけのオルゴール,佐藤ひろ美,kanpakyin,394326,2020-02-10 02:21:57,15,9018,Anime,Japanese,HK,https://b.ppy.sh/thumb/51292l.jpg,http://s.ppy.sh/a/394326,Rank
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2333,1320333,3,ワールドイズマイン,supercell feat. 初音ミク,celerih,4696296,2020-12-23 10:03:57,302,161445,Pop,Japanese,CA,https://b.ppy.sh/thumb/1320333l.jpg,http://s.ppy.sh/a/4696296,Rank
2334,1322692,4,Santa Claus Is Coming to Town,Cö shu Nie,GlaZe,8009626,2020-12-31 07:46:01,13,16969,Pop,English,SE,https://b.ppy.sh/thumb/1322692l.jpg,http://s.ppy.sh/a/8009626,Rank
2335,1323628,1,December (feat. Davey Havok),Seven Lions,Altai,5745865,2020-12-28 01:20:36,51,14500,Metal,English,GB,https://b.ppy.sh/thumb/1323628l.jpg,http://s.ppy.sh/a/5745865,Rank
2336,1324247,6,I Wanna Guinea Pig For Christmas,Parry Gripp,Sotarks,4452992,2020-12-27 01:42:58,120,288609,Rock,English,FR,https://b.ppy.sh/thumb/1324247l.jpg,http://s.ppy.sh/a/4452992,Rank


In [None]:
df_rank_mapset_overview.to_csv('2020_Mapping_Overview_20210112.csv', index=False, encoding='utf_8_sig')

### 其他統計

In [None]:
df_other = df_rank_map[['beatmapset_id','beatmap_id','status','title_unicode','artist_unicode','version','creator_id','creator','hit_length','bpm','storyboard','diff_size','count_normal','count_slider','count_spinner']]
df_other = df_other.astype({'bpm':'float64','hit_length':'int64','storyboard':'int64','diff_size':'float64','count_normal':'int64','count_slider':'int64','count_spinner':'int64'})
df_other['max_combo'] = df_other['count_normal']+df_other['count_slider']+df_other['count_spinner']
df_other['total_length_class'] = df_other['hit_length'].apply(lambda x:'< 1:39'      if x<99           else(
                                                                           '1:39 ~ 3:29' if x>=99 and x<209 else(
                                                                           '3:29 ~ 5:00' if x>=209 and x<300 else '> 5:00')))
df_other['total_length'] = df_other['hit_length'].apply(lambda x: "%02d:%02d" % (divmod(x, 60)[0], divmod(x, 60)[1]))

In [None]:
df_other_mapset = df_other.groupby('beatmapset_id').agg({'beatmap_id':'count',
                                       'status':'max',
                                       'title_unicode':'max',
                                       'artist_unicode':'max',
                                       'creator_id':'max',
                                       'creator':'max',
                                       'bpm':'max',
                                       'storyboard':'max',
                                       'count_normal':'sum',
                                       'count_slider':'sum',
                                       'count_spinner':'sum',
                                       'total_length_class':'max',
                                       'total_length':'max'}).reset_index(drop=False)
df_other_mapset

In [None]:
# CS10
df_other.loc[df_other.diff_size==10][['beatmapset_id','beatmap_id','title_unicode','artist_unicode','version','creator_id','creator']]

In [None]:
# BPM
df_other.sort_values(by='bpm', ascending=False).head(1)[['beatmapset_id','title_unicode','artist_unicode','creator_id','creator','bpm']]

In [None]:
# max_combo
df_other.sort_values(by='max_combo', ascending=False).head(1)[['beatmapset_id','beatmap_id','title_unicode','artist_unicode','version','creator_id','creator','total_length_class','hit_length','max_combo']]