# Data Clean 2

In [1]:
# std library
import os
from datetime import datetime
from pprint import pprint
from typing import Literal

# third-party
import pandas as pd
from IPython.display import HTML, display

# local
from datafilter import table_filter, table_header
from _data_clean import contest_basic_setting, \
	contest_basic_submission_info, leaderboard_fulfill, \
	select_2_strongest, save_contest_data, Leaderboard_Type

In [2]:
# Absolute path of the `__rawdata__` directory under the current working directory.
rawdata_path = os.path.abspath(os.path.join(os.curdir, '__rawdata__'))

In [None]:
# Download the raw data only when the raw-data directory does not exist yet.
rawdata_is_missing = not os.path.exists(rawdata_path)
if rawdata_is_missing:
	# Imported here so that `api` is only loaded when a download actually runs.
	import api
	api.download_rawdata(rawdata_path)

In [3]:
# Submissions
# - TeamId
# - SubmissionDate
# - public / private scores (renamed to short names below)
tbl_submissions = table_filter(
	'Submissions', rawdata_path,
	fields=['Id', 'TeamId', 'SubmissionDate', 'IsAfterDeadline',
		'PublicScoreLeaderboardDisplay', 'PrivateScoreFullPrecision'],
	fields_index=['Id'],
	fields_datetime=['SubmissionDate'],
)
# Clean up in one non-mutating chain. The previous version called
# `drop(..., inplace=True)` / `rename(..., inplace=True)` on a boolean-filtered
# slice, which is the pattern that triggers pandas' SettingWithCopyWarning.
tbl_submissions = (
	# keep only the submissions made before the deadline
	tbl_submissions[~tbl_submissions['IsAfterDeadline']]
	# the flag is only needed for the filter above
	.drop(columns='IsAfterDeadline')
	# shorten the long score column names
	.rename(columns={
		'PublicScoreLeaderboardDisplay': 'PublicScore',
		'PrivateScoreFullPrecision': 'PrivateScore',
	})
	# one team can submit only once at a given time point
	.drop_duplicates(subset=['SubmissionDate', 'TeamId'])
)

In [4]:
# Lookup table built from `Teams`: team `Id` alongside its `CompetitionId`.
map_teams_competitions = table_filter(
	'Teams',
	rawdata_path,
	fields=['Id', 'CompetitionId'],
	fields_index=['Id'],
)

In [5]:
# Attach `CompetitionId` to every submission by looking up its `TeamId`
# in `map_teams_competitions`.
tbl_submissions = tbl_submissions.merge(
	map_teams_competitions,
	how='left',
	left_on='TeamId',
	right_on='Id',
	sort=False,
	suffixes=('', '_Team'),
).drop(columns=['Id_Team'])  # the team's own `Id` duplicates `TeamId`
tbl_submissions.dtypes

Id                         int64
TeamId                     int64
SubmissionDate    datetime64[ns]
PublicScore              float64
PrivateScore             float64
CompetitionId              int64
dtype: object

In [6]:
# Create Table of Contests
# Date-valued columns are listed once and reused, so the selected fields and
# the datetime-parsed fields cannot drift apart. The previous version listed
# 'TeamModelDeadlineDate' twice in both lists.
contest_datetime_fields = [
	'EnabledDate', 'DeadlineDate', 'ProhibitNewEntrantsDeadlineDate',
	'TeamMergerDeadlineDate', 'TeamModelDeadlineDate',
	'ModelSubmissionDeadlineDate',
]
tbl_contests = table_filter(
	'Competitions', rawdata_path,
	fields=[
		'Id',
		*contest_datetime_fields,
		'HasLeaderboard', 'LeaderboardPercentage', 'MaxDailySubmissions',
		'RewardType', 'RewardQuantity', 'NumPrizes',
		'FinalLeaderboardHasBeenVerified', 'EvaluationAlgorithmName', 'Overview', 'Rules',
	],
	fields_datetime=contest_datetime_fields,
	fields_index=['Id'],
)
len(tbl_contests)

9446

In [7]:
def _contest_ids_where(column: str, value) -> pd.Series:
	"""Return the `Id` values of the rows of `tbl_contests` whose `column` equals `value`."""
	return tbl_contests.loc[tbl_contests[column] == value, 'Id']

# Contest ids grouped by reward type
lst_contest_Usd = _contest_ids_where('RewardType', 'USD')
lst_contest_Knowledge = _contest_ids_where('RewardType', 'Knowledge')
lst_contest_Swag = _contest_ids_where('RewardType', 'Swag')
lst_contest_Kudos = _contest_ids_where('RewardType', 'Kudos')
lst_contest_EUR = _contest_ids_where('RewardType', 'EUR')

# Contest ids grouped by number of prizes (1 through 9)
lst_contest_1_prize = _contest_ids_where('NumPrizes', 1)
lst_contest_2_prize = _contest_ids_where('NumPrizes', 2)
lst_contest_3_prize = _contest_ids_where('NumPrizes', 3)
lst_contest_4_prize = _contest_ids_where('NumPrizes', 4)
lst_contest_5_prize = _contest_ids_where('NumPrizes', 5)
lst_contest_6_prize = _contest_ids_where('NumPrizes', 6)
lst_contest_7_prize = _contest_ids_where('NumPrizes', 7)
lst_contest_8_prize = _contest_ids_where('NumPrizes', 8)
lst_contest_9_prize = _contest_ids_where('NumPrizes', 9)

In [8]:
def save_to_device(
		contest_id: int,
		leaderboard_type: Leaderboard_Type,
		prize: float | None = None
):
	"""Save the data of one contest for two selected players.

	Reads the contest's settings from `tbl_contests` via
	`contest_basic_setting`, picks two players with `select_2_strongest`,
	and hands everything to `save_contest_data`.

	Args:
		contest_id: Id of the contest to process.
		leaderboard_type: Forwarded unchanged to `select_2_strongest` and
			`save_contest_data`.
		prize: Prize value forwarded to `save_contest_data`. When None, the
			`total_prize` value returned by `contest_basic_setting` is used.

	Returns:
		None. When `select_2_strongest` returns None, a warning is printed
		and nothing is saved.
	"""
	settings = contest_basic_setting(tbl_contests, contest_id)
	deadline, total_prize, max_daily_submit, percentage = settings

	# Fall back to the contest's own prize when the caller gave none.
	effective_prize = total_prize if prize is None else prize
	assert effective_prize is not None

	pair = select_2_strongest(
		tbl_submissions, contest_id, deadline, leaderboard_type)
	if pair is None:
		print(f'Warning: cannot find 2 players in contest {contest_id}')
		return

	player_i, player_j = pair
	save_contest_data(
		tbl_submissions, contest_id, player_i, player_j,
		deadline, effective_prize, max_daily_submit, percentage,
		leaderboard_type)

### 1. Contests with a single USD prize

In [9]:
# Contests processed with the 'Percentage_Small' leaderboard type. No `prize`
# is passed, so `save_to_device` uses the contest's own total prize.
list_percentage_leaderboard = [
	2435, 2445, 2454, 2464, 2467,
	2478, 2549, 2762, 2860, 3507,
	3526, 3928, 4493,
]
for contest_id in list_percentage_leaderboard:
	save_to_device(contest_id, leaderboard_type='Percentage_Small')



### 2. Contests with 2 prizes

In [10]:
# Per-contest prize value for the 2-prize contests, keyed by contest Id.
# NOTE(review): judging by the name, each value is presumably the gap between
# the 1st and the 2nd prize — confirm against the contest pages.
contest_2prize_gap = {
	2489: 100,
	2499: 2000,
	3023: 4500,
	3353: 6000,
	3366: 200,
	3471: 200,
	3477: 200,
	3509: 200,
	4066: 5000
}

In [11]:
# 2-prize contests processed with the 'Percentage_Small' leaderboard type.
# The prize passed is the per-contest value from `contest_2prize_gap`, the same
# way the 3-prize contests use `contest_3prize_gap`. Without it,
# `save_to_device` would silently fall back to the contest's total prize and
# `contest_2prize_gap` would never be used.
list_percentage_leaderboard = [
	2489, 3353, 3366, 3509,
]
for contest_id in list_percentage_leaderboard:
	save_to_device(contest_id, 'Percentage_Small', prize=contest_2prize_gap[contest_id])

### 3. Contests with 3 prizes

In [None]:
# Per-contest prize value for the 3-prize contests, keyed by contest Id. The
# next cell passes these values as `prize` to `save_to_device`.
# NOTE(review): judging by the name, each value is presumably a gap between
# prize tiers — confirm which tiers.
# The `# small` tag marks exactly the contests that the next cell runs with the
# 'Percentage_Small' leaderboard type. `# normal` presumably marks a different
# leaderboard type, and untagged entries look unclassified — TODO confirm.
contest_3prize_gap = {
	2509: 3000,  # normal
	2518: 3000,  # normal
	2551: 1500,  # small
	2602: 2000,
	2606: 1500,
	2609: 1000,
	2667: 30000,  # small
	2732: 5000,
	2748: 3000,  # normal
	2749: 3000,  # small
	2780: 4000,
	2840: 10000,
	2888: 2000,
	2917: 3000,  # normal
	2969: 6500,
	2975: 12000,  # normal
	2984: 5000,  # normal
	3043: 5000,
	3046: 5000,  # normal
	3064: 200,  # small
	3080: 90,  # small
	3084: 7000,
	3175: 6000,
	3288: 1000,  # small
	3316: 4000,
	3338: 1500,  # small
	3342: 1000,
	3354: 200,
	3517: 100,  # small
	3599: 1000,
	3641: 1500,  # small
	3756: 2000,
	3772: 2500,
	3774: 500,  # small
	3800: 3000,  # small
	3887: 3000,  # normal
	3926: 300,   # small
	3929: 8000,  # small
	3934: 6000,
	3951: 4000,  # normal
	3960: 8000,  # small
	3966: 3000,
	3978: 55000,
	4031: 5000,  # small
	4043: 200,  # small
	4104: 20000,  # small
	4117: 9000,
	4120: 7000,
	4272: 5000,
	4280: 2000,
	4366: 8000,  # small
	4407: 4000,  # small
	4438: 4000,
	4453: 2000,  # small
	4467: 5000,
	4471: 4000,  # normal
	4477: 2000,  # small
	4481: 15000,  # normal
	4488: 2000,  # small
	4521: 2000,
	4571: 30000,
	4594: 5000,
	4657: 4000,  # small
	4699: 5000,  # small
	4729: 75000,
	4852: 5000,
	4853: 8000,
	4986: 10000,  # small
	5048: 10000,
	5056: 5000,  # small
	5144: 20000,  # small
	5174: 3000,  # small
	5229: 15000,  # small
	5260: 4000,
	5261: 10000,  # small
	5340: 5000,
	5357: 5000,  # small
	5390: 4000,  # small
	5497: 4000,  # small
	5558: 10000,
	5916: 20000,  # normal
	6116: 4000,
	6243: 30000,
	6277: 4000,
	6322: 10000,  # small
	6392: 4000,
	6565: 4000,
	6644: 4000,  # normal
	6768: 4000,
	6841: 7000,
	6927: 4000,  # small
	7042: 4000,
	7043: 4000,
	7082: 4000,  # normal
	7115: 4000,  # small
	7162: 1000,  # small
	7163: 1000,
	7277: 4000,
	7380: 10000,
	7391: 5000,
	7456: 500,
	7559: 30000,
	7634: 2000,  # small
	7878: 4000,  # small
	8076: 6000,  # small
	8078: 4000,  # small
	8219: 400,   # small
	8220: 400,
	8310: 10000,
	8311: 10000,
	8396: 500,  # small
	8540: 5000,  # small
	8586: 4000,
	8899: 400,
	9120: 10000,  # small
	9949: 5000,  # small
	9951: 3000,
	9988: 10000,
	9993: 4000,
	10038: 3000,
	10200: 4000,  # small
	10384: 4000,
	10684: 4000,  # small
	10700: 1000,
	10733: 2000,
	10737: 4000,
	11836: 4000,
	12797: 4000,
	12863: 4000,
	13333: 2000,  # small
	14239: 1000,
	14242: 3000,  # small
	14420: 20000,  # small
	14897: 3000,
	15696: 35000,
	15768: 4000,
	16245: 4000,
	16295: 2000,
	17233: 2000,
	18045: 4000,  # small
	18329: 1000,
	18647: 4000,
	19018: 8000,  # small
	19231: 4000,
	19233: 4000,
	19596: 4000,
	19989: 4000,
	19991: 4000,  # small
	20270: 4000,  # small
	20604: 15000,
	21669: 1000,  # small
	21723: 1000,  # normal
	22111: 4000,
	22422: 10000,
	22559: 2000,
	22962: 4000,  # small
	23249: 1000,  # small
	23652: 1000,  # small
	23823: 4000,
	24286: 5000,
	24800: 6000,
	25383: 4000,
	25954: 1000,
	26933: 2000,
	27783: 3000,
	27923: 4000,
	29594: 100,
	33246: 1000,
	37077: 4000,  # small
	37190: 4000,
	37244: 4000,
	37333: 2000,
	38128: 8000,  # small
	38257: 6000,
	38760: 5000,  # small
	59291: 5000,
}

In [None]:
# 3-prize contests processed with the 'Percentage_Small' leaderboard type.
# Each contest's prize is looked up in `contest_3prize_gap`.
list_percentage_leaderboard = [
	2551, 2667, 2749, 3064, 3080, 3288, 3338, 3517, 3641, 3774,
	3800, 3926, 3929, 3960, 4031, 4043, 4104, 4366, 4407, 4453,
	4477, 4488, 4657, 4699, 4986, 5056, 5144, 5174, 5229, 5261,
	5357, 5390, 5497, 6322, 6927, 7115, 7162, 7634, 7878, 8076,
	8078, 8219, 8396, 8540, 9120, 9949, 10200, 10684, 13333, 14242,
	14420, 18045, 19018, 19991, 20270, 21669, 22962, 23249, 23652, 37077,
	38128, 38760,
]
for contest_id in list_percentage_leaderboard:
	save_to_device(
		contest_id,
		'Percentage_Small',
		prize=contest_3prize_gap[contest_id],
	)

