# Explore `meta-kaggle` Dataset

## --- Submissions

In [1]:
# std library
import os
from datetime import datetime
from pprint import pprint
from typing import Literal

# third-party
import pandas as pd
from IPython.display import HTML, display

# local
from datafilter import table_filter, table_header
from _data_clean import contest_basic_setting, \
	contest_basic_submission_info, leaderboard_fulfill, \
	select_2_strongest, save_contest_data, Leaderboard_Type

In [2]:
# Folder holding the raw Meta Kaggle files, resolved against the current working directory
rawdata_path = os.path.abspath(os.path.join(os.curdir, '__rawdata__'))

Download the raw data:

1. Install the Python package `kaggle`.
2. [Create an API token](https://www.kaggle.com/docs/api).
3. Put the `kaggle.json` file in the `.kaggle` folder of your home directory.
4. Download the dataset `meta-kaggle`, if you haven't yet, by executing:

In [None]:
if os.path.exists(rawdata_path):
	pass  # raw data folder already present; nothing to download
else:
	# `api` is only needed for the download, so it is imported lazily here
	import api
	api.download_rawdata(rawdata_path)

Dataset URL: https://www.kaggle.com/datasets/kaggle/meta-kaggle


### 1. Mapping Datasets

Create a dataset of submissions, containing the following fields:
- Id 
- TeamId 
- CompetitionId 
- SubmissionDate
- PublicScore
- PrivateScore

In [None]:
# Submissions
# - TeamId
# - SubmissionDate
tbl_submissions = table_filter(
	'Submissions', rawdata_path,
	fields = ['Id', 'TeamId', 'SubmissionDate', 'IsAfterDeadline',
		'PublicScoreLeaderboardDisplay', 'PrivateScoreFullPrecision'],
	fields_index = ['Id'],
	fields_datetime = ['SubmissionDate'],
)
# Clean-up is written as one non-mutating chain: in-place `drop`/`rename`
# on a boolean-filtered frame can trigger pandas' SettingWithCopyWarning.
tbl_submissions = (
	tbl_submissions
	# drop all submissions after deadlines
	.loc[~tbl_submissions['IsAfterDeadline']]
	.drop(columns='IsAfterDeadline')
	# rename long name
	.rename(columns={
		'PublicScoreLeaderboardDisplay': 'PublicScore',
		'PrivateScoreFullPrecision': 'PrivateScore'
	})
	# drop duplicated
	## One team can submit only once at a time point
	.drop_duplicates(subset=['SubmissionDate', 'TeamId'])
)

In [None]:
# Team -> Competition lookup: keeps only the team `Id` and its `CompetitionId`
map_teams_competitions = table_filter(
	'Teams',
	rawdata_path,
	fields=['Id', 'CompetitionId'],
	fields_index=['Id'],
)

In [None]:
# Attach `CompetitionId` to each submission through its `TeamId`.
# The team-side `Id` column comes back as `Id_Team` and is removed right away.
tbl_submissions = (
	tbl_submissions
	.merge(
		map_teams_competitions,
		how='left', left_on='TeamId', right_on='Id',
		sort=False, suffixes=('', '_Team'),
	)
	.drop(columns='Id_Team')
)
tbl_submissions.dtypes

In [None]:
# Show all fields of the raw `Competitions` table
# (useful for choosing the `fields` passed to `table_filter` in the next cell).
table_header('Competitions', rawdata_path)

In [None]:
# Create Table of Contests
# The date columns are declared once and reused for both `fields` and
# `fields_datetime`, so the two lists cannot drift apart. The original listed
# 'TeamModelDeadlineDate' twice in each of them.
contest_date_fields = [
	'EnabledDate', 'DeadlineDate', 'ProhibitNewEntrantsDeadlineDate',
	'TeamMergerDeadlineDate', 'TeamModelDeadlineDate',
	'ModelSubmissionDeadlineDate',
]
tbl_contests = table_filter(
	'Competitions', rawdata_path,
	fields=[
		'Id',
		*contest_date_fields,
		'HasLeaderboard', 'LeaderboardPercentage', 'MaxDailySubmissions',
		'RewardType', 'RewardQuantity', 'NumPrizes',
		'FinalLeaderboardHasBeenVerified', 'EvaluationAlgorithmName', 'Overview', 'Rules'
	],
	fields_datetime=list(contest_date_fields),
	fields_index=['Id'],
)
len(tbl_contests)

In [None]:
def save_to_device(
		contest_id: int,
		leaderboard_type: Leaderboard_Type,
		prize: float | None = None
):
	"""Pick two players of a contest and hand the contest data to `save_contest_data`.

	Reads the notebook-level tables `tbl_contests` and `tbl_submissions`.

	Args:
		contest_id: `Id` of the contest to export.
		leaderboard_type: forwarded to `select_2_strongest` and `save_contest_data`.
		prize: prize value passed to `save_contest_data`. When None, the total
			prize returned by `contest_basic_setting` is used instead.

	Returns:
		None. If `select_2_strongest` returns None, a warning is printed and
		nothing is saved.
	"""
	contest_setting = contest_basic_setting(tbl_contests, contest_id)
	deadline, total_prize, max_daily_submit, percentage = contest_setting

	# Fall back to the contest's own total prize when no override is given
	prize_value = total_prize if prize is None else prize
	assert prize_value is not None

	strongest_pair = select_2_strongest(
		tbl_submissions, contest_id, deadline, leaderboard_type)
	if strongest_pair is None:
		print(f'Warning: cannot find 2 players in contest {contest_id}')
		return

	first_player, second_player = strongest_pair
	save_contest_data(
		tbl_submissions, contest_id, first_player, second_player,
		deadline, prize_value, max_daily_submit, percentage, leaderboard_type)

### 2. Splitting Contests by Types

In [None]:
print('>>> Filter 1: How many contests are there having records of submissions from players?')
lst_contests_with_submissions = tbl_submissions['CompetitionId'].unique()
print(len(lst_contests_with_submissions))

# Filter 1: keep contests that occur in the submissions table,
# then keep only the submissions that belong to those contests
print('>>> We only consider these contests')
contest_has_submissions = tbl_contests['Id'].isin(lst_contests_with_submissions)
tbl_contests = tbl_contests.loc[contest_has_submissions]
submission_in_scope = tbl_submissions['CompetitionId'].isin(tbl_contests['Id'])
tbl_submissions = tbl_submissions[submission_in_scope]

In [None]:
print('>>> Filter 2: In above, how many contests are there having public leaderboard?')
lst_contests_with_leaderboard = tbl_contests.loc[tbl_contests['HasLeaderboard'].eq(True), 'Id']
# Report the size of the leaderboard list (the original printed the
# Filter 1 list again, so the count shown did not answer the question).
print(len(lst_contests_with_leaderboard))

# Filter 2: keep contests with a leaderboard, then only their submissions
print('>>> We only consider these contests')
tbl_contests = tbl_contests[tbl_contests['Id'].isin(lst_contests_with_leaderboard)]
tbl_submissions = tbl_submissions.loc[tbl_submissions['CompetitionId'].isin(tbl_contests['Id'])]

In [None]:
print('>>> Is there never-ending contests?')
forever_date = datetime(2029, 1, 1)
contest_deadlines = tbl_contests['DeadlineDate']
# First count: deadlines later than `forever_date`; second count: missing deadlines
for flagged in (contest_deadlines > forever_date, contest_deadlines.isna()):
	print(flagged.sum())

In [None]:
print('>>> How many reward types are there?')
# Missing reward types are counted as their own category
reward_type_counts = tbl_contests['RewardType'].value_counts(dropna=False)
print(reward_type_counts)

In [None]:
print('>>> List of the number of contests providing multiple prize:')
# Number of contests per `NumPrizes` value
num_prizes_counts = tbl_contests['NumPrizes'].value_counts()
pprint(num_prizes_counts)

In [None]:
def _contest_ids_where(column, value):
	"""Return the `Id` series of the rows of `tbl_contests` whose `column` equals `value`."""
	return tbl_contests.loc[tbl_contests[column] == value, 'Id']

# Split contests with prize type
lst_contest_Usd = _contest_ids_where('RewardType', 'USD')
lst_contest_Knowledge = _contest_ids_where('RewardType', 'Knowledge')
lst_contest_Swag = _contest_ids_where('RewardType', 'Swag')
lst_contest_Kudos = _contest_ids_where('RewardType', 'Kudos')
lst_contest_EUR = _contest_ids_where('RewardType', 'EUR')

# Split contests with prize number
lst_contest_1_prize = _contest_ids_where('NumPrizes', 1)
lst_contest_2_prize = _contest_ids_where('NumPrizes', 2)
lst_contest_3_prize = _contest_ids_where('NumPrizes', 3)
lst_contest_4_prize = _contest_ids_where('NumPrizes', 4)
lst_contest_5_prize = _contest_ids_where('NumPrizes', 5)
lst_contest_6_prize = _contest_ids_where('NumPrizes', 6)
lst_contest_7_prize = _contest_ids_where('NumPrizes', 7)
lst_contest_8_prize = _contest_ids_where('NumPrizes', 8)
lst_contest_9_prize = _contest_ids_where('NumPrizes', 9)

In [None]:
print('>>> In the list of contest with 1 single prize, how many of them has positive reward quantity?')
tbl_contest_1_prize = tbl_contests.loc[tbl_contests['Id'].isin(lst_contest_1_prize)]
tbl_contest_1_prize_has_reward = tbl_contest_1_prize.loc[tbl_contest_1_prize['RewardQuantity'] > 0]
print(len(tbl_contest_1_prize_has_reward))

# Headers go through `print`: `pprint` on a string shows its repr, i.e. the
# text wrapped in quotes, unlike the other headers in this notebook.
print('>>> List their reward types:')
pprint(tbl_contest_1_prize_has_reward['RewardType'].value_counts())

In [None]:
print('>>> In the list of contest with 3 prizes, how many of them has positive reward quantity?')
tbl_contest_3_prize = tbl_contests.loc[tbl_contests['Id'].isin(lst_contest_3_prize)]
tbl_contest_3_prize_has_reward = tbl_contest_3_prize.loc[tbl_contest_3_prize['RewardQuantity'] > 0]
print(len(tbl_contest_3_prize_has_reward))

# Headers go through `print`: `pprint` on a string shows its repr, i.e. the
# text wrapped in quotes, unlike the other headers in this notebook.
print('>>> List their reward types:')
pprint(tbl_contest_3_prize_has_reward['RewardType'].value_counts())

In [None]:
# The cell counts contests per `NumPrizes` value, so the header names the
# number of prizes (it previously announced the reward quantity).
print('>>> In the list of contest with USD prize, list the number of prizes:')
tbl_contest_Usd_prize = tbl_contests.loc[tbl_contests['Id'].isin(lst_contest_Usd)]
pprint(tbl_contest_Usd_prize['NumPrizes'].value_counts())

### 3. Select contest providing 1 single USD prize

In [None]:
# select: USD reward type and exactly one prize
is_usd = tbl_contests['Id'].isin(lst_contest_Usd)
is_single_prize = tbl_contests['Id'].isin(lst_contest_1_prize)

# filter: strictly positive reward quantity
has_positive_reward = tbl_contests['RewardQuantity'] > 0

tbl_contest_1_Usd_prize = tbl_contests[is_usd & is_single_prize & has_positive_reward]
lst_contest_1_Usd_prize = tbl_contest_1_Usd_prize['Id'].values

In [None]:
# Show the ids of the contests with one positive USD prize (array built in the previous cell)
lst_contest_1_Usd_prize

Note: 
Finally, we pick the following 13 contests:

2435 (big), 2445 (small), 2454 (small), 2464 (small), 2467 (normal), 2478 (small),<br>
2549 (small), 2762 (small), 2860 (small), 3507 (small), 3526 (small), 3928 (small),<br>
4493 (small)


In [None]:
# Contest under inspection
contest_id = 3370

# Settings returned by `contest_basic_setting` for that contest
contest_setting = contest_basic_setting(tbl_contests, contest_id)
deadline, total_prize, max_daily_submit, percentage = contest_setting
print('>>> prize =', total_prize)
print('>>> percentage =', percentage, '%')
print('>>> daily submit (max) =', max_daily_submit)
print('>>> deadline =', deadline)

# Basic submission info of this contest
tbl_submissions_specific = contest_basic_submission_info(tbl_submissions, contest_id)

# Build both leaderboards, then show the private one first and the public one second
leaderboards = leaderboard_fulfill(tbl_submissions_specific, deadline, 'Normal')
leaderboard_pub, leaderboard_pri = leaderboards
for board in (leaderboard_pri, leaderboard_pub):
	display(board.display(-1, 10))

In [None]:
# Export the contest selected above. Exactly one leaderboard type is active;
# the commented lines are the alternatives to switch to by hand.
# No `prize` is passed, so `save_to_device` falls back to the contest's total prize.
# save_to_device(contest_id, 'Percentage_Big')
save_to_device(contest_id, 'Percentage_Small')
# save_to_device(contest_id, 'Normal')

### 4. For those Contests with 2 Prizes

In [None]:
print('>>> In the list of contest with 2 positive USD prizes, what are the values of the prizes?')
# USD reward type, exactly two prizes, strictly positive reward quantity
is_usd = tbl_contests['Id'].isin(lst_contest_Usd)
is_two_prizes = tbl_contests['Id'].isin(lst_contest_2_prize)
has_positive_reward = tbl_contests['RewardQuantity'] > 0
tbl_contest_2_Usd_prize = tbl_contests[is_usd & is_two_prizes & has_positive_reward]
pprint(tbl_contest_2_Usd_prize['Id'].values)

Note: 
 - 2489: 2 $250
 - 2499: $5,000 vs $3,000 
 - 3023: $7,000 vs $2,500
 - 3353: $8,000 vs $2,000
 - 3366: $350 vs $150
 - 3403: complicated
 - 3471: $350 vs $150
 - 3477: $350 vs $150
 - 3509: $350 vs $150
 - 4066: $10,000 vs $5,000

Finally, we pick:

2489 (small), 3353 (small), 3366 (small), 3509 (small)

In [None]:
# Hand-maintained value per 2-prize contest id, looked up as
# `contest_2prize_gap[contest_id]` and passed to `save_to_device(..., prize=...)`.
# For the contests whose two prizes are listed in the note above, the value
# equals first prize minus second prize (e.g. 2499: 5000 - 3000 = 2000).
# NOTE(review): the note gives "2 $250" for 2489 while the value here is 100 — confirm.
# 3403 ("complicated" in the note) is intentionally absent.
contest_2prize_gap = {
	2489: 100,
	2499: 2000,
	3023: 4500,
	3353: 6000,
	3366: 200,
	3471: 200,
	3477: 200,
	3509: 200,
	4066: 5000
}

In [None]:
## check prize info by:
# display(HTML(tbl_contest_2_Usd_prize.loc[tbl_contest_2_Usd_prize['Id']==4066, ['Overview']].values[0][0]))

In [None]:
# Contest under inspection
contest_id = 3509

# Settings returned by `contest_basic_setting` for that contest
contest_setting = contest_basic_setting(tbl_contests, contest_id)
deadline, total_prize, max_daily_submit, percentage = contest_setting
print('>>> percentage =', percentage, '%')
print('>>> daily submit (max) =', max_daily_submit)
print('>>> deadline =', deadline)

# Basic submission info of this contest
tbl_submissions_specific = contest_basic_submission_info(tbl_submissions, contest_id)

# Build both leaderboards, then show the private one first and the public one second
leaderboards = leaderboard_fulfill(tbl_submissions_specific, deadline, 'Normal')
leaderboard_pub, leaderboard_pri = leaderboards
for board in (leaderboard_pri, leaderboard_pub):
	display(board.display(-1, 10))

In [None]:
# Export the contest selected above, overriding the prize with its entry in
# `contest_2prize_gap`. Exactly one leaderboard type is active; the commented
# lines are the alternatives to switch to by hand.
# save_to_device(contest_id, 'Percentage_Big', prize=contest_2prize_gap[contest_id])
save_to_device(contest_id, 'Percentage_Small', prize=contest_2prize_gap[contest_id])
# save_to_device(contest_id, 'Normal', prize=contest_2prize_gap[contest_id])

### 5. For those Contests with 3 Prizes

In [None]:
print('>>> In the list of contest with 3 positive USD prizes, what are the values of the prizes?')
# USD reward type, exactly three prizes, strictly positive reward quantity
is_usd = tbl_contests['Id'].isin(lst_contest_Usd)
is_three_prizes = tbl_contests['Id'].isin(lst_contest_3_prize)
has_positive_reward = tbl_contests['RewardQuantity'] > 0
tbl_contest_3_Usd_prize = tbl_contests[is_usd & is_three_prizes & has_positive_reward]
pprint(tbl_contest_3_Usd_prize['Id'].values)

In [None]:
# Hand-maintained value per 3-prize contest id, looked up as
# `contest_3prize_gap[contest_id]` and passed to `save_to_device(..., prize=...)`.
# NOTE(review): by analogy with `contest_2prize_gap` the values are presumably
# prize gaps — confirm which prizes the gap is taken between.
# NOTE(review): the trailing `# small` / `# normal` tags presumably record the
# leaderboard type chosen for that contest; untagged entries look unreviewed — confirm.
contest_3prize_gap = {
	2509: 3000,  # normal
	2518: 3000,  # normal
	2551: 1500,  # small
	2602: 2000,
	2606: 1500,
	2609: 1000,
	2667: 30000,  # small
	2732: 5000,
	2748: 3000,  # normal
	2749: 3000,  # small
	2780: 4000,
	2840: 10000,
	2888: 2000,
	2917: 3000,  # normal
	2969: 6500,
	2975: 12000,  # normal
	2984: 5000,  # normal
	3043: 5000,
	3046: 5000,  # normal
	3064: 200,  # small
	3080: 90,  # small
	3084: 7000,
	3175: 6000,
	3288: 1000,  # small
	3316: 4000,
	3338: 1500,  # small
	3342: 1000,
	3354: 200,
	3517: 100,  # small
	3599: 1000,
	3641: 1500,  # small
	3756: 2000,
	3772: 2500,
	3774: 500,  # small
	3800: 3000,  # small
	3887: 3000,  # normal
	3926: 300,   # small
	3929: 8000,  # small
	3934: 6000,
	3951: 4000,  # normal
	3960: 8000,  # small
	3966: 3000,
	3978: 55000,
	4031: 5000,  # small
	4043: 200,  # small
	4104: 20000,  # small
	4117: 9000,
	4120: 7000,
	4272: 5000,
	4280: 2000,
	4366: 8000,  # small
	4407: 4000,  # small
	4438: 4000,
	4453: 2000,  # small
	4467: 5000,
	4471: 4000,  # normal
	4477: 2000,  # small
	4481: 15000,  # normal
	4488: 2000,  # small
	4521: 2000,
	4571: 30000,
	4594: 5000,
	4657: 4000,  # small
	4699: 5000,  # small
	4729: 75000,
	4852: 5000,
	4853: 8000,
	4986: 10000,  # small
	5048: 10000,
	5056: 5000,  # small
	5144: 20000,  # small
	5174: 3000,  # small
	5229: 15000,  # small
	5260: 4000,
	5261: 10000,  # small
	5340: 5000,
	5357: 5000,  # small
	5390: 4000,  # small
	5497: 4000,  # small
	5558: 10000,
	5916: 20000,  # normal
	6116: 4000,
	6243: 30000,
	6277: 4000,
	6322: 10000,  # small
	6392: 4000,
	6565: 4000,
	6644: 4000,  # normal
	6768: 4000,
	6841: 7000,
	6927: 4000,  # small
	7042: 4000,
	7043: 4000,
	7082: 4000,  # normal
	7115: 4000,  # small
	7162: 1000,  # small
	7163: 1000,
	7277: 4000,
	7380: 10000,
	7391: 5000,
	7456: 500,
	7559: 30000,
	7634: 2000,  # small
	7878: 4000,  # small
	8076: 6000,  # small
	8078: 4000,  # small
	8219: 400,   # small
	8220: 400,
	8310: 10000,
	8311: 10000,
	8396: 500,  # small
	8540: 5000,  # small
	8586: 4000,
	8899: 400,
	9120: 10000,  # small
	9949: 5000,  # small
	9951: 3000,
	9988: 10000,
	9993: 4000,
	10038: 3000,
	10200: 4000,  # small
	10384: 4000,
	10684: 4000,  # small
	10700: 1000,
	10733: 2000,
	10737: 4000,
	11836: 4000,
	12797: 4000,
	12863: 4000,
	13333: 2000,  # small
	14239: 1000,
	14242: 3000,  # small
	14420: 20000,  # small
	14897: 3000,
	15696: 35000,
	15768: 4000,
	16245: 4000,
	16295: 2000,
	17233: 2000,
	18045: 4000,  # small
	18329: 1000,
	18647: 4000,
	19018: 8000,  # small
	19231: 4000,
	19233: 4000,
	19596: 4000,
	19989: 4000,
	19991: 4000,  # small
	20270: 4000,  # small
	20604: 15000,
	21669: 1000,  # small
	21723: 1000,  # normal
	22111: 4000,
	22422: 10000,
	22559: 2000,
	22962: 4000,  # small
	23249: 1000,  # small
	23652: 1000,  # small
	23823: 4000,
	24286: 5000,
	24800: 6000,
	25383: 4000,
	25954: 1000,
	26933: 2000,
	27783: 3000,
	27923: 4000,
	29594: 100,
	33246: 1000,
	37077: 4000,  # small
	37190: 4000,
	37244: 4000,
	37333: 2000,
	38128: 8000,  # small
	38257: 6000,
	38760: 5000,  # small
	59291: 5000,
}

In [None]:
# Check prize info: render the `Overview` HTML of one 3-prize contest
contest_id = 38760

is_selected_contest = tbl_contest_3_Usd_prize['Id'] == contest_id
# First matching row only; raises IndexError when the id is not in the table
overview_html = tbl_contest_3_Usd_prize.loc[is_selected_contest, 'Overview'].values[0]
display(HTML(overview_html))

In [None]:
# Settings returned by `contest_basic_setting` for the contest chosen in the previous cell
contest_setting = contest_basic_setting(tbl_contests, contest_id)
deadline, total_prize, max_daily_submit, percentage = contest_setting
print('>>> percentage =', percentage, '%')
print('>>> daily submit (max) =', max_daily_submit)
print('>>> deadline =', deadline)

# Basic submission info of this contest
tbl_submissions_specific = contest_basic_submission_info(tbl_submissions, contest_id)

# Build both leaderboards, then show the private one first and the public one second
leaderboards = leaderboard_fulfill(tbl_submissions_specific, deadline, 'Normal')
leaderboard_pub, leaderboard_pri = leaderboards
for board in (leaderboard_pri, leaderboard_pub):
	display(board.display(-1, 10))

In [None]:
# Export the contest selected above, overriding the prize with its entry in
# `contest_3prize_gap`. Exactly one leaderboard type is active; the commented
# lines are the alternatives to switch to by hand.
# save_to_device(contest_id, 'Percentage_Big', prize=contest_3prize_gap[contest_id])
save_to_device(contest_id, 'Percentage_Small', prize=contest_3prize_gap[contest_id])
# save_to_device(contest_id, 'Normal', prize=contest_3prize_gap[contest_id])