# Explore `meta-kaggle` Dataset

## --- Submissions

Pre-requisites: 

1. Install Python package `kaggle`
2. [Create token](https://www.kaggle.com/docs/api) and 
3. Put the `kaggle.json` file to the `.kaggle` folder of your home directory
4. Download dataset `meta-kaggle` if haven't yet, by excuting: 
   ```python
   import api
   api.download_rawdata(rawdata_path)
   ```

In [1]:
# std library
import os
from datetime import datetime
from pprint import pprint

# third-party
import pandas as pd

# local
from datafilter import table_filter, table_header

In [2]:
rawdata_path = os.path.abspath('./__rawdata__')

### 1. Mapping Datasets

Create a dataset of submissions, containing the following fields:
- Id 
- TeamId 
- CompetitionId 
- SubmissionDate
- PublicScore
- PrivateScore

In [3]:
# Submissions
# - TeamId
# - SubmissionDate
tbl_submissions = table_filter( \
	'Submissions', rawdata_path,
	fields = ['Id', 'TeamId', 'SubmissionDate', 'IsAfterDeadline',
		'PublicScoreLeaderboardDisplay', 'PrivateScoreFullPrecision'],
	fields_index = ['Id'],
	fields_datetime = ['SubmissionDate'],
)
# drop all submissions after deadlines
tbl_submissions = tbl_submissions[~tbl_submissions['IsAfterDeadline']]
tbl_submissions.drop('IsAfterDeadline', axis=1, inplace=True)
# rename long name
tbl_submissions.rename(columns={
	'PublicScoreLeaderboardDisplay': 'PublicScore',
	'PrivateScoreFullPrecision': 'PrivateScore'
}, inplace=True)
# drop duplicated
## One team can submit only once at a time point
tbl_submissions = tbl_submissions.drop_duplicates(subset=['SubmissionDate', 'TeamId'])

In [4]:
# Mapping: Teams
# - CompetitionId
map_teams_competitions = table_filter( \
	'Teams', rawdata_path,
	fields = ['Id', 'CompetitionId'],
	fields_index = ['Id'],
)

In [5]:
# Merge `Submissions` and `TeamsCompetition`
tbl_submissions = pd.merge(tbl_submissions, map_teams_competitions,
	how = 'left', left_on = 'TeamId', right_on = 'Id', sort = False,
	suffixes=('', '_Team'))
tbl_submissions.drop(columns='Id_Team', inplace=True)
tbl_submissions.dtypes

Id                         int64
TeamId                     int64
SubmissionDate    datetime64[ns]
PublicScore              float64
PrivateScore             float64
CompetitionId              int64
dtype: object

### 2. Splitting Contests by Types

In [6]:
# All fields in `Competitions`
table_header('Competitions', rawdata_path)

{'Id': ['2408'],
 'Slug': ['Eurovision2010'],
 'Title': ['Forecast Eurovision Voting'],
 'Subtitle': ["This competition requires contestants to forecast the voting for this year's Eurovision Song Contest in Norway on May 25th, 27th and 29th."],
 'HostSegmentTitle': ['Featured'],
 'ForumId': ['2'],
 'OrganizationId': [''],
 'EnabledDate': ['04/07/2010 07:57:43'],
 'DeadlineDate': ['05/25/2010 18:00:00'],
 'ProhibitNewEntrantsDeadlineDate': [''],
 'TeamMergerDeadlineDate': [''],
 'TeamModelDeadlineDate': [''],
 'ModelSubmissionDeadlineDate': [''],
 'FinalLeaderboardHasBeenVerified': ['True'],
 'HasKernels': ['True'],
 'OnlyAllowKernelSubmissions': ['False'],
 'HasLeaderboard': ['False'],
 'LeaderboardPercentage': ['10'],
 'ScoreTruncationNumDecimals': ['5'],
 'EvaluationAlgorithmAbbreviation': ['AE'],
 'EvaluationAlgorithmName': ['Absolute Error'],
 'EvaluationAlgorithmDescription': ['Sum of absolute values of all errors.'],
 'EvaluationAlgorithmIsMax': ['False'],
 'MaxDailySubmissions':

In [7]:
# Create Table of Contests
tbl_contests = table_filter( \
	'Competitions', rawdata_path,
	fields=['Id',
		'EnabledDate', 'DeadlineDate', 'ProhibitNewEntrantsDeadlineDate',
		'TeamMergerDeadlineDate', 'TeamModelDeadlineDate', 'TeamModelDeadlineDate',
		'ModelSubmissionDeadlineDate',
		'HasLeaderboard', 'LeaderboardPercentage', 'MaxDailySubmissions',
		'RewardType', 'RewardQuantity', 'NumPrizes',
		'FinalLeaderboardHasBeenVerified', 'EvaluationAlgorithmName'
	],
	fields_datetime=[
		'EnabledDate', 'DeadlineDate', 'ProhibitNewEntrantsDeadlineDate',
		'TeamMergerDeadlineDate', 'TeamModelDeadlineDate', 'TeamModelDeadlineDate',
		'ModelSubmissionDeadlineDate'
	],
	fields_index=['Id'],
)
len(tbl_contests)

9446

In [8]:
print('>>> Filter 1: How many contests are there having records of submissions from players?')
lst_contests_with_submissions = tbl_submissions['CompetitionId'].unique()
print(len(lst_contests_with_submissions))

# Filter 1
print('>>> We only consider these contests')
tbl_contests = tbl_contests[tbl_contests['Id'].isin(lst_contests_with_submissions)]
tbl_submissions = tbl_submissions.loc[tbl_submissions['CompetitionId'].isin(tbl_contests['Id'])]

>>> Filter 1: How many contests are there having records of submissions from players?
5694
>>> We only consider these contests


In [9]:
print('>>> Filter 2: In above, how many contests are there having public leaderboard?')
lst_contests_with_leaderboard = tbl_contests.loc[tbl_contests['HasLeaderboard']==True]['Id']
print(len(lst_contests_with_submissions))

# Filter 2
print('>>> We only consider these contests')
tbl_contests = tbl_contests[tbl_contests['Id'].isin(lst_contests_with_leaderboard)]
tbl_submissions = tbl_submissions.loc[tbl_submissions['CompetitionId'].isin(tbl_contests['Id'])]

>>> Filter 2: In above, how many contests are there having public leaderboard?
5694
>>> We only consider these contests


In [10]:
print('>>> Is there never-ending contests?')
forever_date = datetime(2029, 1, 1, 0, 0, 0)
print((tbl_contests['DeadlineDate'] > forever_date).sum())
print((tbl_contests['DeadlineDate'].isna()).sum())

>>> Is there never-ending contests?
0
0


In [11]:
print('>>> How many reward types are there?')
print(tbl_contests['RewardType'].value_counts(dropna=False))

>>> How many reward types are there?
RewardType
NaN          4778
USD           407
Knowledge     381
Swag           95
Jobs           14
Kudos          10
Prizes          6
EUR             1
Name: count, dtype: int64


In [12]:
print('>>> List of the number of contests providing multiple prize:')
pprint(tbl_contests['NumPrizes'].value_counts())

>>> List of the number of contests providing multiple prize:
NumPrizes
1     4722
0      498
3      292
5       82
4       28
10      18
6       16
2       11
8       10
7        9
9        4
13       1
12       1
Name: count, dtype: int64


In [13]:
# Split contests with prize type
lst_contest_Usd = tbl_contests.loc[tbl_contests['RewardType']=='USD']['Id']
lst_contest_Knowledge = tbl_contests.loc[tbl_contests['RewardType']=='Knowledge']['Id']
lst_contest_Swag = tbl_contests.loc[tbl_contests['RewardType']=='Swag']['Id']
lst_contest_Kudos = tbl_contests.loc[tbl_contests['RewardType']=='Kudos']['Id']
lst_contest_EUR = tbl_contests.loc[tbl_contests['RewardType']=='EUR']['Id']

# Split contests with prize number
lst_contest_1_prize = tbl_contests.loc[tbl_contests['NumPrizes']==1]['Id']
lst_contest_2_prize = tbl_contests.loc[tbl_contests['NumPrizes']==2]['Id']
lst_contest_3_prize = tbl_contests.loc[tbl_contests['NumPrizes']==3]['Id']
lst_contest_4_prize = tbl_contests.loc[tbl_contests['NumPrizes']==4]['Id']
lst_contest_5_prize = tbl_contests.loc[tbl_contests['NumPrizes']==5]['Id']
lst_contest_6_prize = tbl_contests.loc[tbl_contests['NumPrizes']==6]['Id']
lst_contest_7_prize = tbl_contests.loc[tbl_contests['NumPrizes']==7]['Id']
lst_contest_8_prize = tbl_contests.loc[tbl_contests['NumPrizes']==8]['Id']
lst_contest_9_prize = tbl_contests.loc[tbl_contests['NumPrizes']==9]['Id']

### 3. Select contest providing single USD prize

In [14]:
# select
tbl_contest_1_Usd_prize = tbl_contests[tbl_contests['Id'].isin(lst_contest_Usd)]
tbl_contest_1_Usd_prize = tbl_contest_1_Usd_prize[tbl_contest_1_Usd_prize['Id'].isin(lst_contest_1_prize)]

# filter
tbl_contest_1_Usd_prize = tbl_contest_1_Usd_prize.loc[tbl_contest_1_Usd_prize['RewardQuantity'] > 0]
lst_contest_1_Usd_prize = tbl_contest_1_Usd_prize['Id'].values

In [15]:
lst_contest_1_Usd_prize, len(lst_contest_1_Usd_prize)

(array([2435, 2445, 2448, 2452, 2454, 2464, 2467, 2478, 2479, 2487, 2488,
        2496, 2549, 2589, 2762, 2860, 2895, 2963, 3065, 3294, 3364, 3370,
        3377, 3385, 3386, 3469, 3493, 3507, 3521, 3526, 3586, 3706, 3867,
        3928, 3973, 3984, 4195, 4378, 4383, 4493, 4495, 4704]),
 42)

### 4. Work on all Contests

In [16]:
from _data_clean import \
	contest_basic_setting, \
	contest_basic_submission_info, \
	leaderboard_fulfill, \
	select_2_strongest, \
	save_contest_data

In [17]:
for contest_id in lst_contest_1_Usd_prize:
	deadline, prize, max_daily_submit, percentage = contest_basic_setting(tbl_contests, contest_id)
	players = select_2_strongest(tbl_submissions, contest_id, deadline)
	if players is None:
		print(f'Warning: cannot find 2 players in contest {contest_id}')
		continue
	else:
		player_i, player_j = players
	save_contest_data( \
		tbl_submissions, contest_id, player_i, player_j, deadline, prize, max_daily_submit, percentage)



#### For Those Failed, Double Check 

In [18]:
# select one
contest_id = 2448

# Contest settings
deadline, prize, max_daily_submit, percentage = contest_basic_setting(tbl_contests, contest_id)
print('>>> prize =', prize)
print('>>> percentage =', percentage, '%')
print('>>> daily submit (max) =', max_daily_submit)
print('>>> deadline =', deadline)

# Display basic info
tbl_submissions_specific = contest_basic_submission_info(tbl_submissions, contest_id)

# Create leaderboard
leaderboard_pub, leaderboard_pri = leaderboard_fulfill(tbl_submissions_specific, deadline)

>>> prize = 500.0
>>> percentage = 20 %
>>> daily submit (max) = 2
>>> deadline = 2010-09-20 00:00:00
>>> How many teams are there in this contest?
55
>>> How many submissions are there in total?
275
>>> List the number of submissions for the most active 5 teams:
TeamId
963     19
992     19
1033    18
957     16
1061    16
dtype: int64
