# Explore `meta-kaggle` Dataset

## --- Data Clean

Pre-requisites: 

1. Install Python package `kaggle`
2. [Create token](https://www.kaggle.com/docs/api) and 
3. Put the `kaggle.json` file to the `.kaggle` folder of your home directory
4. Download dataset `meta-kaggle` if haven't yet, by excuting: 
   ```python
   import api
   api.download_rawdata(rawdata_path)
   ```

In [1]:
# std library
import os
import json
from datetime import datetime, timedelta
from pprint import pprint

# third-party
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# local
from datafilter import table_filter
from leaderboard import Leaderboard

In [2]:
rawdata_path = os.path.abspath('./__rawdata__')

### 1. Mapping Datasets

Create a dataset of submissions, containing the following fields:
- Id 
- TeamId 
- CompetitionId 
- SubmissionDate
- PublicScore
- PrivateScore

In [3]:
# Submissions
# - TeamId
# - SubmissionDate
tbl_submissions = table_filter( \
	'Submissions', rawdata_path,
	fields = ['Id', 'TeamId', 'SubmissionDate', 'IsAfterDeadline',
		'PublicScoreLeaderboardDisplay', 'PrivateScoreFullPrecision'],
	fields_index = ['Id'],
	fields_datetime = ['SubmissionDate'],
)
# drop all submissions after deadlines
tbl_submissions = tbl_submissions[~tbl_submissions['IsAfterDeadline']]
tbl_submissions.drop('IsAfterDeadline', axis=1, inplace=True)
# rename long name
tbl_submissions.rename(columns={
	'PublicScoreLeaderboardDisplay': 'PublicScore',
	'PrivateScoreFullPrecision': 'PrivateScore'
}, inplace=True)
# drop duplicated
## One team can submit only once at a time point
tbl_submissions = tbl_submissions.drop_duplicates(subset=['SubmissionDate', 'TeamId'])

In [4]:
# Mapping: Teams
# - CompetitionId
map_teams_competitions = table_filter( \
	'Teams', rawdata_path,
	fields = ['Id', 'CompetitionId'],
	fields_index = ['Id'],
)

In [5]:
# Merge `Submissions` and `TeamsCompetition`
tbl_submissions = pd.merge(tbl_submissions, map_teams_competitions,
	how = 'left', left_on = 'TeamId', right_on = 'Id', sort = False,
	suffixes=('', '_Team'))
tbl_submissions.drop(columns='Id_Team', inplace=True)
tbl_submissions.dtypes

Id                         int64
TeamId                     int64
SubmissionDate    datetime64[ns]
PublicScore              float64
PrivateScore             float64
CompetitionId              int64
dtype: object

### 2. Splitting Contests by Types

In [6]:
# Table of Contests
tbl_contests = table_filter( \
	'Competitions', rawdata_path,
	fields=['Id',
		'EnabledDate', 'DeadlineDate', 'ProhibitNewEntrantsDeadlineDate',
		'TeamMergerDeadlineDate', 'TeamModelDeadlineDate', 'TeamModelDeadlineDate',
		'ModelSubmissionDeadlineDate',
		'HasLeaderboard', 'LeaderboardPercentage', 'MaxDailySubmissions',
		'RewardType', 'RewardQuantity', 'NumPrizes',
		'FinalLeaderboardHasBeenVerified'
	],
	fields_datetime=[
		'EnabledDate', 'DeadlineDate', 'ProhibitNewEntrantsDeadlineDate',
		'TeamMergerDeadlineDate', 'TeamModelDeadlineDate', 'TeamModelDeadlineDate',
		'ModelSubmissionDeadlineDate'
	],
	fields_index=['Id'],
)
len(tbl_contests)

9442

In [7]:
print('>>> Filter 1: How many contests are there having records of submissions from players?')
lst_contests_with_submissions = tbl_submissions['CompetitionId'].unique()
print(len(lst_contests_with_submissions))

# Filter 1
tbl_contests = tbl_contests[tbl_contests['Id'].isin(lst_contests_with_submissions)]
tbl_submissions = tbl_submissions.loc[tbl_submissions['CompetitionId'].isin(tbl_contests['Id'])]

>>> Filter 1: How many contests are there having records of submissions from players?
5693


In [8]:
print('>>> Filter 2: In above, how many contests are there having public leaderboard?')
lst_contests_with_leaderboard = tbl_contests.loc[tbl_contests['HasLeaderboard']==True]['Id']
print(len(lst_contests_with_submissions))

# Filter 2
tbl_contests = tbl_contests[tbl_contests['Id'].isin(lst_contests_with_leaderboard)]
tbl_submissions = tbl_submissions.loc[tbl_submissions['CompetitionId'].isin(tbl_contests['Id'])]

>>> Filter 2: In above, how many contests are there having public leaderboard?
5693


In [9]:
print('>>> Is there never-ending contests?')
forever_date = datetime(2029, 1, 1, 0, 0, 0)
print((tbl_contests['DeadlineDate'] > forever_date).sum())
print((tbl_contests['DeadlineDate'].isna()).sum())

>>> Is there never-ending contests?
0
0


In [10]:
print('>>> How many reward types are there?')
print(tbl_contests['RewardType'].value_counts(dropna=False))

>>> How many reward types are there?
RewardType
NaN          4777
USD           407
Knowledge     381
Swag           95
Jobs           14
Kudos          10
Prizes          6
EUR             1
Name: count, dtype: int64


In [11]:
print('>>> List of the number of contests providing multiple prize:')
pprint(tbl_contests['NumPrizes'].value_counts())

>>> List of the number of contests providing multiple prize:
NumPrizes
1     4721
0      498
3      292
5       82
4       28
10      18
6       16
2       11
8       10
7        9
9        4
13       1
12       1
Name: count, dtype: int64


In [12]:
# Split contests with prize type
lst_contest_Usd = tbl_contests.loc[tbl_contests['RewardType']=='USD']['Id']
lst_contest_Knowledge = tbl_contests.loc[tbl_contests['RewardType']=='Knowledge']['Id']
lst_contest_Swag = tbl_contests.loc[tbl_contests['RewardType']=='Swag']['Id']
lst_contest_Kudos = tbl_contests.loc[tbl_contests['RewardType']=='Kudos']['Id']
lst_contest_EUR = tbl_contests.loc[tbl_contests['RewardType']=='EUR']['Id']

# Split contests with prize number
lst_contest_1_prize = tbl_contests.loc[tbl_contests['NumPrizes']==1]['Id']
lst_contest_2_prize = tbl_contests.loc[tbl_contests['NumPrizes']==2]['Id']
lst_contest_3_prize = tbl_contests.loc[tbl_contests['NumPrizes']==3]['Id']
lst_contest_4_prize = tbl_contests.loc[tbl_contests['NumPrizes']==4]['Id']
lst_contest_5_prize = tbl_contests.loc[tbl_contests['NumPrizes']==5]['Id']
lst_contest_6_prize = tbl_contests.loc[tbl_contests['NumPrizes']==6]['Id']
lst_contest_7_prize = tbl_contests.loc[tbl_contests['NumPrizes']==7]['Id']
lst_contest_8_prize = tbl_contests.loc[tbl_contests['NumPrizes']==8]['Id']
lst_contest_9_prize = tbl_contests.loc[tbl_contests['NumPrizes']==9]['Id']

### 3. Select contest providing single USD prize

In [13]:
# select
tbl_contest_1_Usd_prize = tbl_contests[tbl_contests['Id'].isin(lst_contest_Usd)]
tbl_contest_1_Usd_prize = tbl_contest_1_Usd_prize[tbl_contest_1_Usd_prize['Id'].isin(lst_contest_1_prize)]

# filter
tbl_contest_1_Usd_prize = tbl_contest_1_Usd_prize.loc[tbl_contest_1_Usd_prize['RewardQuantity'] > 0]
lst_contest_1_Usd_prize = tbl_contest_1_Usd_prize['Id'].values
len(lst_contest_1_Usd_prize)

42

In [14]:
print(lst_contest_1_Usd_prize)

[2435 2445 2448 2452 2454 2464 2467 2478 2479 2487 2488 2496 2549 2589
 2762 2860 2895 2963 3065 3294 3364 3370 3377 3385 3386 3469 3493 3507
 3521 3526 3586 3706 3867 3928 3973 3984 4195 4378 4383 4493 4495 4704]


#### Contest `2435` for example

#### (1) Contest Setting

In [60]:
def contest_basic_info(contest_id: int):
	tbl_contest_info = tbl_contests.loc[tbl_contests['Id'] == contest_id]
	# deadline
	deadline = tbl_contest_info['DeadlineDate'].iloc[0].to_pydatetime()
	deadline = deadline.replace(hour=0, minute=0, second=0, microsecond=0)
	deadline += timedelta(days=1)
	return deadline

In [61]:
deadline = contest_basic_info(2435)

#### (2) Leaderboard

In [58]:
def contest_basic_submission_info(contest_id: int) -> pd.DataFrame:
	tbl = tbl_submissions.loc[tbl_submissions['CompetitionId']==contest_id]
	print('>>> How many teams are there in this contest?')
	print(tbl['TeamId'].unique().size)
	print('>>> How many submissions are there in total?')
	print(len(tbl))
	print('>>> List the number of submissions for the most active 5 teams:')
	print(tbl.groupby('TeamId').size().sort_values(ascending=False).head(5))
	return tbl

def randomize_within_day(group: pd.Series, seed = 1234):
	rgn = np.random.default_rng(seed)
	seconds = rgn.choice(range(86400), size=len(group), replace=False)
	seconds.sort()
	randomized_times = group + pd.to_timedelta(seconds, unit='s')
	return randomized_times

def leaderboard_fulfill(tbl_contest_submissions, deadline: datetime):
	leaderboard_public = Leaderboard()
	leaderboard_private = Leaderboard()
	tbl_contest_submissions_sorted = tbl_contest_submissions.sort_values(by='SubmissionDate')
	submission_dates = tbl_contest_submissions_sorted['SubmissionDate']
	submission_datetimes = submission_dates.groupby(submission_dates).apply(randomize_within_day).reset_index(level=0, drop=True)
	for idx_row, row in tbl_contest_submissions_sorted.iterrows():
		time = submission_datetimes[idx_row].to_pydatetime()
		if time > deadline:
			break
		team_id = row['TeamId']
		score_pub = row['PublicScore']
		score_pri = row['PrivateScore']
		leaderboard_public.refresh(time, team_id, score_pub)
		leaderboard_private.refresh(time, team_id, score_pri)
	return leaderboard_public, leaderboard_private

In [59]:
# Display basic info
tbl_submissions_2435 = contest_basic_submission_info(2435)

# Create leaderboard
leaderboard_pub, leaderboard_pri = leaderboard_fulfill(tbl_submissions_2435, deadline)

>>> How many teams are there in this contest?
107
>>> How many submissions are there in total?
361
>>> List the number of submissions for the most active 5 teams:
TeamId
751    23
697    23
788    19
752    18
737    18
dtype: int64


#### Select the 2 Strongest Players

In [19]:
_, tbl_submissions_2435_pri_top = leaderboard_pri.display(-1, 5)
print(tbl_submissions_2435_pri_top)

   rank      score  submit_count
0   788  77.312103            19
1   751  76.156097            23
2   552  70.375702             8
3   509  70.231201             7
4   690  69.075104            11


We select the top 2 in private leaderboard.

#### Save Contest Data to `.json` File

In [20]:
# print(leaderboard_pub.submission_records_of(751))
# print(leaderboard_pub.submission_records_of(788))

In [21]:
tbl_hat_y = leaderboard_pub.real_time_gap_between(788, 751, delta=timedelta(hours=1))

In [22]:
leaderboard_pub.last_submission_time()

datetime.datetime(2010, 8, 2, 23, 42, 25)

In [23]:
tbl_hat_y

Unnamed: 0,time,hat_x_i,hat_x_j,hat_y
0,2010-07-16 00:00:00,66.82689,53.84619,12.9807
1,2010-07-16 01:00:00,66.82689,53.84619,12.9807
2,2010-07-16 02:00:00,66.82689,53.84619,12.9807
3,2010-07-16 03:00:00,66.82689,53.84619,12.9807
4,2010-07-16 04:00:00,66.82689,53.84619,12.9807
...,...,...,...,...
428,2010-08-02 20:00:00,73.07689,94.23079,-21.1539
429,2010-08-02 21:00:00,73.07689,94.23079,-21.1539
430,2010-08-02 22:00:00,73.07689,94.23079,-21.1539
431,2010-08-02 23:00:00,73.07689,94.23079,-21.1539


In [78]:
def save_contest_data(contest_id, team_i_id: int, team_j_id: int):
	# json file & utils
	wd = os.getcwd()
	wd_synthetic_data = os.path.join(wd, f'__jsondata__/contest_{contest_id}.json')
	roundint = lambda x: int(round(x))

	# contest setting
	contest_info = tbl_contests.loc[tbl_contests['Id'] == contest_id]
	param_theta = contest_info['RewardQuantity'].values[0] / 1000
	param_r = 15
	time_unit_2f = 1 / 24

	# observations
	contest_submissions = tbl_submissions.loc[tbl_submissions['CompetitionId']==contest_id]
	leaderboard_pub, _ = leaderboard_fulfill(contest_submissions, deadline)

	tbl_hat_y = leaderboard_pub.real_time_gap_between(team_i_id, team_j_id, delta=timedelta(hours=1))
	start_time: datetime = tbl_hat_y['time'].iloc[0].to_pydatetime() - timedelta(hours=1)
	end_time: datetime = tbl_hat_y['time'].iloc[-1].to_pydatetime() + timedelta(hours=1)
	observed_gap_dynamic = tbl_hat_y['hat_y']

	observed_i_commits = leaderboard_pub.submission_records_of(team_i_id)['time']
	observed_i_commits = observed_i_commits.loc[observed_i_commits > start_time].tolist()
	observed_j_commits = leaderboard_pub.submission_records_of(team_j_id)['time']
	observed_j_commits = observed_j_commits.loc[observed_j_commits > start_time].tolist()

	with open(wd_synthetic_data, 'w') as f:
		json.dump({
			'theta': param_theta,
			'ratio': param_r,
			'N_Delta': roundint((end_time - start_time).total_seconds() / 3600),
			'Delta2f': time_unit_2f,
			'Ni': len(observed_i_commits),
			'Nj': len(observed_j_commits),
			'hat_t_i': [(dt - start_time).total_seconds() / 3600 for dt in observed_i_commits],
			'hat_t_j': [(dt - start_time).total_seconds() / 3600 for dt in observed_j_commits],
			'hat_y': observed_gap_dynamic.tolist(),
		}, f, indent=4)


In [79]:
save_contest_data(2435, 751, 788)