# Modified from https://www.kaggle.com/code/jtrotman/winning-team-submission-traces/notebook

In [4]:
import gc, os, sys, time
import pandas as pd, numpy as np
from unidecode import unidecode
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from IPython.display import HTML, display

In [59]:
# CSV_DIR = Path('..', 'input', 'meta-kaggle')
CSV_DIR = Path('D:/Academics/Research/IORA/Game Designer/Data Files/', 'input', 'meta-kaggle')
if not CSV_DIR.is_dir():
    CSV_DIR = Path('D:/Academics/Research/IORA/Game Designer/Data Files/', 'input')

def read_csv_filtered(csv, col, values):
    dfs = [df.loc[df[col].isin(values)]
           for df in pd.read_csv(CSV_DIR / csv, chunksize=100000, low_memory=False)]
    return pd.concat(dfs, axis=0)

comps = pd.read_csv(CSV_DIR / 'Competitions.csv').set_index('Id')
comps = comps.query("HostSegmentTitle != 'InClass'")
idx = comps.EvaluationAlgorithmName.isnull()
comps.loc[idx, 'EvaluationAlgorithmName'] = comps.loc[idx, 'EvaluationAlgorithmAbbreviation']

comps['EvaluationLabel'] = comps.EvaluationAlgorithmAbbreviation
idx = comps.EvaluationLabel.str.len() > 30
comps.loc[idx, 'EvaluationLabel'] = comps.loc[idx, 'EvaluationLabel'].str.replace(r'[^A-Z\d\-]', '', regex=True)

comps['DeadlineDate'] = pd.to_datetime(comps.DeadlineDate)
comps['EnabledDate'] = pd.to_datetime(comps.EnabledDate)
comps['DeadlineDateText'] = comps.DeadlineDate.dt.strftime('%c')
comps['EnabledDateText'] = comps.EnabledDate.dt.strftime('%c')
comps['Year'] = comps.DeadlineDate.dt.year
comps['RewardQuantity'].fillna('', inplace=True)
comps['Days'] = (comps.DeadlineDate - comps.EnabledDate) / pd.Timedelta(1, 'd')
comps['FinalWeek'] = (comps.DeadlineDate - pd.Timedelta(1, 'w'))

teams = read_csv_filtered('Teams.csv', 'CompetitionId', comps.index).set_index('Id')
# Just the winning teams (Change to different settings)
# teams = teams.query('PrivateLeaderboardRank==1').copy()
teams = teams.query('PrivateLeaderboardRank<=20').copy()

tmemb = read_csv_filtered('TeamMemberships.csv', 'TeamId', teams.index).set_index('Id')
users = read_csv_filtered('Users.csv', 'Id', tmemb.UserId)
tmemb = tmemb.merge(users, left_on='UserId', right_on='Id')

# Submissions
subs = read_csv_filtered('Submissions.csv', 'TeamId', tmemb.TeamId)
subs = subs.rename(columns={'PublicScoreFullPrecision': 'Public'})
subs = subs.rename(columns={'PrivateScoreFullPrecision': 'Private'})
subs['SubmissionDate'] = pd.to_datetime(subs.SubmissionDate)

asfloats = ['PublicScoreLeaderboardDisplay',
            'Public',
            'PrivateScoreLeaderboardDisplay',
            'Private',]

subs[asfloats] = subs[asfloats].astype(float)
# subs.IsAfterDeadline.mean()

subs = subs.query('not IsAfterDeadline').copy()
subs['CompetitionId'] = subs.TeamId.map(teams.CompetitionId)
subs['CompetitionSlug'] = subs.CompetitionId.map(comps.Slug)
subs['TeamName'] = subs.TeamId.map(teams.TeamName)

# subs['CompetitionId'].nunique()

# values some competitions use as invalid scores
for bad in [99, 999999]:
    for c in asfloats:
        idx = (subs[c] == bad)
        subs.loc[idx, c] = subs.loc[idx, c].replace({bad: np.nan})

# Display order: most recent competitions first
subs = subs.sort_values(['CompetitionId', 'Id'], ascending=[False, True])

In [58]:
teams

Unnamed: 0_level_0,CompetitionId,TeamLeaderId,TeamName,ScoreFirstSubmittedDate,LastSubmissionDate,PublicLeaderboardSubmissionId,PrivateLeaderboardSubmissionId,IsBenchmark,Medal,MedalAwardDate,PublicLeaderboardRank,PrivateLeaderboardRank
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
504,2435,727.0,IFM_bioinformatics,,05/12/2010,2203.0,2246.0,False,1.0,07/15/2016,13.0,9.0
505,2435,728.0,Amsterdam,,05/19/2010,2306.0,2308.0,False,2.0,07/15/2016,18.0,11.0
508,2435,745.0,chaos,,05/07/2010,2212.0,2214.0,False,1.0,07/15/2016,43.0,3.0
509,2435,703.0,Rajstennaj Barrabas,,08/02/2010,4616.0,4672.0,False,1.0,07/15/2016,1.0,4.0
512,2435,726.0,Team Ben,,05/10/2010,2235.0,2237.0,False,2.0,07/15/2016,32.0,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...
8424973,33104,175917.0,Patrick Blackwill,,04/26/2022,26247981.0,26247981.0,False,,,10.0,8.0
8470005,33104,5120783.0,Azzam Radman,,04/30/2022,26318982.0,26318982.0,False,,,5.0,2.0
8490165,33104,443651.0,VD Brothers,,04/30/2022,26317979.0,26317979.0,False,,,2.0,3.0
8497268,33104,2122611.0,Kaoru Honda,,04/30/2022,26316626.0,26320550.0,False,,,31.0,19.0


In [60]:
subs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 547931 entries, 10446484 to 185617
Data columns (total 14 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   Id                              547931 non-null  int64         
 1   SubmittedUserId                 547905 non-null  float64       
 2   TeamId                          547931 non-null  int64         
 3   SourceKernelVersionId           147722 non-null  float64       
 4   SubmissionDate                  547931 non-null  datetime64[ns]
 5   ScoreDate                       482526 non-null  object        
 6   IsAfterDeadline                 547931 non-null  bool          
 7   PublicScoreLeaderboardDisplay   539144 non-null  float64       
 8   Public                          539144 non-null  float64       
 9   PrivateScoreLeaderboardDisplay  535799 non-null  float64       
 10  Private                         535799 non-null  

In [150]:
comp_list = ['ranzcr-clip-catheter-line-classification', 
             'vinbigdata-chest-xray-abnormalities-detection', 
             'avito-demand-prediction',
             'talkingdata-adtracking-fraud-detection']

In [168]:
useful_cols = ['SubmissionDate',
               'Public',
               'Private',
               'TeamId',
               'TeamName']

for c in comp_list:
    cname = c.split('-')[0]
    globals()[cname] = subs.query(f"CompetitionSlug=='{c}'") # Raw DataFrame for the Competition
    df = globals()[cname]
    
    key_col = 'TeamId'
    last_day = df['SubmissionDate'].max()

    names = df[['TeamId', 'TeamName']].drop_duplicates().set_index('TeamId')
    score = df[df['SubmissionDate'] == last_day].groupby(key_col)['Private'].max().sort_values(ascending=False)

    submission = df.groupby(key_col)['SubmissionDate'].count().sort_values(ascending=False)
    submission.name = 'Submissions'

    first_day = df.groupby(key_col)['SubmissionDate'].min()
    first_day.name = 'FirstDay'

    last_day = df.groupby(key_col)['SubmissionDate'].max()
    last_day.name = 'LastDay'

    duration = last_day - first_day
    duration.name = 'Duration'

    summary = pd.concat([names, score, duration, first_day, last_day, submission],axis=1).sort_values(['Private', 'Duration', 'Submissions'], ascending=False)
    
    summary_name = cname + '_summary'
    globals()[summary_name] = summary # Summary of Rankings for the Competition
    # print(f'{summary_name} is saved')
    
    filtered_team = summary.index[:2]
    filtered_data = df[df['TeamId'].isin(filtered_team)][useful_cols]
    filtered_name = cname + '_filtered'
    globals()[filtered_name] = filtered_data
    print(f'{filtered_name} is saved')


ranzcr_filtered is saved
vinbigdata_filtered is saved
avito_filtered is saved
talkingdata_filtered is saved


In [147]:
ranzcr_summary

Unnamed: 0_level_0,TeamName,Private,Duration,FirstDay,LastDay,Submissions
TeamId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6013849,All Data Are Ext,0.976732,48 days,2021-01-27,2021-03-16,22
6012213,リーマン面を這う,0.97642,90 days,2020-12-16,2021-03-16,171
6013117,Preferred CLiP,0.976248,90 days,2020-12-16,2021-03-16,119
6125018,Watercooled,0.97564,59 days,2021-01-16,2021-03-16,116
6012303,Guanshuo Xu,0.975635,48 days,2021-01-27,2021-03-16,28
6052677,DatNT,0.975137,76 days,2020-12-30,2021-03-16,125
6012271,RaddbotnaKama 200d,0.975069,91 days,2020-12-15,2021-03-16,200
6015339,bestfitting,0.974953,47 days,2021-01-28,2021-03-16,139
6233219,toxu,0.974553,20 days,2021-02-24,2021-03-16,110
6043189,Point97 L.P.,0.974539,73 days,2021-01-02,2021-03-16,96


In [148]:
vinbigdata_summary 

Unnamed: 0_level_0,TeamName,Private,Duration,FirstDay,LastDay,Submissions
TeamId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6087561,SZI,0.311226,85 days,2021-01-04,2021-03-30,381
6090014,scumed,0.305267,81 days,2021-01-08,2021-03-30,448
6129731,fantastic_hirarin,0.30033,71 days,2021-01-18,2021-03-30,68
6087654,Kiet Chu,0.300266,78 days,2021-01-11,2021-03-30,140
6311035,Watercooled,0.299402,36 days,2021-02-22,2021-03-30,128
6095958,Guanshuo Xu,0.297697,7 days,2021-03-23,2021-03-30,8
6118346,ℳS²Ƒ,0.296918,86 days,2021-01-03,2021-03-30,366
6285350,CSM,0.296646,88 days,2021-01-01,2021-03-30,313
6410544,lung poem,0.295524,62 days,2021-01-27,2021-03-30,212
6088426,nvnn,0.292286,19 days,2021-03-11,2021-03-30,45


In [149]:
avito_summary

Unnamed: 0_level_0,TeamName,Private,Duration,FirstDay,LastDay,Submissions
TeamId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1627465,eprst,0.841143,62 days,2018-04-26,2018-06-27,270
1653268,SuperAnova,0.222464,63 days,2018-04-25,2018-06-27,294
1697339,Debut_Kele,0.221417,36 days,2018-05-22,2018-06-27,70
1621917,Dance Dance Convolution,0.220837,50 days,2018-05-08,2018-06-27,295
1632225,Korob Ok,0.220695,61 days,2018-04-27,2018-06-27,163
1630705,we had great fun,0.21983,63 days,2018-04-25,2018-06-27,319
1654684,Cortexlabs&3Rookies,0.219777,61 days,2018-04-27,2018-06-27,321
1745014,RAM,0.21977,48 days,2018-05-10,2018-06-27,131
1668965,Dancing Flamenco & BonOdodi with LB,0.219566,46 days,2018-05-12,2018-06-27,249
1774281,HAIR,0.219563,50 days,2018-05-08,2018-06-27,195


In [152]:
talkingdata_summary

Unnamed: 0_level_0,TeamName,Private,Duration,FirstDay,LastDay,Submissions
TeamId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1479247,"['flowlight', 'komaki'].shuffle()",0.984332,46 days,2018-03-22,2018-05-07,106
1462555,PPP is already in use,0.984126,57 days,2018-03-11,2018-05-07,146
1449364,bestfitting,0.984084,37 days,2018-03-31,2018-05-07,173
1512459,K.A.C.,0.984027,60 days,2018-03-08,2018-05-07,219
1485607,MMDP,0.984019,60 days,2018-03-08,2018-05-07,156
1499959,CPMP,0.983515,45 days,2018-03-23,2018-05-07,102
1460379,w&e,0.983448,29 days,2018-04-08,2018-05-07,71
1442367,[ods.ai] blenders,0.983376,60 days,2018-03-08,2018-05-07,238
1442366,Brute Force Attack,0.983271,61 days,2018-03-07,2018-05-07,145
1499394,Arsenal,0.983181,45 days,2018-03-23,2018-05-07,57


## For Loop Dev Testing

In [91]:
key_col = 'TeamId'
last_day = ranzcr['SubmissionDate'].max()

ranzcr_names = ranzcr[['TeamId', 'TeamName']].drop_duplicates().set_index('TeamId')
ranzcr_score = ranzcr[ranzcr['SubmissionDate'] == last_day].groupby(key_col)['Private'].max().sort_values(ascending=False)

In [157]:
ranzcr_pub_var = ranzcr.groupby(key_col)['Public'].var()
ranzcr_pub_var.name = 'PublicVar'

In [92]:
ranzcr_submission = ranzcr.groupby(key_col)['SubmissionDate'].count().sort_values(ascending=False)
ranzcr_submission.name = 'Submissions'

In [93]:
ranzcr_first_day = ranzcr.groupby(key_col)['SubmissionDate'].min()
ranzcr_first_day.name = 'FirstDay'

ranzcr_last_day = ranzcr.groupby(key_col)['SubmissionDate'].max()
ranzcr_last_day.name = 'LastDay'

ranzcr_duration = ranzcr_last_day - ranzcr_first_day
ranzcr_duration.name = 'Duration'

In [158]:
ranzcr_summary = pd.concat([ranzcr_names, ranzcr_score, ranzcr_pub_var, ranzcr_duration, ranzcr_first_day, ranzcr_last_day, ranzcr_submission],axis=1).sort_values(['Private', 'Duration', 'Submissions'], ascending=False)
# ranzcr_summary

In [159]:
ranzcr_summary

Unnamed: 0_level_0,TeamName,Private,PublicVar,Duration,FirstDay,LastDay,Submissions
TeamId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6013849,All Data Are Ext,0.976732,0.000584,48 days,2021-01-27,2021-03-16,22
6012213,リーマン面を這う,0.97642,0.002772,90 days,2020-12-16,2021-03-16,171
6013117,Preferred CLiP,0.976248,5.6e-05,90 days,2020-12-16,2021-03-16,119
6125018,Watercooled,0.97564,0.009351,59 days,2021-01-16,2021-03-16,116
6012303,Guanshuo Xu,0.975635,0.001052,48 days,2021-01-27,2021-03-16,28
6052677,DatNT,0.975137,1.1e-05,76 days,2020-12-30,2021-03-16,125
6012271,RaddbotnaKama 200d,0.975069,0.000176,91 days,2020-12-15,2021-03-16,200
6015339,bestfitting,0.974953,0.000335,47 days,2021-01-28,2021-03-16,139
6233219,toxu,0.974553,0.002803,20 days,2021-02-24,2021-03-16,110
6043189,Point97 L.P.,0.974539,2.7e-05,73 days,2021-01-02,2021-03-16,96


## Filtered Index Testing

In [165]:
ranzcr_filtered_team = ranzcr_summary.index[:2]
useful_cols = ['SubmissionDate',
               'Public',
               'Private',
               'TeamId',
               'TeamName']

In [166]:
ranzcr_filtered_data = ranzcr[ranzcr['TeamId'].isin(ranzcr_filtered_team)][useful_cols]

In [167]:
ranzcr_filtered_data

Unnamed: 0,SubmissionDate,Public,Private,TeamId,TeamName
8018725,2020-12-16,0.862191,0.870162,6012213,リーマン面を這う
8018726,2020-12-16,0.861558,0.868040,6012213,リーマン面を這う
8018844,2020-12-18,0.927493,0.935666,6012213,リーマン面を這う
8018845,2020-12-18,0.895836,0.898207,6012213,リーマン面を這う
8018721,2020-12-18,0.904338,0.909976,6012213,リーマン面を這う
...,...,...,...,...,...
8751234,2021-03-16,0.975586,0.976732,6013849,All Data Are Ext
8751235,2021-03-16,0.975616,0.976657,6013849,All Data Are Ext
8750940,2021-03-16,0.974120,0.976420,6012213,リーマン面を這う
8750941,2021-03-16,0.973660,0.975713,6012213,リーマン面を這う
