# Modified from https://www.kaggle.com/code/jtrotman/winning-team-submission-traces/notebook

In [1]:
import gc, os, sys, time
import pandas as pd, numpy as np
from unidecode import unidecode
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from IPython.display import HTML, display

### Original Code

In [2]:
# CSV_DIR = Path('..', 'input', 'meta-kaggle')
CSV_DIR = Path('D:/Academics/Research/IORA/Game Designer/Data Files/', 'input', 'meta-kaggle')
if not CSV_DIR.is_dir():
    CSV_DIR = Path('D:/Academics/Research/IORA/Game Designer/Data Files/', 'input')

def read_csv_filtered(csv, col, values):
    dfs = [df.loc[df[col].isin(values)]
           for df in pd.read_csv(CSV_DIR / csv, chunksize=100000, low_memory=False)]
    return pd.concat(dfs, axis=0)

comps = pd.read_csv(CSV_DIR / 'Competitions.csv').set_index('Id')
comps = comps.query("HostSegmentTitle != 'InClass'")
idx = comps.EvaluationAlgorithmName.isnull()
comps.loc[idx, 'EvaluationAlgorithmName'] = comps.loc[idx, 'EvaluationAlgorithmAbbreviation']

comps['EvaluationLabel'] = comps.EvaluationAlgorithmAbbreviation
idx = comps.EvaluationLabel.str.len() > 30
comps.loc[idx, 'EvaluationLabel'] = comps.loc[idx, 'EvaluationLabel'].str.replace(r'[^A-Z\d\-]', '', regex=True)

comps['DeadlineDate'] = pd.to_datetime(comps.DeadlineDate)
comps['EnabledDate'] = pd.to_datetime(comps.EnabledDate)
comps['DeadlineDateText'] = comps.DeadlineDate.dt.strftime('%c')
comps['EnabledDateText'] = comps.EnabledDate.dt.strftime('%c')
comps['Year'] = comps.DeadlineDate.dt.year
comps['RewardQuantity'].fillna('', inplace=True)
comps['Days'] = (comps.DeadlineDate - comps.EnabledDate) / pd.Timedelta(1, 'd')
comps['FinalWeek'] = (comps.DeadlineDate - pd.Timedelta(1, 'w'))

teams = read_csv_filtered('Teams.csv', 'CompetitionId', comps.index).set_index('Id')
# Just the winning teams (Change to different settings)
# teams = teams.query('PrivateLeaderboardRank==1').copy()
teams = teams.query('PrivateLeaderboardRank<=20').copy()

tmemb = read_csv_filtered('TeamMemberships.csv', 'TeamId', teams.index).set_index('Id')
users = read_csv_filtered('Users.csv', 'Id', tmemb.UserId)
tmemb = tmemb.merge(users, left_on='UserId', right_on='Id')

# Submissions
subs = read_csv_filtered('Submissions.csv', 'TeamId', tmemb.TeamId)
subs = subs.rename(columns={'PublicScoreFullPrecision': 'Public'})
subs = subs.rename(columns={'PrivateScoreFullPrecision': 'Private'})
subs['SubmissionDate'] = pd.to_datetime(subs.SubmissionDate)

asfloats = ['PublicScoreLeaderboardDisplay',
            'Public',
            'PrivateScoreLeaderboardDisplay',
            'Private',]

subs[asfloats] = subs[asfloats].astype(float)
# subs.IsAfterDeadline.mean()

subs = subs.query('not IsAfterDeadline').copy()
subs['CompetitionId'] = subs.TeamId.map(teams.CompetitionId)
subs['CompetitionSlug'] = subs.CompetitionId.map(comps.Slug)
subs['TeamName'] = subs.TeamId.map(teams.TeamName)

# subs['CompetitionId'].nunique()

# values some competitions use as invalid scores
for bad in [99, 999999]:
    for c in asfloats:
        idx = (subs[c] == bad)
        subs.loc[idx, c] = subs.loc[idx, c].replace({bad: np.nan})

# Display order: most recent competitions first
subs = subs.sort_values(['CompetitionId', 'Id'], ascending=[False, True])

### Modified Code
First specify the list of competitions by their slugs -- comp_list. The returned **subs** contain all submission info for the specified competitions

In [2]:
# Change the list of competitions of our choice
comp_list = ['ranzcr-clip-catheter-line-classification', 
             'vinbigdata-chest-xray-abnormalities-detection', 
             'avito-demand-prediction',
             'talkingdata-adtracking-fraud-detection'] 

In [3]:
# CSV_DIR = Path('..', 'input', 'meta-kaggle')
CSV_DIR = Path('D:/Academics/Research/IORA/Game Designer/Data Files/', 'input', 'meta-kaggle')
if not CSV_DIR.is_dir():
    CSV_DIR = Path('D:/Academics/Research/IORA/Game Designer/Data Files/', 'input')

def read_csv_filtered(csv, col, values):
    dfs = [df.loc[df[col].isin(values)]
           for df in pd.read_csv(CSV_DIR / csv, chunksize=100000, low_memory=False)]
    return pd.concat(dfs, axis=0)

comps = pd.read_csv(CSV_DIR / 'Competitions.csv').set_index('Id')
comps = comps.query("HostSegmentTitle != 'InClass'")
idx = comps.EvaluationAlgorithmName.isnull()
comps.loc[idx, 'EvaluationAlgorithmName'] = comps.loc[idx, 'EvaluationAlgorithmAbbreviation']

comps['EvaluationLabel'] = comps.EvaluationAlgorithmAbbreviation
idx = comps.EvaluationLabel.str.len() > 30
comps.loc[idx, 'EvaluationLabel'] = comps.loc[idx, 'EvaluationLabel'].str.replace(r'[^A-Z\d\-]', '', regex=True)

comps['DeadlineDate'] = pd.to_datetime(comps.DeadlineDate)
comps['EnabledDate'] = pd.to_datetime(comps.EnabledDate)
comps['DeadlineDateText'] = comps.DeadlineDate.dt.strftime('%c')
comps['EnabledDateText'] = comps.EnabledDate.dt.strftime('%c')
comps['Year'] = comps.DeadlineDate.dt.year
comps['RewardQuantity'].fillna('', inplace=True)
comps['Days'] = (comps.DeadlineDate - comps.EnabledDate) / pd.Timedelta(1, 'd')
comps['FinalWeek'] = (comps.DeadlineDate - pd.Timedelta(1, 'w'))
comp_id = comps[comps['Slug'].isin(comp_list)].index # Competition ID of our choice

teams = read_csv_filtered('Teams.csv', 'CompetitionId', comp_id).set_index('Id') # Teams that participate in these competitions
tmemb = read_csv_filtered('TeamMemberships.csv', 'TeamId', teams.index).set_index('Id')
users = read_csv_filtered('Users.csv', 'Id', tmemb.UserId)
tmemb = tmemb.merge(users, left_on='UserId', right_on='Id') # Some further cleaning of teams

# Submissions
subs = read_csv_filtered('Submissions.csv', 'TeamId', tmemb.TeamId) # Submission of these teams
subs = subs.rename(columns={'PublicScoreFullPrecision': 'Public'})
subs = subs.rename(columns={'PrivateScoreFullPrecision': 'Private'})
subs['SubmissionDate'] = pd.to_datetime(subs.SubmissionDate)

asfloats = ['PublicScoreLeaderboardDisplay',
            'Public',
            'PrivateScoreLeaderboardDisplay',
            'Private',]

subs[asfloats] = subs[asfloats].astype(float)

subs = subs.query('not IsAfterDeadline').copy()
subs['CompetitionId'] = subs.TeamId.map(teams.CompetitionId)
subs['CompetitionSlug'] = subs.CompetitionId.map(comps.Slug)
subs['TeamName'] = subs.TeamId.map(teams.TeamName)

# values some competitions use as invalid scores
for bad in [99, 999999]:
    for c in asfloats:
        idx = (subs[c] == bad)
        subs.loc[idx, c] = subs.loc[idx, c].replace({bad: np.nan})

# Display order: most recent competitions first
subs = subs.sort_values(['CompetitionId', 'Id'], ascending=[False, True])

In [4]:
teams.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61031 entries, 1370425 to 8605603
Data columns (total 12 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   CompetitionId                   61031 non-null  int64  
 1   TeamLeaderId                    61023 non-null  float64
 2   TeamName                        61024 non-null  object 
 3   ScoreFirstSubmittedDate         0 non-null      float64
 4   LastSubmissionDate              8675 non-null   object 
 5   PublicLeaderboardSubmissionId   8637 non-null   float64
 6   PrivateLeaderboardSubmissionId  8637 non-null   float64
 7   IsBenchmark                     61031 non-null  bool   
 8   Medal                           881 non-null    float64
 9   MedalAwardDate                  886 non-null    object 
 10  PublicLeaderboardRank           8633 non-null   float64
 11  PrivateLeaderboardRank          8633 non-null   float64
dtypes: bool(1), float64(7), 

In [5]:
subs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 166402 entries, 8103804 to 3998737
Data columns (total 14 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   Id                              166402 non-null  int64         
 1   SubmittedUserId                 166402 non-null  float64       
 2   TeamId                          166402 non-null  int64         
 3   SourceKernelVersionId           55221 non-null   float64       
 4   SubmissionDate                  166402 non-null  datetime64[ns]
 5   ScoreDate                       164874 non-null  object        
 6   IsAfterDeadline                 166402 non-null  bool          
 7   PublicScoreLeaderboardDisplay   164986 non-null  float64       
 8   Public                          164986 non-null  float64       
 9   PrivateScoreLeaderboardDisplay  164986 non-null  float64       
 10  Private                         164986 non-null  

### This chunk is to save data (globally) that 
1. summarize relevant information of respective competitions (**XXX_summary**), and 
2. filter the teams that satisfy our criteria (**XXX_filtered**)

In [6]:
useful_cols = ['SubmissionDate',
               'Public',
               'Private',
               'TeamId',
               'TeamName']

for c in comp_list:
    cname = c.split('-')[0]
    df = globals()[cname] = subs.query(f"CompetitionSlug=='{c}'") # Raw DataFrame for the Competition
    
    key_col = 'TeamId'
    last_day = df['SubmissionDate'].max()

    names = df[['TeamId', 'TeamName']].drop_duplicates().set_index('TeamId')
    score = df[df['SubmissionDate'] == last_day].groupby(key_col)['Private'].max().sort_values(ascending=False)

    submission = df.groupby(key_col)['SubmissionDate'].count().sort_values(ascending=False)
    submission.name = 'Submissions'

    first_day = df.groupby(key_col)['SubmissionDate'].min()
    first_day.name = 'FirstDay'

    last_day = df.groupby(key_col)['SubmissionDate'].max()
    last_day.name = 'LastDay'

    duration = last_day - first_day
    duration.name = 'Duration'

    summary = pd.concat([names, score, duration, first_day, last_day, submission],axis=1).sort_values(['Private', 'Duration', 'Submissions'], ascending=False)
    summary = summary.dropna(axis=0)
    
    summary_name = cname + '_summary'
    globals()[summary_name] = summary # Summary of Rankings for the Competition
    # print(f'{summary_name} is saved')
    
    filtered_team = summary.index[:2] # Need Modification for our criteria
    filtered_data = df[df['TeamId'].isin(filtered_team)][useful_cols]
    filtered_name = cname + '_filtered'
    globals()[filtered_name] = filtered_data
    print(f'{filtered_name} is saved')

ranzcr_filtered is saved
vinbigdata_filtered is saved
avito_filtered is saved
talkingdata_filtered is saved


### Summary Saved

In [7]:
ranzcr_summary

Unnamed: 0_level_0,TeamName,Private,Duration,FirstDay,LastDay,Submissions
TeamId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6013849,All Data Are Ext,0.976732,48 days,2021-01-27,2021-03-16,22
6012213,リーマン面を這う,0.976420,90 days,2020-12-16,2021-03-16,171
6013117,Preferred CLiP,0.976248,90 days,2020-12-16,2021-03-16,119
6125018,Watercooled,0.975640,59 days,2021-01-16,2021-03-16,116
6012303,Guanshuo Xu,0.975635,48 days,2021-01-27,2021-03-16,28
...,...,...,...,...,...,...
6361237,Turing526,0.500000,10 days,2021-03-06,2021-03-16,20
6334897,polaris36,0.500000,3 days,2021-03-13,2021-03-16,19
6368988,zhiming25,0.500000,1 days,2021-03-15,2021-03-16,10
6325273,Berkay Yzeka,0.500000,0 days,2021-03-16,2021-03-16,1


In [8]:
vinbigdata_summary 

Unnamed: 0_level_0,TeamName,Private,Duration,FirstDay,LastDay,Submissions
TeamId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6087561,SZI,0.311226,85 days,2021-01-04,2021-03-30,381
6090014,scumed,0.305267,81 days,2021-01-08,2021-03-30,448
6129731,fantastic_hirarin,0.300330,71 days,2021-01-18,2021-03-30,68
6087654,Kiet Chu,0.300266,78 days,2021-01-11,2021-03-30,140
6311035,Watercooled,0.299402,36 days,2021-02-22,2021-03-30,128
...,...,...,...,...,...,...
6214791,Cetian Liu,0.033298,45 days,2021-02-13,2021-03-30,16
6186098,zen xan,0.018128,43 days,2021-02-15,2021-03-30,33
6323121,E2 NextGen,0.001466,2 days,2021-03-28,2021-03-30,8
6092531,Elga,0.000295,37 days,2021-02-21,2021-03-30,10


In [9]:
avito_summary

Unnamed: 0_level_0,TeamName,Private,Duration,FirstDay,LastDay,Submissions
TeamId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1627465,eprst,0.841143,62 days,2018-04-26,2018-06-27,270
1781096,Charles Pieri,0.765664,5 days,2018-06-22,2018-06-27,10
1667967,Yuri055,0.577212,50 days,2018-05-08,2018-06-27,27
1684582,OlegLapin,0.478266,0 days,2018-06-27,2018-06-27,2
1681298,B,0.458187,32 days,2018-05-26,2018-06-27,33
...,...,...,...,...,...,...
1665104,Dmitry Larko,0.218630,50 days,2018-05-08,2018-06-27,64
1634704,Light in June,0.218617,62 days,2018-04-26,2018-06-27,231
1623501,wave in the distance at the top,0.218230,49 days,2018-05-09,2018-06-27,148
1624500,Song and Dance Ensemble,0.217643,63 days,2018-04-25,2018-06-27,207


In [10]:
talkingdata_summary

Unnamed: 0_level_0,TeamName,Private,Duration,FirstDay,LastDay,Submissions
TeamId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1479247,"['flowlight', 'komaki'].shuffle()",0.984332,46 days,2018-03-22,2018-05-07,106
1462555,PPP is already in use,0.984126,57 days,2018-03-11,2018-05-07,146
1449364,bestfitting,0.984084,37 days,2018-03-31,2018-05-07,173
1512459,K.A.C.,0.984027,60 days,2018-03-08,2018-05-07,219
1485607,MMDP,0.984019,60 days,2018-03-08,2018-05-07,156
...,...,...,...,...,...,...
1567509,Yulong,0.500000,0 days,2018-05-07,2018-05-07,1
1620924,mourad bahani,0.500000,0 days,2018-05-07,2018-05-07,1
1520369,herici,0.481827,29 days,2018-04-08,2018-05-07,31
1453675,YILANNN,0.429972,0 days,2018-05-07,2018-05-07,1


### Data Saved

In [11]:
ranzcr_filtered

Unnamed: 0,SubmissionDate,Public,Private,TeamId,TeamName
8018725,2020-12-16,0.862191,0.870162,6012213,リーマン面を這う
8018726,2020-12-16,0.861558,0.868040,6012213,リーマン面を這う
8018844,2020-12-18,0.927493,0.935666,6012213,リーマン面を這う
8018845,2020-12-18,0.895836,0.898207,6012213,リーマン面を這う
8018721,2020-12-18,0.904338,0.909976,6012213,リーマン面を這う
...,...,...,...,...,...
8751234,2021-03-16,0.975586,0.976732,6013849,All Data Are Ext
8751235,2021-03-16,0.975616,0.976657,6013849,All Data Are Ext
8750940,2021-03-16,0.974120,0.976420,6012213,リーマン面を這う
8750941,2021-03-16,0.973660,0.975713,6012213,リーマン面を這う


In [12]:
vinbigdata_filtered

Unnamed: 0,SubmissionDate,Public,Private,TeamId,TeamName
8183501,2021-01-04,0.164498,0.159467,6087561,SZI
8183502,2021-01-06,0.124063,0.146764,6087561,SZI
8183503,2021-01-06,0.117363,0.133969,6087561,SZI
8380861,2021-01-07,0.172138,0.167738,6087561,SZI
8380862,2021-01-07,0.160037,0.164868,6087561,SZI
...,...,...,...,...,...
8705991,2021-03-30,0.316657,0.291157,6090014,scumed
8628181,2021-03-30,0.296349,0.305256,6087561,SZI
8628179,2021-03-30,0.287484,0.299893,6087561,SZI
8628183,2021-03-30,0.296358,0.304209,6087561,SZI


In [13]:
avito_filtered

Unnamed: 0,SubmissionDate,Public,Private,TeamId,TeamName
3740838,2018-04-26,0.283540,0.286429,1627465,eprst
3740839,2018-04-26,0.236018,0.239847,1627465,eprst
3741243,2018-04-26,0.236021,0.239831,1627465,eprst
3740837,2018-04-26,0.236192,0.240035,1627465,eprst
3740840,2018-04-26,0.318184,0.320441,1627465,eprst
...,...,...,...,...,...
3743621,2018-06-27,0.769680,0.765664,1781096,Charles Pieri
3741250,2018-06-27,0.215835,0.219503,1627465,eprst
3741255,2018-06-27,0.225301,0.229674,1627465,eprst
3741256,2018-06-27,0.226653,0.229937,1627465,eprst


In [14]:
talkingdata_filtered

Unnamed: 0,SubmissionDate,Public,Private,TeamId,TeamName
3428010,2018-03-11,0.963049,0.962580,1462555,PPP is already in use
3428005,2018-03-11,0.954556,0.955195,1462555,PPP is already in use
3428006,2018-03-13,0.960498,0.962559,1462555,PPP is already in use
3428007,2018-03-13,0.964039,0.963028,1462555,PPP is already in use
3428008,2018-03-13,0.962027,0.963548,1462555,PPP is already in use
...,...,...,...,...,...
3968958,2018-05-07,0.983438,0.984322,1479247,"['flowlight', 'komaki'].shuffle()"
3923700,2018-05-07,0.983309,0.984113,1462555,PPP is already in use
3923701,2018-05-07,0.983317,0.984126,1462555,PPP is already in use
3923698,2018-05-07,0.983334,0.984014,1462555,PPP is already in use


### Data Manipulation for Paper Requirements

In [15]:
vinbigdata_filtered

Unnamed: 0,SubmissionDate,Public,Private,TeamId,TeamName
8183501,2021-01-04,0.164498,0.159467,6087561,SZI
8183502,2021-01-06,0.124063,0.146764,6087561,SZI
8183503,2021-01-06,0.117363,0.133969,6087561,SZI
8380861,2021-01-07,0.172138,0.167738,6087561,SZI
8380862,2021-01-07,0.160037,0.164868,6087561,SZI
...,...,...,...,...,...
8705991,2021-03-30,0.316657,0.291157,6090014,scumed
8628181,2021-03-30,0.296349,0.305256,6087561,SZI
8628179,2021-03-30,0.287484,0.299893,6087561,SZI
8628183,2021-03-30,0.296358,0.304209,6087561,SZI


In [45]:
vinbigdata_filtered['t'] = vinbigdata_filtered.SubmissionDate.dt.week
week0 = vinbigdata_filtered['t'].min() - 1
vinbigdata_filtered['t'] = vinbigdata_filtered['t'] - week0
print(vinbigdata_filtered.t.max())
print(vinbigdata_filtered.t.min())

13
1


  """Entry point for launching an IPython kernel.


In [51]:
q_x = vinbigdata_filtered.groupby(by=['TeamId', 't']).agg(q =pd.NamedAgg('SubmissionDate', aggfunc='count'),
                                       x = pd.NamedAgg('Private', aggfunc='max'),
                                       public = pd.NamedAgg('Public', aggfunc='max'))
# q_x.reset_index().sort_values(['TeamId', 't'])
q_x

Unnamed: 0_level_0,Unnamed: 1_level_0,q,x,public
TeamId,t,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6087561,1,10,0.186076,0.172138
6087561,2,17,0.206282,0.16844
6087561,3,35,0.286489,0.284101
6087561,4,51,0.289421,0.282399
6087561,5,28,0.289577,0.300183
6087561,6,32,0.296527,0.301066
6087561,7,32,0.301086,0.309335
6087561,8,31,0.304616,0.335853
6087561,9,35,0.310151,0.341758
6087561,10,30,0.310412,0.342794


In [79]:
id_list = np.unique([x[0] for x in q_x.index])

q_i = q_x.loc[id_list[0]]['q']
q_j = q_x.loc[id_list[1]]['q']

x_i = q_x.loc[id_list[0]]['x']
x_j = q_x.loc[id_list[1]]['x']

y_i = x_i - x_j
y_j = x_j - x_i

Z = q_x.loc[id_list[0]]['public'] - q_x.loc[id_list[1]]['public']
t = np.arange(q_i.size) + 1
# t = t.astype('int64')
cols = ['q_i', 'q_j', 'x_i', 'x_j', 'y_i', 'y_j', 'Z']

cleaned_data = pd.DataFrame(data=np.array([q_i, q_j, x_i, x_j, y_i, y_j, Z]).T, columns=cols)
cleaned_data['t'] = t
cleaned_data

Unnamed: 0,q_i,q_j,x_i,x_j,y_i,y_j,Z,t
0,10.0,3.0,0.186076,0.0971,0.088976,-0.088976,0.072809,1
1,17.0,44.0,0.206282,0.221564,-0.015282,0.015282,-0.067411,2
2,35.0,35.0,0.286489,0.22935,0.057139,-0.057139,0.026582,3
3,51.0,57.0,0.289421,0.268236,0.021185,-0.021185,0.010364,4
4,28.0,55.0,0.289577,0.272836,0.016741,-0.016741,0.019955,5
5,32.0,45.0,0.296527,0.265559,0.030969,-0.030969,0.022077,6
6,32.0,40.0,0.301086,0.253794,0.047292,-0.047292,0.027714,7
7,31.0,26.0,0.304616,0.264398,0.040218,-0.040218,0.085003,8
8,35.0,35.0,0.310151,0.299296,0.010855,-0.010855,0.020397,9
9,30.0,31.0,0.310412,0.298688,0.011724,-0.011724,0.002285,10


In [86]:
cleaned_data.to_csv('vinbigdata_cleaned.csv', index=False)

#### Diff data (can be done in R as well, try to keep raw data at this stage)

In [84]:
nabla_cols = cols[2:]
nabla_names = ['diff_' + x for x in nabla_cols]

In [85]:
nabla_df = cleaned_data[nabla_cols].diff().dropna()
nabla_df.columns = nabla_names
nabla_df

Unnamed: 0,diff_x_i,diff_x_j,diff_y_i,diff_y_j,diff_Z
1,0.020206,0.124464,-0.104258,0.104258,-0.14022
2,0.080207,0.007786,0.07242,-0.07242,0.093993
3,0.002932,0.038886,-0.035954,0.035954,-0.016218
4,0.000156,0.0046,-0.004444,0.004444,0.00959
5,0.00695,-0.007277,0.014228,-0.014228,0.002122
6,0.004559,-0.011765,0.016324,-0.016324,0.005637
7,0.00353,0.010605,-0.007075,0.007075,0.057289
8,0.005535,0.034898,-0.029363,0.029363,-0.064606
9,0.000261,-0.000608,0.000869,-0.000869,-0.018112
10,-0.001042,0.006851,-0.007893,0.007893,0.003894


## For Loop Dev Testing

In [91]:
key_col = 'TeamId'
last_day = ranzcr['SubmissionDate'].max()

ranzcr_names = ranzcr[['TeamId', 'TeamName']].drop_duplicates().set_index('TeamId')
ranzcr_score = ranzcr[ranzcr['SubmissionDate'] == last_day].groupby(key_col)['Private'].max().sort_values(ascending=False)

In [157]:
ranzcr_pub_var = ranzcr.groupby(key_col)['Public'].var()
ranzcr_pub_var.name = 'PublicVar'

In [92]:
ranzcr_submission = ranzcr.groupby(key_col)['SubmissionDate'].count().sort_values(ascending=False)
ranzcr_submission.name = 'Submissions'

In [93]:
ranzcr_first_day = ranzcr.groupby(key_col)['SubmissionDate'].min()
ranzcr_first_day.name = 'FirstDay'

ranzcr_last_day = ranzcr.groupby(key_col)['SubmissionDate'].max()
ranzcr_last_day.name = 'LastDay'

ranzcr_duration = ranzcr_last_day - ranzcr_first_day
ranzcr_duration.name = 'Duration'

In [158]:
ranzcr_summary = pd.concat([ranzcr_names, ranzcr_score, ranzcr_pub_var, ranzcr_duration, ranzcr_first_day, ranzcr_last_day, ranzcr_submission],axis=1).sort_values(['Private', 'Duration', 'Submissions'], ascending=False)
# ranzcr_summary

In [159]:
ranzcr_summary

Unnamed: 0_level_0,TeamName,Private,PublicVar,Duration,FirstDay,LastDay,Submissions
TeamId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6013849,All Data Are Ext,0.976732,0.000584,48 days,2021-01-27,2021-03-16,22
6012213,リーマン面を這う,0.97642,0.002772,90 days,2020-12-16,2021-03-16,171
6013117,Preferred CLiP,0.976248,5.6e-05,90 days,2020-12-16,2021-03-16,119
6125018,Watercooled,0.97564,0.009351,59 days,2021-01-16,2021-03-16,116
6012303,Guanshuo Xu,0.975635,0.001052,48 days,2021-01-27,2021-03-16,28
6052677,DatNT,0.975137,1.1e-05,76 days,2020-12-30,2021-03-16,125
6012271,RaddbotnaKama 200d,0.975069,0.000176,91 days,2020-12-15,2021-03-16,200
6015339,bestfitting,0.974953,0.000335,47 days,2021-01-28,2021-03-16,139
6233219,toxu,0.974553,0.002803,20 days,2021-02-24,2021-03-16,110
6043189,Point97 L.P.,0.974539,2.7e-05,73 days,2021-01-02,2021-03-16,96


## Filtered Index Testing

In [165]:
ranzcr_filtered_team = ranzcr_summary.index[:2]
useful_cols = ['SubmissionDate',
               'Public',
               'Private',
               'TeamId',
               'TeamName']

In [166]:
ranzcr_filtered_data = ranzcr[ranzcr['TeamId'].isin(ranzcr_filtered_team)][useful_cols]

In [167]:
ranzcr_filtered_data

Unnamed: 0,SubmissionDate,Public,Private,TeamId,TeamName
8018725,2020-12-16,0.862191,0.870162,6012213,リーマン面を這う
8018726,2020-12-16,0.861558,0.868040,6012213,リーマン面を這う
8018844,2020-12-18,0.927493,0.935666,6012213,リーマン面を這う
8018845,2020-12-18,0.895836,0.898207,6012213,リーマン面を這う
8018721,2020-12-18,0.904338,0.909976,6012213,リーマン面を這う
...,...,...,...,...,...
8751234,2021-03-16,0.975586,0.976732,6013849,All Data Are Ext
8751235,2021-03-16,0.975616,0.976657,6013849,All Data Are Ext
8750940,2021-03-16,0.974120,0.976420,6012213,リーマン面を這う
8750941,2021-03-16,0.973660,0.975713,6012213,リーマン面を這う
