In [5]:
import gc, os, sys, time
import pandas as pd, numpy as np
from unidecode import unidecode
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from IPython.display import HTML, display
import warnings
warnings.filterwarnings("ignore")

In [2]:
n_comp = 100 # Specify the number of competitions

# CSV_DIR = Path('..', 'input', 'meta-kaggle')
CSV_DIR = Path('D:/Academics/Research/IORA/Game Designer/Data Files/', 'input', 'meta-kaggle')
if not CSV_DIR.is_dir():
    CSV_DIR = Path('D:/Academics/Research/IORA/Game Designer/Data Files/', 'input')

def read_csv_filtered(csv, col, values):
    dfs = [df.loc[df[col].isin(values)]
           for df in pd.read_csv(CSV_DIR / csv, chunksize=100000, low_memory=False)]
    return pd.concat(dfs, axis=0)

comps = pd.read_csv(CSV_DIR / 'Competitions.csv').set_index('Id')
comps = comps.query("HostSegmentTitle != 'InClass'")
idx = comps.EvaluationAlgorithmName.isnull()
comps.loc[idx, 'EvaluationAlgorithmName'] = comps.loc[idx, 'EvaluationAlgorithmAbbreviation']

comps['EvaluationLabel'] = comps.EvaluationAlgorithmAbbreviation
idx = comps.EvaluationLabel.str.len() > 30
comps.loc[idx, 'EvaluationLabel'] = comps.loc[idx, 'EvaluationLabel'].str.replace(r'[^A-Z\d\-]', '', regex=True)

comps['DeadlineDate'] = pd.to_datetime(comps.DeadlineDate)
comps['EnabledDate'] = pd.to_datetime(comps.EnabledDate)
comps['DeadlineDateText'] = comps.DeadlineDate.dt.strftime('%c')
comps['EnabledDateText'] = comps.EnabledDate.dt.strftime('%c')
comps['Year'] = comps.DeadlineDate.dt.year
comps['RewardQuantity'].fillna('', inplace=True)
comps['Days'] = (comps.DeadlineDate - comps.EnabledDate) / pd.Timedelta(1, 'd')
comps['FinalWeek'] = (comps.DeadlineDate - pd.Timedelta(1, 'w'))

comp_id = comps.sort_values(by=['TotalSubmissions'], ascending=False).index[:n_comp] # Top n competitions ranked by total number of submissions (hotness)

teams = read_csv_filtered('Teams.csv', 'CompetitionId', comp_id).set_index('Id') # Teams that participate in these competitions
tmemb = read_csv_filtered('TeamMemberships.csv', 'TeamId', teams.index).set_index('Id')
users = read_csv_filtered('Users.csv', 'Id', tmemb.UserId)
tmemb = tmemb.merge(users, left_on='UserId', right_on='Id') # Some further cleaning of teams

# Submissions
subs = read_csv_filtered('Submissions.csv', 'TeamId', tmemb.TeamId) # Submission of these teams
subs = subs.rename(columns={'PublicScoreFullPrecision': 'Public'})
subs = subs.rename(columns={'PrivateScoreFullPrecision': 'Private'})
subs['SubmissionDate'] = pd.to_datetime(subs.SubmissionDate)

asfloats = ['PublicScoreLeaderboardDisplay',
            'Public',
            'PrivateScoreLeaderboardDisplay',
            'Private',]

subs[asfloats] = subs[asfloats].astype(float)

subs = subs.query('not IsAfterDeadline').copy()
subs['CompetitionId'] = subs.TeamId.map(teams.CompetitionId)
subs['CompetitionSlug'] = subs.CompetitionId.map(comps.Slug)
subs['TeamName'] = subs.TeamId.map(teams.TeamName)

# values some competitions use as invalid scores
for bad in [99, 999999]:
    for c in asfloats:
        idx = (subs[c] == bad)
        subs.loc[idx, c] = subs.loc[idx, c].replace({bad: np.nan})

# Display order: most recent competitions first
subs = subs.sort_values(['CompetitionId', 'Id'], ascending=[False, True])

In [3]:
subs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4585679 entries, 10106800 to 191112
Data columns (total 14 columns):
 #   Column                          Dtype         
---  ------                          -----         
 0   Id                              int64         
 1   SubmittedUserId                 float64       
 2   TeamId                          int64         
 3   SourceKernelVersionId           float64       
 4   SubmissionDate                  datetime64[ns]
 5   ScoreDate                       object        
 6   IsAfterDeadline                 bool          
 7   PublicScoreLeaderboardDisplay   float64       
 8   Public                          float64       
 9   PrivateScoreLeaderboardDisplay  float64       
 10  Private                         float64       
 11  CompetitionId                   int64         
 12  CompetitionSlug                 object        
 13  TeamName                        object        
dtypes: bool(1), datetime64[ns](1), float64(6), i

In [21]:
useful_cols = ['SubmissionDate',
               'Public',
               'Private',
               'TeamId',
               'TeamName']

res_df = pd.DataFrame(columns=['n_i', 'x_i', 'n_j', 'x_j', 'y_i','dZ'])
comp_df = pd.DataFrame(columns=['T', 'theta', 'perc'])
comp_length_list = []
prize_list = []

n = 0

for c_id in comp_id:
    df = subs.query(f"CompetitionId=={c_id}") # Raw DataFrame for the Competition
    slug = comps.loc[c_id]['Slug']
    prize = comps.loc[c_id]['RewardQuantity']
    if type(prize) != float:
        print(f'Non-numerical prize: {slug} -- {prize}')
        continue
    prize = prize / 1000
    data_amt = comps.loc[c_id]['LeaderboardPercentage'] / 100
    
    key_col = 'TeamId'
    last_day = df['SubmissionDate'].max()

    names = df[['TeamId', 'TeamName']].drop_duplicates().set_index('TeamId')
    # score = df[df['SubmissionDate'] == last_day].groupby(key_col)['Private'].max().sort_values(ascending=False)
    score = df.groupby(key_col)['Private'].max().sort_values(ascending=False)

    submission = df.groupby(key_col)['SubmissionDate'].count().sort_values(ascending=False)
    submission.name = 'Submissions'

    first_day = df.groupby(key_col)['SubmissionDate'].min()
    first_day.name = 'FirstDay'

    last_day = df.groupby(key_col)['SubmissionDate'].max()
    last_day.name = 'LastDay'

    duration = last_day - first_day
    duration.name = 'Duration'
    
    summary = pd.concat([names, score, duration, first_day, last_day, submission],axis=1).sort_values(['Private', 'Duration', 'Submissions'], ascending=False)
    summary = summary.dropna(axis=0)

    filtered_team = summary.index[:2] # Need Modification for our criteria
    filtered_data = df[df['TeamId'].isin(filtered_team)][useful_cols]
    
    if (filtered_data['Private'].values > 1).any():
        print(f'Private score > 1: {slug}')
        continue

    start = filtered_data.groupby('TeamId')['SubmissionDate'].min().max()

    filtered_data['t'] = (filtered_data.SubmissionDate - start).dt.days // 7 + 1
    n_x = filtered_data.groupby(by=['TeamId', 't']).agg(n =pd.NamedAgg('SubmissionDate', aggfunc='count'),
                                       x = pd.NamedAgg('Private', aggfunc='max'),
                                       public = pd.NamedAgg('Public', aggfunc='max'))
    df_i = n_x.loc[filtered_team[0]]
    df_j = n_x.loc[filtered_team[1]]
    df_full = df_i.merge(df_j,left_index=True,right_index=True,how='outer', suffixes=('_i','_j'))
    df_full[['n_i', 'n_j']] = df_full[['n_i', 'n_j']].fillna(0)
    df_full[['x_i', 'x_j', 'public_i', 'public_j']] = df_full[['x_i', 'x_j', 'public_i', 'public_j']].fillna(method='ffill')
    df_full['y_i'] = df_full['x_i'] - df_full['x_j']
    df_full['dZ'] = df_full['public_i'] - df_full['public_j']
    df_full.drop(columns=['public_i', 'public_j'], inplace=True)
    cleaned_data = df_full[df_full.index > 0]

    # Book Keeping
    comp_len = cleaned_data.shape[0]
    if comp_len <= 2:
        print(f'Too few periods: {slug}')
        continue
    
    res_df = pd.concat([res_df, cleaned_data])
    comp_df = comp_df.append({'Slug': slug,'T': comp_len, 'theta': prize, 'perc': data_amt}, ignore_index=True)
    n+=1

print(f'Total Competitions Saved: {n}')

res_df.to_csv('cleaned_data.csv', index=False)
comp_df.to_csv('comp_info.csv', index=False)

Private score > 1: m5-forecasting-accuracy
Private score > 1: lish-moa
Private score > 1: elo-merchant-category-recommendation
Too few periods: cassava-leaf-disease-classification
Too few periods: mercedes-benz-greener-manufacturing
Private score > 1: commonlitreadabilityprize
Private score > 1: rossmann-store-sales
Private score > 1: zillow-prize-1
Private score > 1: sberbank-russian-housing-market
Private score > 1: LANL-Earthquake-Prediction
Private score > 1: santander-value-prediction-challenge
Private score > 1: bnp-paribas-cardif-claims-management
Private score > 1: quora-question-pairs
Private score > 1: allstate-claims-severity
Non-numerical prize: rock-paper-scissors -- 
Private score > 1: champs-scalar-coupling
Private score > 1: petfinder-pawpularity-score
Private score > 1: two-sigma-connect-rental-listing-inquiries
Private score > 1: ventilator-pressure-prediction
Private score > 1: otto-group-product-classification-challenge
Private score > 1: statoil-iceberg-classifier-

In [22]:
res_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 454 entries, 1 to 5
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   n_i     454 non-null    object
 1   x_i     453 non-null    object
 2   n_j     454 non-null    object
 3   x_j     454 non-null    object
 4   y_i     453 non-null    object
 5   dZ      453 non-null    object
dtypes: object(6)
memory usage: 24.8+ KB


In [23]:
comp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   T       57 non-null     object
 1   theta   57 non-null     object
 2   perc    57 non-null     object
 3   Slug    57 non-null     object
dtypes: object(4)
memory usage: 1.9+ KB


In [24]:
res_df

Unnamed: 0,n_i,x_i,n_j,x_j,y_i,dZ
1,21.0,0.796027,12.0,0.792259,0.003768,0.006153
2,18.0,0.797489,17.0,0.789092,0.008397,0.010264
3,37.0,0.794691,18.0,0.792721,0.00197,0.003882
4,58.0,0.796496,15.0,0.795739,0.000757,0.002974
5,39.0,0.798616,29.0,0.795251,0.003366,0.001177
...,...,...,...,...,...,...
1,8,0.937602,17.0,0.931157,0.006445,0.005935
2,2,0.940893,11.0,0.944245,-0.003352,-0.002266
3,18,0.949953,9.0,0.944459,0.005495,0.007017
4,30,0.953853,20.0,0.948761,0.005092,0.005461


In [26]:
comp_df.head(10)

Unnamed: 0,T,theta,perc,Slug
0,14,70.0,0.2,home-credit-default-risk
1,11,20.0,0.2,ieee-fraud-detection
2,7,65.0,0.25,santander-customer-transaction-prediction
3,7,30.0,0.3,siim-isic-melanoma-classification
4,9,25.0,0.3,porto-seguro-safe-driver-prediction
5,9,60.0,0.5,santander-customer-satisfaction
6,6,35.0,0.1,jigsaw-toxic-comment-classification-challenge
7,5,160.0,0.14,data-science-bowl-2019
8,13,100.0,0.33,tgs-salt-identification-challenge
9,9,50.0,0.15,aptos2019-blindness-detection
