In [4]:
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import scipy
import psycopg2
import tensorflow as ts
from collections import defaultdict

# Open a psycopg2 connection to the 'codeforces' database and keep one shared
# cursor; later cells reuse `con` and `cur`.
# NOTE(review): the database user name is hard-coded here.
con = psycopg2.connect(database='codeforces', user='Joy')
cur = con.cursor()

# Plot defaults for the notebook: inline rendering, 10x10 figures on a white
# background.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 10.0)
plt.rcParams['figure.facecolor'] = 'white'

In [5]:
from sqlalchemy import create_engine
# SQLAlchemy engine used by the DataFrame.to_sql() calls in this notebook.
# The dialect must be spelled 'postgresql': the 'postgres' alias was removed in
# SQLAlchemy 1.4, while 'postgresql' is accepted by old and new releases alike.
engine = create_engine('postgresql://%s@localhost/%s'%('Joy', 'codeforces'))

# create Y values

In [6]:
# Load the smoothed user ratings from the CSV export rather than from the
# database; note this is 4x faster than getting it from sql.
df_smooth = pd.read_csv('user_ratings_smoothed.csv', engine = 'c')

## calculate difference
Only need to run this once
```
# Order every user's rows chronologically so that consecutive rows of one
# handle are consecutive rating updates.
df_smooth = df_smooth.sort_values(['handle', 'ratingupdatetimeseconds'])

# Change of each smoothed rating since the same user's previous rating update.
# The difference is taken inside each handle group: rolling the whole column by
# one row would subtract the previous user's last value from a user's first
# row. A user's first row has no predecessor and is therefore NaN.
for month in range(1, 6):
    smoothed = df_smooth.groupby('handle')["smoothed_%dmonths" % month]
    df_smooth["delta_smoothed_%dmonths" % month] = smoothed.diff()

df_smooth.head(50)

## output to sql and csv

df_smooth.to_csv('user_ratings_smoothed.csv', index=False, header=True)

df_smooth.to_sql('user_rating_smooth', engine, if_exists='replace')
```

# Features
 **problem type**
 * contest
 * virtual
 * etc
 
**problem info**
 * tags
 * rating
 * point value
 
**submission info**
 * number of wrong answers
 * number of TLE
 * number of compile errors
 * time between first submission and solve
 * relative time to competition
 
**user info**
 * current smooth rating
 * volatility?
 * lag can be estimated from user rating and smoothed rating, but do we want it??

## Getting distinct values for categorical variables

In [7]:
# Every row of the `handles` table, reduced to its first column.
cur.execute("SELECT * FROM handles")
all_handles = [row[0] for row in cur.fetchall()]

Run this code **once only** to build the list of `(handle, contestid, problemid)` keys. It takes ~20 minutes.

```
# Fetch each distinct (handle, contestid, problemid) combination that has at
# least one submission.
q = """
SELECT DISTINCT handle, contestid, problemid FROM submissions;
"""
cur.execute(q)
keys = cur.fetchall()

# Cache the keys on disk, one comma-separated triple per line.
with open('handle_cid_pid_keys.txt', 'w') as f:
    f.writelines(','.join(row) + '\n' for row in keys)
```

In [8]:
# Read the cached key file back: one whitespace-stripped string per line.
with open('handle_cid_pid_keys.txt') as f:
    keys = [raw.strip() for raw in f]

### Get all tags
Only need to run this once:
```
# Collect the distinct tags and store them as the one-column table `all_tags`.
cur.execute("""
SELECT DISTINCT tag FROM tags
""")
all_tags = [row[0] for row in cur.fetchall()]

# Name the column when the frame is built. Relabelling it afterwards with
# rename_axis({0: ...}, axis=1) relies on behaviour that pandas deprecated and
# later removed (rename_axis now sets the axis *name*, not its labels).
df_all_tags = pd.DataFrame(all_tags, columns=['tag'])
df_all_tags.to_sql('all_tags', engine, if_exists='replace')
```

In [9]:
# Load the cached tag names into a set.
cur.execute("SELECT tag FROM all_tags")
all_tags = {row[0] for row in cur.fetchall()}

### Get distinct verdicts

Only need to run this once:
```
# Collect the distinct submission verdicts and store them as the one-column
# table `all_verdicts`.
cur.execute("""
SELECT DISTINCT verdict FROM submissions
""")
all_verdicts = [row[0] for row in cur.fetchall()]

# Name the column when the frame is built. Relabelling it afterwards with
# rename_axis({0: ...}, axis=1) relies on behaviour that pandas deprecated and
# later removed (rename_axis now sets the axis *name*, not its labels).
df_all_verdicts = pd.DataFrame(all_verdicts, columns=['verdict'])
df_all_verdicts.to_sql('all_verdicts', engine, if_exists='replace')
```

In [10]:
# Load the cached verdict names into a set.
cur.execute("SELECT verdict FROM all_verdicts")
all_verdicts = {row[0] for row in cur.fetchall()}

### Get distinct participant types
```
# Collect the distinct participant types and store them as the one-column
# table `all_participanttypes`.
cur.execute("""
SELECT DISTINCT participanttype FROM submissions
""")
all_participanttypes = [row[0] for row in cur.fetchall()]

# Name the column when the frame is built. Relabelling it afterwards with
# rename_axis({0: ...}, axis=1) relies on behaviour that pandas deprecated and
# later removed (rename_axis now sets the axis *name*, not its labels).
df_all_participanttypes = pd.DataFrame(all_participanttypes, columns=['participanttype'])
df_all_participanttypes.to_sql('all_participanttypes', engine, if_exists='replace')
```

In [11]:
# Load the cached participant types into a set.
cur.execute("SELECT participanttype FROM all_participanttypes")
all_participanttypes = {row[0] for row in cur.fetchall()}

### Get distinct programming languages
```
# Collect the distinct submission languages and store them as the one-column
# table `all_language`.
cur.execute("""
SELECT DISTINCT language FROM submissions
""")
all_language = [row[0] for row in cur.fetchall()]

# Name the column when the frame is built. Relabelling it afterwards with
# rename_axis({0: ...}, axis=1) relies on behaviour that pandas deprecated and
# later removed (rename_axis now sets the axis *name*, not its labels).
df_all_language = pd.DataFrame(all_language, columns=['language'])
df_all_language.to_sql('all_language', engine, if_exists='replace')
```

In [16]:
# Load the cached language names into a set.
cur.execute("SELECT language FROM all_language")
all_language = {row[0] for row in cur.fetchall()}

## problem stats

### problem rating and tags

In [12]:
# Problem ratings and problem tags, each indexed by (contestid, problemid).
df_prate = pd.read_sql("SELECT * FROM problem_rating", con).set_index(['contestid', 'problemid'])
df_tags = pd.read_sql("SELECT * FROM tags", con).set_index(['contestid', 'problemid'])

# Re-key the smoothed ratings by handle. reset_index() first moves the current
# index into a regular column; the two unused text columns are then dropped.
df_smooth = (
    df_smooth
    .reset_index()
    .set_index(['handle'])
    .drop(['contestname', 'time'], axis=1)
)

In [13]:
# Split every cached key line into its comma-separated fields, then bucket the
# split keys by their first field (the handle).
keys = [line.split(',') for line in keys]
user_dict = defaultdict(list)
for fields in keys:
    user_dict[fields[0]].append(fields)

In [61]:
# Distinct index labels of df_smooth, used for fast membership tests.
# NOTE(review): assumes df_smooth has already been re-indexed by handle.
present_handles = set(df_smooth.index)

In [71]:
from os.path import exists
cnt = 0
#user = '-----'

def getTraining(user):
    """Build one user's per-problem training rows and write them to a CSV file.

    For every key in ``user_dict[user]`` (a ``[handle, contestid, problemid]``
    list) the matching rows of the ``submissions`` table are read and reduced
    to one dict holding:

    * ``points`` of the first returned submission,
    * the earliest and latest ``starttimeseconds`` of the submissions,
    * every column of the user's rating row with the smallest
      ``ratingupdatetimeseconds`` that is >= the latest submission time,
    * a count per verdict and per language, and a flag (1) per participant type,
    * ``problem_rating`` from ``df_prate`` (-1 when the problem is not listed),
    * ``solvetimeseconds``, the earliest submission with verdict ``'OK'``
      (-1 when there is none).

    Keys with no later rating update, or with no submissions at all, are
    skipped. The rows are written to ``rnn_train/<user>.csv``; a column is
    added (filled with NaN) for every known tag, verdict, participant type and
    language that did not occur. Nothing in this function fills the tag
    columns.

    Uses the notebook globals ``df_smooth``, ``user_dict``, ``df_prate``,
    ``con``, ``all_tags``, ``all_verdicts``, ``all_participanttypes`` and
    ``all_language``.

    Parameters
    ----------
    user : str
        Handle to process; it must be a label of ``df_smooth``'s index.

    Returns
    -------
    None
        Returns early, without writing a file, when the user has a single
        rating row.
    """
    import os  # local import so the cell does not depend on an edit elsewhere

    filename = 'rnn_train/%s.csv' % user
    print(filename)

    ur = df_smooth.loc[user, :]
    if len(ur.shape) == 1:
        # A handle with exactly one row comes back as a 1-D Series.
        print("     Not enough contests for user %s" % (ur.shape,))
        return
    # reset_index() returns a new frame, so df_smooth itself is left untouched
    # and no copy-warning workaround is needed.
    ur = ur.reset_index()

    # Values are bound by the database driver instead of being pasted into the
    # SQL text, so quoting is handled for us.
    q = """
        SELECT * FROM submissions
            WHERE
                handle = %s
                AND
                contestid = %s
                AND
                problemid = %s
        """

    trainlist = []
    for k in user_dict[user]:
        df = pd.read_sql(q, con, params=[k[0], k[1], k[2]])
        if df.empty:
            # Nothing to summarise (e.g. key file out of date with the table).
            continue

        ex = dict()

        # generic problem info
        ex['points'] = df.points.iloc[0]
        ex['starttimeseconds'] = df.starttimeseconds.min()
        ex['stoptimeseconds'] = df.starttimeseconds.max()

        # user rating info: take the first rating update at or after the last
        # submission; without one there is no target value, so skip the key.
        later = ur.loc[ur.ratingupdatetimeseconds >= ex['stoptimeseconds']]
        if later.empty:
            continue
        nearest = later.loc[
            later.ratingupdatetimeseconds == later.ratingupdatetimeseconds.min()
        ]
        ex.update(nearest.to_dict(orient='records')[0])

        # verdicts: one count per verdict seen
        ex.update(df.verdict.value_counts().to_dict())

        # participant type: presence flag only
        for ptype in df.participanttype.value_counts().to_dict():
            ex[ptype] = 1

        # language: one count per language seen
        ex.update(df.language.value_counts().to_dict())

        # problem rating
        # NOTE(review): keys are strings read from the key file; this lookup
        # only matches if df_prate's index levels are strings too - confirm.
        pkey = (str(k[1]), str(k[2]))
        if (k[1], k[2]) in df_prate.index:
            ex['problem_rating'] = df_prate.loc[pkey].values[0]
        else:
            ex['problem_rating'] = -1

        # time of the first accepted submission
        solvetime = df.loc[df.verdict == 'OK', 'starttimeseconds']
        if len(solvetime) > 0:
            ex['solvetimeseconds'] = solvetime.min()
        else:
            ex['solvetimeseconds'] = -1

        trainlist.append(ex)

    df_train = pd.DataFrame(trainlist)

    # Pad with the categorical columns that never occurred for this user so
    # that every per-user file has the same set of columns.
    for names in (all_tags, all_verdicts, all_participanttypes, all_language):
        for name in names:
            if name not in df_train.columns:
                df_train[name] = np.nan

    # to_csv does not create missing directories.
    outdir = os.path.dirname(filename)
    if outdir and not os.path.isdir(outdir):
        os.makedirs(outdir)

    df_train.to_csv(filename, mode='w', index=False, header=True)


In [74]:
import generate_features_RNN as gfr

Using TensorFlow backend.


In [77]:
# Ask the helper module for the categorical variables behind the three named
# lookup tables.
# NOTE(review): the shape of the returned object is defined in
# generate_features_RNN and is not visible here.
binvars = gfr.get_categorical_variables([
    'all_participanttypes',
    'all_tags',
    'all_language'
])

In [114]:
# Re-import the helper module so edits to it take effect, then generate the
# training file for a single handle using the module's getTraining (which
# takes its inputs explicitly instead of reading notebook globals).
reload(gfr)
lastidx = 0 
user = 'tourist'
user_rating = df_smooth.loc[user, :]
gfr.getTraining(user, user_rating, df_prate, user_dict[user], binvars, con)

rnn_train/tourist.csv


In [115]:
# Read the generated file back for inspection.
test = pd.read_csv('rnn_train/tourist.csv')

In [None]:
# Generate a training file for every handle that has smoothed ratings.
# Raise `lastidx` to resume an interrupted run from that position.
lastidx = 0
for position, user in enumerate(all_handles[lastidx:], lastidx):
    if user not in present_handles:
        continue
    print("%d %s" % (position, user))
    getTraining(user)