In [144]:
import pandas as pd
import tensorflow as tf
import numpy as np

# Reading in problem metrics
## **SKIP FOR NOW** Calculated learning, rating, etc. coefficients

In [51]:
df_coeffs = pd.read_csv('problem_coefficients.csv', engine='c')
df_coeffs.drop(axis=1, labels=['cluster'], inplace=True)
print df_coeffs.shape
df_coeffs.head()

(924, 7)


Unnamed: 0,contestID,problemID,problemRating,learnCoeff,rating50,intercept,slope
0,467,A,648,0.002789,-313.05411,1861.363321,-78.149581
1,581,A,662,0.005014,254.347106,1930.762154,-86.027476
2,509,A,671,0.006021,404.871567,1977.56364,-89.932727
3,231,A,699,0.002692,-501.37944,1835.180436,-70.058187
4,510,A,703,0.002494,-369.354427,2293.652154,-133.818262


## Problem ratings
These are more robust, whereas other values requrie min number of entries per problem

In [159]:
path = '/Users/Joy/Dropbox/Algorithms/codeforces/codeforces-api/ui/problem_ratings.csv'
df_contest = pd.read_csv(path, engine='c')

# Import User Submissions

In [219]:
df_sub = pd.read_csv('all_submissions.csv', engine='c', nrows=25000)
df_sub.rename(columns={'problem_index': 'problemID'}, inplace=True)
# drop some fields we won't need for now
df_sub.drop(axis=1, labels=['problem_name', 'id', 'memoryBytes',\
                            'relativeTimeSeconds', 'timeMilliseconds',\
                            'passedTestCount', 'problem_tags', \
                            'testset', 'language'], inplace=True)

# Merging problem metrics with user submissions
**Issues**
* there are many missing entries for coefficients. This is unlikely to be ever solved since we require a min number of entries for curve fitting. Can revisit once we figure out the best way to cope with missing data
* Many problems with no ratings because:
    * it was a gym contest. This means that CF doesn't provide a reliable API for who did and didn't solve the problem. We would need to write a scraper to do this for us.

In [220]:
df = df_sub.merge(df_contest, left_on=['contestID', 'problemID'], right_on=['contestID', 'problemID'], how='left')
#df_sub.merge(df_coeffs, left_on=['contestID', 'problemID'], right_on=['contestID', 'problemID'], how='left')

for each person, convert itmestamps to personal timeline:
* normalize by smalles time
* divide by 60 to get minutes (or hours??)

In [221]:
userMinutes = df.groupby('author').apply(lambda x: (x['startTimeSeconds'] - min(x['startTimeSeconds']))/3600)
df['userMinutes'] = userMinutes.reset_index(level=0, drop=True)

# for each user, sort by time
df = df.groupby('author', as_index=False).apply(pd.DataFrame.sort_values, 'userMinutes')
df.reset_index(level=0, drop=True, inplace=True)

df.head()

Unnamed: 0,author,contestID,participantType,points,problemID,startTimeSeconds,verdict,problemRating,userMinutes
24093,I_love_Tanya_Romanova,36,CONTESTANT,500.0,A,1287482744,OK,1304.0,0.0
24092,I_love_Tanya_Romanova,36,CONTESTANT,1000.0,B,1287484065,OK,1442.0,0.366944
24091,I_love_Tanya_Romanova,36,CONTESTANT,2000.0,D,1287486762,WRONG_ANSWER,2027.0,1.116111
24090,I_love_Tanya_Romanova,50,CONTESTANT,500.0,A,1292862607,OK,,1494.406389
24089,I_love_Tanya_Romanova,50,CONTESTANT,1000.0,B,1292862840,OK,,1494.471111


# Pseudo Events

We need two types of pseudo events:
* pre-Contest indicator
* rating increase indicator

The idea is that we can give the RNN a signal that a contest is coming up by creating a pseudo event

In [253]:
prev_contest = 0
cnt = 0
df.reset_index(level=0, drop=True, inplace=True)

prows = []
before_idx = []

for index, row in df.iterrows():
    if row['contestID'] != prev_contest and row['participantType'] == 'CONTESTANT':
        prow = {
            'author': row['author'],
            'contestID': row['contestID'],
            'participantType': 'CONTESTANT',
            'points': -1,
            'problemID': 'N',
            'startTimeSeconds': row['startTimeSeconds'],
            'verdict': 'START',
            'problemRating': -1,
            'userMinutes': row['userMinutes']
        }
        prev_contest = row['contestID']
        prows.append(prow)
        before_idx.append( max(0, index-1) )
        
#        before = df.loc[0:max(0, index-1)]
#        after = df.loc[index:]
#        
#        before = before.append(prow, ignore_index=True)
#        
#        print before.head()
#        print "-------------------------------------------------------------------------------"
#        print after.head()
#        print "new contest:", row['contestID']
#        
#        pd.concat([before, after])
#        break
#        
#        cnt+=1
#    if cnt > 10:
#        break

# Convert categorical variables to one-hot embedding
Can think about word2vec later for dense embedding later, if needed
https://www.tensorflow.org/tutorials/word2vec

In [255]:
participantType = pd.get_dummies(df.participantType)
verdict = pd.get_dummies(df.verdict)
df = df.join(participantType)
df = df.join(verdict)
df.shape

(25000, 28)

# drop extraneous columns

In [256]:
df.drop(axis=1, labels=['problemID', 'contestID', 'verdict', 'participantType'], inplace=True)

In [257]:
df.head()

Unnamed: 0,author,points,startTimeSeconds,problemRating,userMinutes,CONTESTANT,GYM,OUT_OF_COMPETITION,PRACTICE,VIRTUAL,...,IDLENESS_LIMIT_EXCEEDED,MEMORY_LIMIT_EXCEEDED,OK,PARTIAL,PRESENTATION_ERROR,REJECTED,RUNTIME_ERROR,SKIPPED,TIME_LIMIT_EXCEEDED,WRONG_ANSWER
0,I_love_Tanya_Romanova,500.0,1287482744,1304.0,0.0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,I_love_Tanya_Romanova,1000.0,1287484065,1442.0,0.366944,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,I_love_Tanya_Romanova,2000.0,1287486762,2027.0,1.116111,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,I_love_Tanya_Romanova,500.0,1292862607,,1494.406389,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,I_love_Tanya_Romanova,1000.0,1292862840,,1494.471111,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


# Fix NAN values in problemRating

Since there's no way of getting ELO scores for some problems, we'll have to infer them somehow.

One naive way of doing this is simply by taking the problem rating ot be the average of ```x``` values that came before (or after, if it's the first in the series). Unfortunately this takes a long time in Python (see below).

For now, just set these scores to 0. They are all gym questions anyway and are easily distinguishable from other types of questions.

In [260]:
df['problemRating'] = df['problemRating'].fillna(value=0, inplace=False)

df.head()

Unnamed: 0,author,points,startTimeSeconds,problemRating,userMinutes,CONTESTANT,GYM,OUT_OF_COMPETITION,PRACTICE,VIRTUAL,...,IDLENESS_LIMIT_EXCEEDED,MEMORY_LIMIT_EXCEEDED,OK,PARTIAL,PRESENTATION_ERROR,REJECTED,RUNTIME_ERROR,SKIPPED,TIME_LIMIT_EXCEEDED,WRONG_ANSWER
0,I_love_Tanya_Romanova,500.0,1287482744,1304.0,0.0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,I_love_Tanya_Romanova,1000.0,1287484065,1442.0,0.366944,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,I_love_Tanya_Romanova,2000.0,1287486762,2027.0,1.116111,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,I_love_Tanya_Romanova,500.0,1292862607,0.0,1494.406389,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,I_love_Tanya_Romanova,1000.0,1292862840,0.0,1494.471111,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


# Export

In [262]:
df.to_csv('rnn_input_test.csv', index=False)

# Tags
We will add 36 new dimensions if tags are used. This is definitely a possibility

In [277]:
df_problems = pd.read_csv('problem_data.csv', engine='c')
tags = set()
for line in df_problems.tags.values:
    line = line.replace('[', '')
    line = line.replace(']', '')
    line = line.split(', ')
    for item in line:
        tags.add(item)
print len(tags)
print tags

36
set(['shortest paths', '', '2-sat', 'schedules', 'number theory', 'constructive algorithms', 'chinese remainder theorem', 'graphs', 'probabilities', 'matrices', 'dsu', 'dfs and similar', 'graph matchings', 'string suffix structures', 'math', 'meet-in-the-middle', 'divide and conquer', 'two pointers', 'trees', 'data structures', 'greedy', 'sortings', 'flows', 'expression parsing', 'dp', 'hashing', 'fft', 'bitmasks', 'geometry', 'combinatorics', 'implementation', 'brute force', 'games', 'ternary search', 'binary search', 'strings'])


In [195]:
#steps_back = 5
#g = df.groupby('author')
#for usr, val in g:
#    ind = np.where(~np.isnan(val.problemRating))[0]
#    start = 0
#    for i in ind:
#        # invariant: NaNs behind start are all filled in
#        if (start < i):
#            l = max(0, start - steps_back)
#            r = min(i+1, len(val))
#            
#            if l == start:  # we don't have any earlier entries to use
#                replacement = 0
#                denom = 0
#                for j in range(steps_back):
#                    if not np.isnan(val['problemRating'][i+j]):
#                        replacement += val['problemRating'][i+j]
#                        denom += 1
#                replacement = (replacement+.0) / denom
#            else:
#                replacement = np.mean(val['problemRating'][l:start])
#            val.iloc[start:i]['problemRating'] = replacement
#            
#            #print start, i
#            #print val[l:r]
#            #break
#        start = i+1