In [1]:
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import scipy
import psycopg2
import tensorflow as ts
from collections import defaultdict

con = psycopg2.connect(database='codeforces', user='Joy')
cur = con.cursor()

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 10.0)
plt.rcParams['figure.facecolor'] = 'white'

  from pandas.core import datetools


# create Y values

In [4]:
# note this is 4x faster than getting it from sql
df_smooth = pd.read_csv('user_ratings_smoothed.csv', engine = 'c')

In [24]:
# calculate difference
gusr = df_smooth.groupby('handle')
stack = []

for usr, dfu in gusr:
    dfu.is_copy=False
    dfu.sort_values('ratingupdatetimeseconds', inplace=True)
    stack.append(dfu)

In [26]:
df_smooth = pd.concat(stack)
for month in range(1, 6):
    curr = df_smooth["smoothed_%dmonths" % month]
    prev = np.roll(curr, 1)

    delta = curr - prev
    df_smooth["delta_smoothed_%dmonths" % month] = delta

In [28]:
df_smooth.head(50)

Unnamed: 0,time,contestid,contestname,handle,newrating,oldrating,rank,ratingupdatetimeseconds,smoothed_1months,smoothed_2months,smoothed_3months,smoothed_4months,smoothed_5months,delta_smoothed_1months,delta_smoothed_2months,delta_smoothed_3months,delta_smoothed_4months,delta_smoothed_5months
0,2010-12-17 18:00:00,49,Codeforces Beta Round #46 (Div. 2),-----,1370,1500,344,1292608800,1370.0,1370.0,1370.0,1370.0,1370.0,-477.0,-477.0,-477.0,-477.0,-477.0
1,2011-06-16 17:00:00,90,Codeforces Beta Round #74 (Div. 2 Only),-----,1311,1370,653,1308243600,1311.0,1311.0,1311.0,1311.0,1311.0,-59.0,-59.0,-59.0,-59.0,-59.0
2,2011-08-23 17:00:00,108,Codeforces Beta Round #83 (Div. 2 Only),-----,1273,1311,680,1314118800,1273.0,1273.0,1292.0,1292.0,1292.0,-38.0,-38.0,-19.0,-19.0,-19.0
3,2011-09-08 17:00:00,114,Codeforces Beta Round #86 (Div. 2 Only),-----,1230,1273,810,1315501200,1251.5,1251.5,1271.333333,1271.333333,1271.333333,-21.5,-21.5,-20.666667,-20.666667,-20.666667
4,2011-10-07 17:00:00,118,Codeforces Beta Round #89 (Div. 2),-----,1198,1230,917,1318006800,1214.0,1233.666667,1233.666667,1253.0,1253.0,-37.5,-17.833333,-37.666667,-18.333333,-18.333333
5,2011-10-18 11:30:00,120,"School Regional Team Contest, Saratov, 2011",-----,1172,1198,533,1318937400,1185.0,1218.25,1218.25,1218.25,1236.8,-29.0,-15.416667,-15.416667,-34.75,-16.2
6,2011-10-27 17:00:00,122,Codeforces Beta Round #91 (Div. 2 Only),-----,1130,1172,1075,1319734800,1166.666667,1182.5,1200.6,1200.6,1219.0,-18.333333,-35.75,-17.65,-17.65,-17.8
7,2011-11-09 19:00:00,127,Codeforces Beta Round #93 (Div. 2 Only),-----,1234,1130,521,1320865200,1178.666667,1183.5,1206.166667,1206.166667,1221.142857,12.0,1.0,5.566667,5.566667,2.142857
8,2011-11-25 17:00:00,131,Codeforces Beta Round #95 (Div. 2),-----,1293,1234,805,1322240400,1219.0,1205.4,1209.5,1218.571429,1218.571429,40.333333,21.9,3.333333,12.404762,-2.571429
9,2013-04-07 17:30:00,294,Codeforces Round #178 (Div. 2),-----,1490,1293,286,1365355800,1490.0,1490.0,1490.0,1490.0,1490.0,271.0,284.6,280.5,271.428571,271.428571


## output to sql and csv

In [29]:
df_smooth.to_csv('user_ratings_smoothed.csv', index=False, header=True)

In [7]:
from sqlalchemy import create_engine
engine = create_engine('postgres://%s@localhost/%s'%('Joy', 'codeforces'))

df_smooth.to_sql('user_rating_smooth', engine, if_exists='replace')

# Features
 **problem type**
 * contest
 * virtual
 * etc
 
**problem info**
 * tags
 * rating
 * point value
 
**submission info**
 * number of wrong answers
 * number of TLE
 * number of compile errors
 * time between first submission and solve
 * relative time to competition
 
**user info**
 * current smooth rating
 * volatility?
 * lag can be estimated from user rating and smoothed rating, but do we want it??

In [None]:
q = """
SELECT DISTINCT handle, contestid, problemid FROM submissions LIMIT 10;
"""
cur.execte(q)
cur.fetchall()

In [6]:
cur.execute("""SELECT handle FROM handles;""")
handles = [x[0] for x in cur.fetchall()]

In [9]:
for h in handles:
    q = """
    SELECT * FROM submissions
        WHERE handle='%s';
    """ % h
    df = pd.read_sql(q, con)
    g_cidpid = df.groupby(['contestid', 'problemid'])
    for key, dfp in g_cidpid:
        print dfp
        break
    break

       handle  submissionid language  memoryBytes participanttype  \
1112  tourist        545140   Delphi       921600        PRACTICE   

      passedtestcount  points  relativetimeseconds  starttimeseconds  \
1112               20       0             44062135        1310642935   

      timemilliseconds testset verdict contestid problemid  
1112                30   TESTS      OK         1         A  


In [3]:
ts.contrib.keras

<module 'tensorflow.contrib.keras' from '/usr/local/lib/python2.7/site-packages/tensorflow/contrib/keras/__init__.pyc'>