# Requests, moving files from flat -> postgreSQL

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import requests
from sqlalchemy import *
from sqlalchemy_utils import database_exists, create_database
import psycopg2
from time import sleep
import re
%matplotlib inline
plt.rcParams['figure.figsize'] = (5.0, 5.0)
plt.rcParams['figure.facecolor'] = 'white'

# Which handles are missing?

In [73]:
## get handles
#with open('all_handles.txt') as f:
#    allHandles = [line.strip() for line in f.readlines()]

df_tmp = pd.read_csv('rating_histories.csv', engine='c')
handles_have = set(df_tmp.handle.values)

In [9]:
handles_need = set([])
with open('all_submissions.tsv') as f:
    for line in f.readlines():
        handles_need.add(line.split('\t')[0])

In [74]:
print len(handles_have)
print len(handles_need)

58873
56697


In [75]:
for i in handles_need:
    print i
    break

spiderg


In [72]:
missing = handles_need.difference(handles_have)
print len(missing)
with open('missing_handles.txt', 'w') as f:
    for m in missing:
        f.write(m + '\n')

660


In [5]:
verdicts = set([])
with open('all_submissions.tsv') as f:
    print f.readline()
    for line in f.readlines():
        verdicts.add(line.split('\t')[-1])

author	contestID	id	language	memoryBytes	participantType	passedTestCount	points	problem_index	problem_name	problem_tags	relativeTimeSeconds	startTimeSeconds	testset	timeMilliseconds	verdict



In [6]:
verdicts

{'CHALLENGED\n',
 'COMPILATION_ERROR\n',
 'CRASHED\n',
 'FAILED\n',
 'IDLENESS_LIMIT_EXCEEDED\n',
 'MEMORY_LIMIT_EXCEEDED\n',
 'OK\n',
 'PARTIAL\n',
 'PRESENTATION_ERROR\n',
 'REJECTED\n',
 'RUNTIME_ERROR\n',
 'SKIPPED\n',
 'TESTING\n',
 'TIME_LIMIT_EXCEEDED\n',
 'WRONG_ANSWER\n'}

# API calls for hacks, insert into SQL database

In [4]:
dbname = 'codeforces'
username = 'Joy'

In [5]:
engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))
print engine.url

## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))

postgres://Joy@localhost/codeforces
True


## Contest information
get contests, only need to run once

In [6]:
# create table with schema
metadata = MetaData()
#index	creationTimeSeconds	id	verdict	ghost	defender	hacker	contestID	problemID	problemName
contests = Table('contests', metadata,
    Column('id', Integer, primary_key=True),
    Column('durationSeconds', Integer, nullable=False),
    Column('relativeTimeSeconds', Integer),
    Column('startTimeSeconds', Integer),
    Column('frozen', Boolean),
    Column('name', String),
    Column('type', String),
    Column('phase', String)
)
contests.drop(engine, checkfirst=True)
contests.create(engine)

url = 'http://codeforces.com/api/contest.list?gym=false'
r = requests.get(url).json()['result']
df_contests = pd.DataFrame.from_dict(r)
#df_contests.to_csv('contests.tsv', sep='\t', index=False, header=True, encoding='utf')
df_contests.to_sql('contests', engine, if_exists='replace')

## Get information on hacks

In [24]:
def getContestHacks(contest):
    url = 'http://codeforces.com/api/contest.hacks?contestId=' + str(contest)
    print url
    maxtries = 5
    tries = 0
    while tries < maxtries:
        print "attempt", tries
        try:
            r = requests.get(url).json()
            if r['status'] == 'FAILED':
                print r['comment']
                return
            r = r['result']
            if len(r) > 0:
                return pd.DataFrame.from_dict(r)
            else:
                return
        except:
            print "error, attempt", tries
            tries += 1
            sleep(5)
    print "ERROR GETTING HACK INFO FOR CONTEST", contest

### Create table schema for hacks

In [29]:
# create table with schema
metadata = MetaData()
#index	creationTimeSeconds	id	verdict	ghost	defender	hacker	contestID	problemID	problemName
hacks = Table('hacks', metadata,
    Column('id', Integer, primary_key=True),
    Column('creationTimeSeconds', Integer, nullable=False),
    Column('verdict', String),
    Column('ghost', Boolean),
    Column('defender', String),
    Column('hacker', String),
    Column('contestID', String),
    Column('problemID', String),
    Column('problemName', String)
)
hacks.drop(engine, checkfirst=True)
hacks.create(engine)

### request information on hacks

In [None]:
contest_ids = df_contests.id
last_idx = 3

for i, cid in enumerate(contest_ids[last_idx:]):
    print last_idx + i, cid
    df_hack = getContestHacks(cid)
    
    if df_hack is None:
        continue
    
    # format resulting dataframe
    ghost = df_hack.defender.apply(lambda x: x['ghost'])
    defender = df_hack.defender.apply(lambda x: x['members'][0]['handle'])
    hacker = df_hack.hacker.apply(lambda x: x['members'][0]['handle'])
    contestID = df_hack.problem.apply(lambda x: x['contestId'])
    problemID = df_hack.problem.apply(lambda x: x['index'])
    problemName = df_hack.problem.apply(lambda x: x['name'])
    
    df_hack.drop(['defender', 'hacker', 'judgeProtocol', 'problem', 'test'], inplace=True, axis=1)
    df_hack['ghost'] = ghost
    df_hack['defender'] = defender
    df_hack['hacker'] = hacker
    df_hack['contestID'] = contestID
    df_hack['problemID'] = problemID
    df_hack['problemName'] = problemName
    
    print df_hack.head()
    print "writing to sql..."
    df_hack.to_sql('hacks', engine, if_exists='append', index=False)

3 802
http://codeforces.com/api/contest.hacks?contestId=802
attempt 0
4 811
http://codeforces.com/api/contest.hacks?contestId=811
attempt 0


## Test connection

In [3]:
# connect:
con = psycopg2.connect(database = dbname, user = username)
cur = con.cursor()

In [5]:
cur.execute("SELECT database, gid FROM pg_prepared_xacts;")
cur.fetchall()

[]

###  show all tables

In [None]:
#cur.execute("""SELECT datname from pg_database""")
#cur.execute("""SELECT * FROM pg_catalog.pg_tables""")
cur.execute("""
DROP TABLE IF EXISTS hack;
""")
con.close()

In [None]:
con = psycopg2.connect(database = dbname, user = username)
cur = con.cursor()
cur.execute("""
SELECT
    table_schema || '.' || table_name
FROM
    information_schema.tables
WHERE
    table_type = 'BASE TABLE'
AND
    table_schema NOT IN ('pg_catalog', 'information_schema');
""")
rows = cur.fetchall()
for r in rows:
    print r

In [43]:
# query:
sql_query = """
SELECT * FROM hack WHERE verdict='HACK_SUCCESSFUL';
"""
df_hack = pd.read_sql_query(sql_query,con)

In [44]:
df_hack

Unnamed: 0,index,creationTimeSeconds,id,verdict,ghost,defender,hacker,contestID,problemID,problemName
0,0,1496327179,325237,HACK_SUCCESSFUL,False,ashp20,Grut,812,A,Sagheer and Crossroads
1,2,1496327241,325239,HACK_SUCCESSFUL,False,BurningAss,Grut,812,A,Sagheer and Crossroads
2,4,1496327272,325241,HACK_SUCCESSFUL,False,posij118,alex_bucevschi,812,A,Sagheer and Crossroads
3,5,1496327292,325242,HACK_SUCCESSFUL,False,krishna_k,Voudy,812,A,Sagheer and Crossroads
4,6,1496327293,325243,HACK_SUCCESSFUL,False,mihaiI,parallelc,812,A,Sagheer and Crossroads
5,7,1496327298,325244,HACK_SUCCESSFUL,False,Plurm,Flavius,812,A,Sagheer and Crossroads
6,8,1496327320,325245,HACK_SUCCESSFUL,False,Anatoly16,Voudy,812,A,Sagheer and Crossroads
7,9,1496327335,325246,HACK_SUCCESSFUL,False,EDGsheryl,jerrylinew,812,A,Sagheer and Crossroads
8,10,1496327363,325247,HACK_SUCCESSFUL,False,legendjohn9999,Grut,812,A,Sagheer and Crossroads
9,11,1496327448,325248,HACK_SUCCESSFUL,False,vknifieferepred0bsc0rsv,Voudy,812,A,Sagheer and Crossroads


# Move current flat files into sql database

In [4]:
# connect to engine
dbname = 'codeforces'
username = 'Joy'

engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))

## Schemas

In [5]:
metadata = MetaData()

### problem_info

In [6]:
# create table with schema
# problem_data.csv
# contestID,contestName,division,name,points,problemID,startTimeSeconds,tags,type
# 1,Codeforces Beta Round #1,12,Theatre Square,,A,1266580800,[math],PROGRAMMING
problem_info = Table('problem_info', metadata,
    Column('contestid', String, primary_key=True),
    Column('problemid', String, primary_key=True),
    Column('contestname', String),
    Column('division', String),
    Column('name', String),
    Column('points', Integer),
    Column('starttimeseconds', Integer),
    Column('type', String)
)
problem_info.drop(engine, checkfirst=True)
problem_info.create(engine)

In [8]:
df_pi = pd.read_csv('problem_data.csv', engine='c')
newcols = [x.lower() for x in df_pi.columns.values]
df_pi.columns = newcols

df_pi.drop('tags', axis = 1, inplace=True)
df_pi.to_sql('problem_info', engine, if_exists='replace', index=False)

### tags

In [10]:
# create table with schema
#problem_ratings.csv
#contestID,problemID,problemRating
#413,B,0
tags = Table('tags', metadata,
    Column('contestid', String),
    Column('problemid', String),
    Column('tag', String)
)
tags.drop(engine, checkfirst=True)
tags.create(engine)

#### Extract tags

In [11]:
df_pi = pd.read_csv('problem_data.csv', engine='c')
rows = []
def get_tags(x):
    tags = []
    row = x.tags
    row = re.sub('[\[\]{}]', '', row)
    row = row.split(', ')
    for i, t in enumerate(row):
        if i%3 == 1:
            t = t.replace("u'name': u'" , '')
            t = t.replace("'" , '')
            tags.append(t)
    cid = x.contestID
    pid = x.problemID
    for t in tags:
        rows.append({'contestid': cid,
                     'problemid': pid,
                    'tag': t})
_ = df_pi.apply(get_tags, axis=1)
df_tags = pd.DataFrame.from_dict(rows)

newcols = [x.lower() for x in df_tags.columns.values]
df_tags.columns = newcols

df_tags.to_sql('tags', engine, if_exists='replace', index=False)

### problem_rating

In [9]:
# create table with schema
#problem_ratings.csv
#contestID,problemID,problemRating
#413,B,0
metadata = MetaData()
problem_rating = Table('problem_rating', metadata,
    Column('contestid', String, primary_key=True),
    Column('problemid', String, primary_key=True),
    Column('problemrating', Integer)
)
problem_rating.drop(engine, checkfirst=True)
problem_rating.create(engine)

df_pr = pd.read_csv('problem_ratings.csv', engine='c')
newcols = [x.lower() for x in df_pr.columns.values]
df_pr.columns = newcols
df_pr.to_sql('problem_rating', engine, if_exists='replace', index=False)

### Rating history

In [3]:
## create table with schema
metadata = MetaData()
user_rating = Table('user_rating', metadata,
    Column('contestid', String, primary_key=True),
    Column('contestname', String, primary_key=True),
    Column('handle', String),
    Column('newrating', Integer),
    Column('oldrating', Integer),
    Column('rank', Integer),
    Column('ratingupdatetimeseconds', Integer)
)
user_rating.drop(engine, checkfirst=True)
user_rating.create(engine)

In [4]:
df_rating = pd.read_csv('rating_histories.csv', engine='c')
newcols = [x.lower() for x in df_rating.columns.values]
df_rating.columns = newcols

In [5]:
df_rating.to_sql('user_rating', engine, if_exists='replace', index=False)

### submissions

In [7]:
# create table with schema
#all_submissions.tsv 
#author	contestID	id	language	memoryBytes	participantType	passedTestCount	points	problem_index	problem_name	problem_tags	relativeTimeSeconds	startTimeSeconds	testset	timeMilliseconds	verdict
#tourist	799	27036616	GNU C++11	19763200	CONTESTANT	33	3500.0	G	Cut the pie	[]	7077	1494523977	TESTS	1294	OK
metadata = MetaData()
submissions = Table('submissions', metadata,
    Column('handle', String),
    Column('id', Integer),
    Column('language', String),
    Column('memoryBytes', Integer),
    Column('participantType', String),
    Column('passedTestCount', Integer),
    Column('points', Integer),
    Column('relativeTimeSeconds', Integer),
    Column('startTimeSeconds', Integer),
    Column('timeMilliseconds', Integer),
    Column('testset', String),
    Column('verdict', String),
    Column('contestID', String),
    Column('problemID', String),
    Column('problemRating', Integer)
)
submissions.drop(engine, checkfirst=True)
submissions.create(engine)

#rename columns
df_sub = pd.read_csv('all_submissions.tsv', sep='\t', engine='c', nrows=1)
df_sub.drop(['problem_tags', 'problem_name'], axis=1, inplace=True)
colnames = df_sub.columns.values
colnames[0] = 'handle'
colnames[8] = 'problemID'

# read file in batches
#end = 19307027
step = 1000
for df_sub in pd.read_csv('all_submissions.tsv', sep='\t', engine='c', chunksize=step):
    # drop columns that are unnecessary
    df_sub.drop(['problem_tags', 'problem_name'], axis=1, inplace=True)
    df_sub.columns = colnames
    df_sub.to_sql('submissions', engine, if_exists='append', index=False)

## store user handles

In [11]:
with open('all_handles.txt') as f:
    handles = [line.strip() for line in f.readlines()]
df_handles = pd.DataFrame.from_dict({'handle': handles})
df_handles.to_sql('handles', engine, if_exists='replace', index=False)