#  Codechef Competitive Programming Analysis

In [1]:
import pandas as pd
from pathlib import Path

DATA_DIR = Path('.') / '../data/codechef'

## Datasets

In [2]:
questions_df  = pd.read_csv(DATA_DIR / 'questions.csv', index_col=[0])
solutions_df = pd.read_csv(DATA_DIR / 'solutions.csv', index_col=[1])
codes_df_1 = pd.read_csv(DATA_DIR / 'program_codes' / 'first.csv', index_col=[0])
codes_df_2 = pd.read_csv(DATA_DIR / 'program_codes' / 'second.csv', index_col=[0])
codes_df_3 = pd.read_csv(DATA_DIR / 'program_codes' / 'third.csv', index_col=[0])

In [3]:
questions_df.head()

Unnamed: 0_level_0,Title,link,level,statement,Author,Tester,Editorial,Tags,Date Added,Time Limit,Source Limit,Languages
QCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
SEQUENCE,Integer Sequences,/problems/SEQUENCE,medium,\nAll submissions for this problem are availab...,friggstad,pieguy,http://discuss.codechef.com/problems/SEQUENCE,"['friggstad', 'cook07', 'medium']",30-01-2011</td> <,4 sec</td> <,50000 Bytes</td> <,"<td>ADA, ASM, BASH, BF, C, C99 str..."
CHEFTEAM,Chef team,/problems/CHEFTEAM,easy,\nAll submissions for this problem are availab...,anton_lunyov,friggstad,http://discuss.codechef.com/problems/CHEFTEAM,"['anton_lunyov', 'cook06', 'easy']",14-01-2011</td> <,1 sec</td> <,50000 Bytes</td> <,"<td>ADA, ASM, BASH, BF, C, C99 str..."
SEATR,Sereja and Tree,/problems/SEATR,medium,\nAll submissions for this problem are availab...,sereja,iscsi,http://discuss.codechef.com/problems/SEATR,"['mathematics', 'sereja', 'dynamic-prog', 'mem...",6-11-2015</td> <,10 sec</td> <,50000 Bytes</td> <,"<td>ADA, ASM, BASH, BF, C, C99 str..."
FLOW017,Second Largest,/problems/FLOW017,beginner,\nAll submissions for this problem are availab...,vicky002,,,['vicky002'],27-04-2015</td> <,1 sec</td> <,50000 Bytes</td> <,"<td>ADA, ASM, BASH, BF, C, C99 str..."
CFRTEST,Devu and friendship testing,/problems/CFRTEST,beginner,\nAll submissions for this problem are availab...,admin2,,http://discuss.codechef.com/problems/CFRTEST,"['admin2', 'cook58', 'cakewalk']",11-05-2015</td> <,1 sec</td> <,50000 Bytes</td> <,"<td>ADA, ASM, BASH, BF, C, C99 str..."


In [4]:
solutions_df.loc[solutions_df.index.isin(codes_df_1.index), 'LineCount'] = codes_df_1['Solutions'].astype(str).apply(lambda code: code.count('\n'))
solutions_df.loc[solutions_df.index.isin(codes_df_2.index), 'LineCount'] = codes_df_2['Solutions'].astype(str).apply(lambda code: code.count('\n'))
solutions_df.loc[solutions_df.index.isin(codes_df_3.index), 'LineCount'] = codes_df_3['Solutions'].astype(str).apply(lambda code: code.count('\n'))

In [5]:
solutions_df.head()

Unnamed: 0_level_0,QCode,timeago,UserID,Status,TimeTaken,MemTaken,Language,SolutionUrl,LineCount
SolutionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
S11986970,A1,07:58 PM 31/10/16,/users/ismail_2311,accepted,1.43,2M,C,/viewplaintext/11986970,51.0
S11990433,A1,07:51 PM 01/11/16,/users/mohit_15,wrong answer,0.00,2M,C,/viewplaintext/11990433,83.0
S11990440,A1,07:54 PM 01/11/16,/users/mohit_15,accepted,0.00,2M,C,/viewplaintext/11990440,83.0
S11995888,A1,12:17 PM 03/11/16,/users/saloni1907,compilation error,-,-,C++ 4.3.2,/viewplaintext/11995888,37.0
S11995889,A1,12:18 PM 03/11/16,/users/saloni1907,accepted,0.00,2.7M,C++ 4.3.2,/viewplaintext/11995889,40.0


## What tags that the questions appear with are users most interested in, i.e. recursion, DP, etc.? 

In [6]:
questions_df['SubmissionCount'] = solutions_df.groupby('QCode')['QCode'].count()
questions_df_with_count = questions_df.dropna(subset=['SubmissionCount']).copy()
questions_df_with_count['Tags'] = questions_df_with_count['Tags'].apply(lambda tags: eval(tags))

In [7]:
tag_count = questions_df_with_count.explode('Tags').groupby('Tags')['SubmissionCount'].sum().sort_values(ascending=False)[:10]

In [8]:
tag_count.head(10)

Tags
easy           197163.0
admin          186587.0
cakewalk       139867.0
ad-hoc          75895.0
array           71909.0
simple          56443.0
simple-math     48350.0
dp              42860.0
furko           38273.0
medium          31999.0
Name: SubmissionCount, dtype: float64

## Which languages are users most comfortable with? 

In [9]:
language_count = solutions_df.reset_index()[["UserID", "Language"]].drop_duplicates().groupby("Language")['UserID'].count().sort_values(ascending=False)[:10]
language_count.head(10)

Language
C              24385
C++ 4.3.2      15341
C++ 4.9.2      15076
JAVA           11176
C++14           7837
C++ 4.8.1       4592
PYTH            3936
C++11           1821
C++ 4.0.0-8     1738
PYTH 3.1.2      1431
Name: UserID, dtype: int64

## Which languages are likely to have more compilation errors before successful submission?

In [10]:
valid_state = ['accepted', 'wrong answer', 'internal_error', 'running..', 'compiling..', 'running judge..']
solutions_df_valid_state = solutions_df.dropna(subset=['Status']).reset_index()
invalid_state_count = solutions_df_valid_state[~solutions_df_valid_state['Status'].isin(valid_state)].groupby(['Status', 'Language'])['SolutionID'].count().reset_index()
invalid_state_count.head()

Unnamed: 0,Status,Language,SolutionID
0,compilation error,ADA,1578
1,compilation error,ASM,6
2,compilation error,C,14544
3,compilation error,C#,407
4,compilation error,C++,200


## Among the successful submissions, how many lines are used per language? What about space and time complexity?

In [11]:
solutions_df_successful = solutions_df[solutions_df['Status'] == 'accepted'].join(questions_df['level'], on='QCode')
solutions_df_successful['TimeTaken'] = solutions_df_successful['TimeTaken'].astype(float)
solutions_df_successful['MemTaken'] = solutions_df_successful['MemTaken'].apply(lambda mem: mem.replace('M', '')).astype(float)
solutions_df_successful.groupby(['level', 'Language']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,TimeTaken,MemTaken,LineCount
level,Language,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
beginner,ADA,0.070000,5.550000,33.500000
beginner,ASM,0.188571,11.742857,245.857143
beginner,BASH,1.046250,5.312500,10.750000
beginner,C,0.110575,4.915943,38.164942
beginner,C#,1.006133,37.133813,42.620504
...,...,...,...,...
medium,PYPY,0.215000,46.500000,23.500000
medium,PYTH,0.778036,7.541071,45.696429
medium,PYTH 3.1.2,0.255333,8.573333,33.466667
medium,PYTH 3.4,8.940000,8.700000,30.000000


## What's the acceptance rate per question difficulty?

In [12]:
solutions_df_level = solutions_df.join(questions_df['level'], on='QCode')
solutions_df_level['IsAccepted'] = solutions_df_level['Status'] == 'accepted'
solutions_df_level_acceptance = solutions_df_level.groupby(['level', 'IsAccepted'])['UserID'].count().reset_index()
solutions_df_level_acceptance_true = solutions_df_level_acceptance[solutions_df_level_acceptance['IsAccepted'] == True].set_index('level')['UserID'] 
solutions_df_level_acceptance_false = solutions_df_level_acceptance[solutions_df_level_acceptance['IsAccepted'] == False].set_index('level')['UserID'] 
rate = solutions_df_level_acceptance_true / (solutions_df_level_acceptance_true + solutions_df_level_acceptance_false)
rate.sort_values(ascending=False)

level
easy         0.269053
beginner     0.215812
hard         0.174256
medium       0.133455
challenge    0.014288
Name: UserID, dtype: float64

In [13]:
questions_df.to_csv('questions.csv')

In [14]:
solutions_df.to_csv('solutions.csv')