In [96]:
import pandas as pd, numpy as np
import json

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

In [82]:
folder = "data/predict_missing_grade/"
# Read training data.
# File layout, as consumed below: the first line holds the record count and
# each of the following lines holds one JSON object (one row of the frame).
with open(folder + 'training.json') as train:
    # readline() hands back exactly one line, which is what the header and
    # every record need; readlines(1)[0] only worked through the size hint.
    # int() and json.loads() both tolerate the trailing newline, so no
    # explicit strip() is required.
    n = int(train.readline())
    train_json = [json.loads(train.readline()) for _ in range(n)]

# One row per record; a key missing from a record becomes NaN in that row.
train_all = pd.DataFrame(train_json)
len(train_all), train_all.head()

(79465,
    Physics  Chemistry  PhysicalEducation  English  Mathematics  serial  \
 0      8.0        7.0                3.0        4            6  195490   
 1      1.0        1.0                1.0        3            3  190869   
 2      1.0        2.0                2.0        1            2    3111   
 3      8.0        7.0                6.0        7            7   47738   
 4      1.0        1.0                1.0        3            2   85520   
 
    Biology  Accountancy  BusinessStudies  Economics  ComputerScience  
 0      NaN          NaN              NaN        NaN              NaN  
 1      NaN          NaN              NaN        NaN              NaN  
 2      NaN          NaN              NaN        NaN              NaN  
 3      NaN          NaN              NaN        NaN              NaN  
 4      NaN          NaN              NaN        NaN              NaN  )

In [83]:
# Mean-impute: every missing value is replaced by the mean of its own column
# over the training set, then the rows are keyed by the 'serial' column.
train_filled = (
    train_all
    .fillna(train_all.mean())
    .set_index('serial')
)
train_filled.head()

Unnamed: 0_level_0,Physics,Chemistry,PhysicalEducation,English,Mathematics,Biology,Accountancy,BusinessStudies,Economics,ComputerScience
serial,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
195490,8.0,7.0,3.0,4,6,3.599004,2.613672,2.893751,2.469142,3.87635
190869,1.0,1.0,1.0,3,3,3.599004,2.613672,2.893751,2.469142,3.87635
3111,1.0,2.0,2.0,1,2,3.599004,2.613672,2.893751,2.469142,3.87635
47738,8.0,7.0,6.0,7,7,3.599004,2.613672,2.893751,2.469142,3.87635
85520,1.0,1.0,1.0,3,2,3.599004,2.613672,2.893751,2.469142,3.87635


In [84]:
# Feature columns: every subject except 'Mathematics', which is the target.
# The order of this list fixes the column order fed to the model.
train_subj = [
    'Physics',
    'Chemistry',
    'PhysicalEducation',
    'English',
    'Biology',
    'Accountancy',
    'BusinessStudies',
    'Economics',
    'ComputerScience',
]
X_train = train_filled.loc[:, train_subj]
y_train = train_filled.loc[:, 'Mathematics']
X_train.head(), y_train.head()

(        Physics  Chemistry  PhysicalEducation  English   Biology  Accountancy  \
 serial                                                                          
 195490      8.0        7.0                3.0        4  3.599004     2.613672   
 190869      1.0        1.0                1.0        3  3.599004     2.613672   
 3111        1.0        2.0                2.0        1  3.599004     2.613672   
 47738       8.0        7.0                6.0        7  3.599004     2.613672   
 85520       1.0        1.0                1.0        3  3.599004     2.613672   
 
         BusinessStudies  Economics  ComputerScience  
 serial                                               
 195490         2.893751   2.469142          3.87635  
 190869         2.893751   2.469142          3.87635  
 3111           2.893751   2.469142          3.87635  
 47738          2.893751   2.469142          3.87635  
 85520          2.893751   2.469142          3.87635  , serial
 195490    6
 190869    3
 3111

In [85]:
# Read test cases data from file.
# Same layout as the training file: a record count on the first line,
# followed by that many lines each holding one JSON object.
with open(folder + 'sample-test.in.json') as test:
    # readline() returns exactly one line; readlines(1)[0] only did so via
    # the size hint. int() and json.loads() ignore the trailing newline.
    n = int(test.readline())
    test_json = [json.loads(test.readline()) for _ in range(n)]

# One row per record; a key missing from a record becomes NaN in that row.
test_all = pd.DataFrame(test_json)
len(test_all), test_all.head()

(69530,
    Physics  Chemistry  Biology  English  serial  ComputerScience  Accountancy  \
 0      2.0        2.0      1.0        1  221375              NaN          NaN   
 1      3.0        3.0      NaN        4  150188              4.0          NaN   
 2      NaN        NaN      NaN        1   12154              NaN          1.0   
 3      2.0        2.0      NaN        1   31442              2.0          NaN   
 4      NaN        NaN      NaN        2  137253              NaN          5.0   
 
    BusinessStudies  Economics  PhysicalEducation  
 0              NaN        NaN                NaN  
 1              NaN        NaN                NaN  
 2              1.0        3.0                NaN  
 3              NaN        NaN                NaN  
 4              4.0        3.0                NaN  )

In [None]:
# Read test cases data from STDIN
# First input line is the number of records; each later line is one JSON object.
n = int(input())
test_json = [json.loads(input()) for _ in range(n)]

# Note: this rebinds test_all, replacing any frame loaded by an earlier cell.
test_all = pd.DataFrame(test_json)
len(test_all), test_all.head()

In [87]:
# Impute the test set with the TRAINING means so that test features are
# filled with the same values the model saw during fitting.
train_means = train_all.mean()
test_filled = test_all.fillna(train_means)

# pd.DataFrame(list_of_dicts) only creates columns for keys that occur in at
# least one record, so a subject absent from every test record would make
# test_filled[train_subj] raise KeyError. reindex() adds any such column as
# NaN (and fixes the column order to match X_train); the follow-up fillna
# gives those columns the training mean as well. When every subject is
# already present this yields the same frame as test_filled[train_subj].
X_test = (
    test_filled
    .reindex(columns=train_subj)
    .fillna(train_means[train_subj])
)
X_test.head()

Unnamed: 0,Physics,Chemistry,PhysicalEducation,English,Biology,Accountancy,BusinessStudies,Economics,ComputerScience
0,2.0,2.0,3.186032,1,1.0,2.613672,2.893751,2.469142,3.87635
1,3.0,3.0,3.186032,4,3.599004,2.613672,2.893751,2.469142,4.0
2,3.992593,3.983971,3.186032,1,3.599004,1.0,1.0,3.0,3.87635
3,2.0,2.0,3.186032,1,3.599004,2.613672,2.893751,2.469142,2.0
4,3.992593,3.983971,3.186032,2,3.599004,5.0,4.0,3.0,3.87635


In [97]:
%%time
clf = Pipeline([
    ('std', StandardScaler()),
    ('clf', DecisionTreeRegressor())
])

clf.fit(X_train, y_train)
np.mean(np.abs(np.round(clf.predict(X_train)) - y_train))

CPU times: user 599 ms, sys: 13.2 ms, total: 612 ms
Wall time: 178 ms


1.0214056502862896

In [90]:
# Write one predicted grade per line: each prediction is rounded to the
# nearest whole number and printed as a plain integer.
for grade in map(int, np.round(clf.predict(X_test))):
    print(grade)

array([3., 4., 3., ..., 4., 5., 5.])