In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Names of the text columns: the question followed by the answer options A-D.
tcn = [
    'question',
    'answerA',
    'answerB',
    'answerC',
    'answerD',
]

def read(path):
    """Load a tab-separated question file into a DataFrame indexed by ``id``.

    Parameters
    ----------
    path : str, path-like or file-like
        Location of the TSV file. It must contain an ``id`` column.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by the ``id`` values (cast to ``np.uint32`` before
        becoming the index). When the file has a ``correctAnswer`` column it
        is converted to ``category`` dtype and moved to the first column
        position; the remaining columns keep their file order.
    """
    # The separator must be passed by keyword: pandas 2.0 accepts only the
    # path positionally, so ``read_csv(path, '\t')`` raises TypeError there.
    td = pd.read_csv(path, sep='\t')

    td['id'] = td['id'].astype(np.uint32)
    td = td.set_index('id')

    if 'correctAnswer' in td:
        # pop + insert moves the label column to the front in one step.
        correct_answers = td.pop('correctAnswer')
        td.insert(0, 'correctAnswer', correct_answers.astype('category'))

    return td

def clean(td):
    """Tokenise every text column listed in ``tcn``, in place.

    Each cell is split with NLTK's ``word_tokenize``; tokens are lower-cased,
    and a token is discarded when it is an English stopword or when its last
    character is an ASCII punctuation mark.

    Parameters
    ----------
    td : pandas.DataFrame
        Frame holding the columns named in the module-level ``tcn`` list.

    Returns
    -------
    pandas.DataFrame
        The same ``td`` object, with each ``tcn`` column replaced by lists of
        the surviving tokens.
    """
    from nltk import word_tokenize
    from nltk.corpus import stopwords as stopword_corpus
    from string import punctuation

    # Frozen sets give constant-time membership tests inside the token loop.
    english_stopwords = frozenset(stopword_corpus.words('english'))
    punctuation_chars = frozenset(punctuation)

    def tokenize(text):
        lowered = (raw.lower() for raw in word_tokenize(text))
        return [
            word
            for word in lowered
            # The stopword test comes first, exactly as in the loop form.
            if word not in english_stopwords and word[-1] not in punctuation_chars
        ]

    for column in tcn:
        td[column] = td[column].map(tokenize)

    return td

# Load the training set and tokenise its text columns in a single step.
td = clean(read('data/training_set.tsv'))

# Bare expression so the notebook renders the cleaned frame.
td

Unnamed: 0_level_0,correctAnswer,question,answerA,answerB,answerC,answerD
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100001,C,"[athletes, begin, exercise, heart, rates, resp...","[tissue, level]","[organ, level]","[system, level]","[cellular, level]"
100002,C,"[example, describes, learned, behavior, dog]","[smelling, air, odors]","[barking, disturbed]","[sitting, command]","[digging, soil]"
100003,D,"[two, nuclei, combined, one, nucleus, slight, ...",[conversion],[reaction],[fission],[fusion]
100004,B,"[distinction, epidemic, pandemic]","[symptoms, disease]","[geographical, area, affected]","[species, organisms, infected]","[season, disease, spreads]"
100005,B,"[way, orbit, comet, different, orbit, earth]","[orbit, earth, less, circular, orbit, comet]","[orbit, comet, elliptical, orbit, earth]","[orbital, period, earth, much, longer, orbital...","[orbital, period, comet, predictable, orbital,..."
100006,B,"[teacher, builds, model, hydrogen, atom, red, ...","[number, particles]","[relative, mass, particles]","[types, particles, present]","[charges, particles, present]"
100007,A,"[substance, student, apply, skin, gets, splash...",[water],[vinegar],[salt],[formaldehyde]
100008,A,"[main, source, energy, water, cycle]",[sun],"[fossil, fuels]",[clouds],[ocean]
100009,D,"[greatest, effect, aiding, movement, blood, hu...",[tension],[friction],[density],[gravity]
100010,C,"[time, non-volcanic, mountains, form, due, int...","[oceanic, plates, colliding, oceanic, plates]","[oceanic, plates, separating, oceanic, plates]","[continental, plates, colliding, continental, ...","[continental, plates, separating, continental,..."


In [3]:
vs = 300

def build_feature_extractor(texts):
    """Train a Doc2Vec model on ``texts`` and return a vector-inference function.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenised documents. Each one is tagged with its position in the
        iteration order. The iterable is consumed once, so a generator works.

    Returns
    -------
    callable
        ``extract_features(text)`` takes a token list and returns the model's
        inferred vector cast to ``np.float32``. The vector size comes from
        the module-level ``vs``, read when this function is called.
    """
    from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    from multiprocessing import cpu_count

    tagged_corpus = [
        TaggedDocument(words, [tag]) for tag, words in enumerate(texts)
    ]

    # NOTE(review): gensim 4 renamed the ``size`` keyword to ``vector_size``;
    # confirm which gensim version this notebook is meant to run against.
    doc2vec = Doc2Vec(
        tagged_corpus,
        workers=cpu_count(),
        size=vs,
    )

    def extract_features(text):
        inferred = doc2vec.infer_vector(text)
        return inferred.astype(np.float32)

    return extract_features


# Fit one Doc2Vec model on every tokenised question and answer text.
extract_features = build_feature_extractor(t for cn in tcn for t in td[cn].values)

# Feature column names per text column, e.g. 'question_feature_0'.
tfcn_for = {cn: ['%s_feature_%d' % (cn, i) for i in range(vs)] for cn in tcn}
tfcn = [fcn for cn in tcn for fcn in tfcn_for[cn]]

# Infer the vectors row by row (same call order as a nested row/column loop)
# and concatenate the per-column vectors so each row matches the tfcn layout.
rows = [
    np.concatenate([extract_features(td.at[i, cn]) for cn in tcn])
    for i in tqdm(td.index)
]
# The reshape keeps the (n_rows, n_features) shape even when td is empty.
feature_matrix = np.asarray(rows, dtype=np.float32).reshape(len(rows), len(tfcn))

# Build the frame once from the dense float32 matrix. This replaces the
# all-NaN pre-allocation, the per-cell .loc writes and the to_dense() call,
# which is not available on plain DataFrames in current pandas.
vd = pd.DataFrame(feature_matrix, index=td.index, columns=tfcn)
vd.insert(0, 'correctAnswer', td['correctAnswer'])

vd



Unnamed: 0_level_0,correctAnswer,question_feature_0,question_feature_1,question_feature_2,question_feature_3,question_feature_4,question_feature_5,question_feature_6,question_feature_7,question_feature_8,...,answerD_feature_290,answerD_feature_291,answerD_feature_292,answerD_feature_293,answerD_feature_294,answerD_feature_295,answerD_feature_296,answerD_feature_297,answerD_feature_298,answerD_feature_299
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,C,0.000763,0.001359,0.000566,0.002174,0.001830,-0.003094,0.002476,0.001588,-0.001686,...,0.000325,0.000967,0.000787,-0.000922,0.003456,0.000074,-0.001256,-0.002467,0.001302,-0.004355
100002,C,0.001619,-0.001005,0.002617,-0.001293,0.003531,-0.004172,0.000856,-0.000819,-0.000112,...,0.001915,0.001587,-0.003647,-0.000399,0.015365,0.008554,-0.008736,-0.011325,0.014262,-0.012788
100003,D,0.005422,0.001741,-0.000166,0.000411,0.006842,-0.006424,0.009349,0.003528,-0.003171,...,0.001929,0.005684,-0.006711,0.002336,0.018111,0.013455,-0.012910,-0.017683,0.020024,-0.014608
100004,B,-0.011607,-0.005494,0.000082,-0.004888,-0.010634,0.009740,-0.014794,-0.005652,0.003185,...,0.000205,-0.002328,0.001136,0.001064,-0.009711,-0.006663,0.005646,0.007258,-0.011161,0.007346
100005,B,-0.006478,-0.001189,-0.001648,-0.000657,-0.004810,0.008213,-0.007334,-0.003448,0.001320,...,-0.001302,-0.005400,0.003157,0.000934,-0.009396,-0.004799,0.005506,0.009569,-0.010810,0.005808
100006,B,-0.007345,-0.003251,0.000316,-0.001481,-0.006091,0.009270,-0.006762,-0.001556,0.000309,...,-0.001125,0.001589,-0.003281,-0.000388,0.007701,0.004074,-0.003705,-0.006683,0.007382,-0.008275
100007,A,0.009241,0.001942,0.001383,0.003692,0.005884,-0.008827,0.009193,0.003814,-0.000150,...,0.000204,-0.000123,0.001166,-0.001115,0.000544,0.000086,-0.000286,0.001538,-0.001583,0.001512
100008,A,0.004006,0.000432,-0.000801,0.001913,0.003803,-0.005378,0.005042,0.001650,-0.001006,...,-0.001235,-0.003478,0.004891,-0.001354,-0.017108,-0.010786,0.012504,0.018124,-0.018675,0.016229
100009,D,-0.000311,-0.000957,0.000646,0.000288,-0.001182,0.000948,-0.000552,0.000689,-0.001152,...,0.002501,0.003495,-0.005384,0.001544,0.015768,0.010960,-0.011133,-0.014667,0.018942,-0.015950
100010,C,-0.003114,0.001502,0.000630,0.000386,-0.003454,0.007496,-0.004005,-0.001242,0.001383,...,-0.000411,-0.001527,0.002398,-0.000100,-0.007171,-0.003743,0.003688,0.004514,-0.007346,0.004225


In [None]:
# Per-feature sum of the question vectors over all rows.
question = vd[tfcn_for['question']].sum()

# Per-feature sum of each row's correct-answer vector.
# NOTE(review): the earlier form selected the whole column block (every row)
# for each i and summed without an axis, giving a scalar at O(n^2) memory; it
# also used DataFrame.as_matrix(), which has been removed from pandas. The
# per-row selection below is the assumed intent -- confirm.
correct_answer = np.sum(
    [
        vd.loc[i, tfcn_for['answer' + vd.loc[i, 'correctAnswer']]].values.astype(np.float32)
        for i in vd.index
    ],
    axis=0,
)

In [33]:
from sklearn.svm import LinearSVC
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Older scikit-learn releases only ship the cross_validation module.
    from sklearn.cross_validation import cross_val_score

# Use dedicated names for the data: `vs` is the embedding size read by
# build_feature_extractor and tfcn_for, so rebinding it to a matrix breaks any
# re-run of those cells. `.values` replaces the removed DataFrame.as_matrix().
X = vd.iloc[:, 1:].values          # every column after the label
y = np.asarray(vd.iloc[:, 0])      # first column: the correct answer letter

lsvc = LinearSVC()

# Mean accuracy over 10 cross-validation folds.
cross_val_score(lsvc, X, y, cv=10, scoring='accuracy').mean()

0.25751726313335299