# Baseline #

Первое решение я решил строить на основе датасета "SciQ Dataset"
https://allenai.org/data/sciq

In [6]:
import json
import random
from sklearn.model_selection import cross_val_score

Загружаем данные:

In [7]:
PATH_TO_SCIQ_DATASET = 'data/SciQ dataset/'
TEST_FILENAME = 'test.json'
TRAIN_FILENAME = 'train.json'
VALIDATION_FILENAME = 'valid.json'

with open(PATH_TO_SCIQ_DATASET+TEST_FILENAME,'r') as json_file:
    test_data =  json.load(json_file)

with open(PATH_TO_SCIQ_DATASET+TRAIN_FILENAME,'r') as json_file:
    train_data =  json.load(json_file)

with open(PATH_TO_SCIQ_DATASET+VALIDATION_FILENAME,'r') as json_file:
    validation_data =  json.load(json_file)

    
print('test_data:',len(test_data))
print('train_data:',len(train_data))
print('validation_data:',len(validation_data))

test_data: 1000
train_data: 11679
validation_data: 1000


```json5
{
    "question": "Compounds that are capable of accepting electrons, such as o 2 or f2, are called what?",
    "distractor3": "residues",
    "distractor1": "antioxidants",
    "distractor2": "Oxygen",
    "correct_answer": "oxidants",
    "support": "Oxidants and Reductants Compounds that are capable of accepting electrons, such as O 2 or F2, are calledoxidants (or oxidizing agents) because they can oxidize other compounds. In the process of accepting electrons, an oxidant is reduced. Compounds that are capable of donating electrons, such as sodium metal or cyclohexane (C6H12), are calledreductants (or reducing agents) because they can cause the reduction of another compound. In the process of donating electrons, a reductant is oxidized. These relationships are summarized in Equation 3.30: Equation 3.30 Saylor URL: http://www. saylor. org/books."
}
```
Я преобразую данные в два вектора: в первом будут варианты ответов (distractor1, distractor2, distractor3, correct_answer) и фичи, которые я извлеку из support. Например, встречается ли каждый из этих вариантов ответа в тексте?
Вторым вектором будет вектор метор, где `True` означает правильный ответ, а `False` - дистрактор: [False False False True]
Также перед добавлением вариантов ответа в общий вектор, я буду их перемешивать.

In [8]:

def get_answer_features(answer, support):
    features = {}
    if support.lower().find(answer.lower())>-1:
        features['present_in_support'] = True
    else:
        features['present_in_support'] = False
    return features

def get_question_features(question_data):
    
    question = question_data["question"]
    distractors = [question_data["distractor1"], question_data["distractor2"],question_data["distractor3"]]
    correct_answer = question_data["correct_answer"]
    support = question_data["support"]
    answers = distractors + [correct_answer]
    random.shuffle(answers)
    
    features = []
    labels = []
    
    for answer in answers:
        features.append(get_answer_features(answer, support))
        if answer == correct_answer:
            labels.append(True)
        else:
            labels.append(False)
        
    return (features,labels)

get_question_features(test_data[105])

([{'present_in_support': False},
  {'present_in_support': False},
  {'present_in_support': True},
  {'present_in_support': True}],
 [False, False, True, False])

In [9]:
train_data_features = []
train_data_labels = []
test_data_features = []
test_data_labels = []

for question in train_data:
    local_data = get_question_features(question)
    train_data_features.extend(local_data[0])
    train_data_labels.extend(local_data[1])

for question in test_data:
    local_data = get_question_features(question)
    test_data_features.extend(local_data[0])
    test_data_labels.extend(local_data[1])

import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

vectorizer = DictVectorizer()
vec = vectorizer.fit(train_data_features)
#print("Total number of features: ", len(vec.get_feature_names()))

train_features_vectorized = vec.transform(train_data_features)
test_features_vectorized = vec.transform(test_data_features)

lrc = LogisticRegression(random_state=42, solver="sag", multi_class="multinomial",
                             max_iter=50, verbose=1)

lrc.fit(train_features_vectorized, train_data_labels)

scores = cross_val_score(lrc, train_features_vectorized, train_data_labels, cv=5)

print ('Cross validation:')
print (scores)

print ('classification_report:')
predicted_labels = lrc.predict(test_features_vectorized)
print(classification_report(test_data_labels, predicted_labels))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


convergence after 17 epochs took 0 seconds
convergence after 17 epochs took 0 seconds
convergence after 19 epochs took 1 seconds
convergence after 15 epochs took 0 seconds
convergence after 16 epochs took 0 seconds
convergence after 17 epochs took 0 seconds
Cross validation:
[0.92583476 0.92550573 0.92614792 0.92122445 0.92518463]
classification_report:
              precision    recall  f1-score   support

       False       0.94      0.95      0.95      2998
        True       0.86      0.82      0.84      1002

    accuracy                           0.92      4000
   macro avg       0.90      0.89      0.89      4000
weighted avg       0.92      0.92      0.92      4000



## Вывод ##
В датасете SciQ в большинстве "support" предложений дистракторы отсутвуют, а правильный вариант ответа записан абсолютно так же, как он указан в вопросах.
Поэтому простой фичи "есть ли вариант ответа в "support" предложении" достаочно для получения довольно высокой точности.

## Следующие шаги ##

1. Написать парсер, который найдет тексты, из которых взяты "support" предложения и дополнить ими датасет.
2. Натренировать модель находить support sentence для вопросов "What is/are ..."  