In [1]:
!mkdir covid
!mkdir pre_covid

In [2]:
import os
import re

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.notebook import tqdm

In [3]:
tqdm.pandas()

# Helper Functions

In [4]:
TAG_RE = re.compile('<[^>]*>')

# Currently unused:
CODE_RE = re.compile(r'<code>((.|\n|\r)*)</code>')
SPACE_RE = re.compile(r"(\n|\r)")
TAB_RE = re.compile(r"\t")


def clean(text):
    text = re.sub(TAG_RE, '', text)    
    return text

In [5]:
def negative_sampling(positives, negatives, size=1, random_state=0):
    """
    if you are sampling negatives within the same array, simply run
    `negative_sampling(positives, positives)`.
    
    input:
        positives (np.array): We are iterating through this array
        negatives (np.array): The negative candidates from which we will sample
        size (int): number of negative samples for every positive
        random_state (int): seed for numpy's random
    returns:
        choices (np.array): All the sampled negatives (i.e. they don't
        have the same value as the currently sampled positive)
    """
    np.random.seed(random_state)
    choices = []

    for val in tqdm(positives, leave=False):
        rest = negatives[negatives != val]
        choice = np.random.choice(rest, size=size, replace=False)
        choices.append(choice)

    return np.stack(choices).squeeze()

# Load Data

In [6]:
group = 'biomedical'

In [7]:
site = 'biology'
qdf = pd.read_csv(f'/kaggle/input/stackexchange-raw-qa/{group}/{site}_questions.csv')
adf = pd.read_csv(f'/kaggle/input/stackexchange-raw-qa/{group}/{site}_answers.csv')

qdf.Body = qdf.Body.apply(clean)
adf.Body = adf.Body.apply(clean)

# First find the good and bad answers

The good answers are the ones that have a score of 2 or more. the bad one have a score of 0 or less.

In [8]:
accepted_answer_ids = qdf.AcceptedAnswerId.dropna().unique()

In [9]:
best_answers = (
    adf
    .sort_values(by=['Score'], ascending=False)
    .drop_duplicates('ParentId')
    .query("Score > 1")
)
print(best_answers.shape)

bad_answers = (
    adf
    .loc[~adf.Id.isin(accepted_answer_ids)] # An accepted answer can't be bad
    .sort_values(by=['Score'], ascending=True) # Take the one with the worst score
    .drop_duplicates('ParentId') # Drop every other options
    .query("Score < 1")
    .loc[:, ['ParentId', 'Body']]
    .rename(columns={'Body': 'WrongAnswer'})   
)
bad_answers['WrongAnswerType'] = 'Bad Answers'
print(bad_answers.shape)

(13406, 6)
(2796, 3)


# Merge with Accepted answers

If a question has an accepted answer, we merge that answer with the question dataframe.

In [10]:
accepted = (
    qdf.merge(
        adf, how='inner', 
        left_on='AcceptedAnswerId', right_on='Id', 
        suffixes=("Question", "Answer"))
    .drop(columns=['ParentId', 'AcceptedAnswerId'])
)
accepted['AnswerType'] = 'Accepted'
print(accepted.shape)
accepted.head()

(10522, 12)


Unnamed: 0,IdQuestion,Title,BodyQuestion,CreationDateQuestion,ScoreQuestion,OwnerUserIdQuestion,IdAnswer,BodyAnswer,CreationDateAnswer,ScoreAnswer,OwnerUserIdAnswer,AnswerType
0,13408,Why are interspecies hybrids so often larger t...,Recently doing a little reading on interspecie...,2013-11-10 03:15:02,5,4884.0,13409,While they do not necessarily constitute the m...,2013-11-10 03:59:39,4,4882.0,Accepted
1,13412,Body's decomposition,Does a human body decompose in a completely st...,2013-11-10 12:12:54,1,4889.0,13424,\n Does a human body decompose in a completel...,2013-11-11 12:07:13,1,3340.0,Accepted
2,13414,A weird-looking fish with a shield,I found this photo in a local news page. The p...,2013-11-10 17:01:13,17,,13443,The front part of the fish looks like part of ...,2013-11-12 08:40:36,11,1486.0,Accepted
3,13421,Why does caffeine consumption cause one to uri...,What is the physiological mechanism behind the...,2013-11-11 07:23:27,4,1472.0,13431,Caffeine inhibits the secretion of the antidiu...,2013-11-11 17:32:45,6,4025.0,Accepted
4,13425,Telomere shortening during replication,It is widely know that each cell cycle during ...,2013-11-11 12:49:24,3,3657.0,13434,"In replication, both the chromosomal halves (w...",2013-11-11 18:46:42,3,3615.0,Accepted


# Merge with Reasonable Answers

If a question doesn't have an accepted answer, we merge the question dataframe with the best (most upvoted) answer available.

In [11]:
reasonable = (
    qdf
    .loc[qdf.AcceptedAnswerId.isna()]
    .merge(
        best_answers, how='inner', 
        left_on='Id', right_on='ParentId', 
        suffixes=("Question", "Answer"))
    .drop(columns=['ParentId', 'AcceptedAnswerId'])
)
reasonable['AnswerType'] = 'Reasonable'
print(reasonable.shape)
reasonable.head()

(5032, 12)


Unnamed: 0,IdQuestion,Title,BodyQuestion,CreationDateQuestion,ScoreQuestion,OwnerUserIdQuestion,IdAnswer,BodyAnswer,CreationDateAnswer,ScoreAnswer,OwnerUserIdAnswer,AnswerType
0,13415,What makes a DNA sequence motif a DNA sequence...,"I've read about DNA sequence motifs, but still...",2013-11-10 19:46:14,7,284.0,13422,Motif means a pattern. For any array of object...,2013-11-11 07:56:45,4,3340.0,Reasonable
1,13416,Why are antibodies so expensive?,The amplification on vector transgenic microor...,2013-11-10 19:52:47,3,3657.0,14079,Making antibodies is mostly nothing you do wit...,2013-12-20 08:04:32,6,5144.0,Reasonable
2,13418,Why is the 4.4 kb band barely visible in this ...,Why is the 4.4 kb band barely visible? \n\nAls...,2013-11-10 22:34:27,1,4892.0,13428,"First of all, the 4.4 kb HindIII band is deriv...",2013-11-11 14:35:21,2,1320.0,Reasonable
3,13420,explanation of conjugation experiment in bacteria,In this paper (On the expression of a structur...,2013-11-11 00:37:00,1,1693.0,13437,There are three variables being shown here: \n...,2013-11-11 23:02:20,2,4840.0,Reasonable
4,13436,Why can CO₂ be transported dissolved in plasma?,Is it because CO2 is more soluble in water or ...,2013-11-11 19:09:15,1,3615.0,13450,The bulk of CO2 is transported as bicarbonate/...,2013-11-12 14:19:00,3,4908.0,Reasonable


# Combine results

Now, we combine both the accepted qa dataframe with the reasonable qa dataframe. We will also merge that combined dataframe with the bad answers we defined earlier, so that we have both a correct (accepted) and incorrect (low votes) answer for every question. Note that the new wrong answer column will have missing values; this will be corrected in the next cell.

In [12]:
combined = pd.concat([accepted, reasonable]).reset_index(drop=True)
combined = (
    combined
    .merge(
        bad_answers, how='left',
        left_on='IdQuestion', right_on='ParentId')
    .drop(columns=['ParentId'])
)

Since not all qa pairs will have a "bad" answer, for the rows were the "wrong answer" is missing, we randomly sample a negative example from the set of answers.

In [13]:
unique_ans = adf.drop_duplicates('ParentId')
negative_ids = negative_sampling(combined.IdQuestion.values, unique_ans.ParentId.values)

wrong_answers = (
    unique_ans.set_index("ParentId")
    .loc[negative_ids, 'Body']
    .reset_index(drop=True)
)

HBox(children=(FloatProgress(value=0.0, max=15554.0), HTML(value='')))



In [14]:
combined.WrongAnswerType.fillna("Random", inplace=True)
combined.WrongAnswer.fillna(wrong_answers, inplace=True)

# Post processing

Rename columns, remove columns that are not useful

In [15]:
combined['site'] = site

In [16]:
combined.drop(
    columns=[
        'CreationDateQuestion', 'ScoreQuestion', 
        'OwnerUserIdQuestion', 'CreationDateAnswer', 
        'ScoreAnswer'
    ], inplace=True
)

In [17]:
combined.rename(inplace=True, columns={
    'IdQuestion': 'question_id',
    'Title': 'title',
    'BodyQuestion': 'question',
    'IdAnswer': 'answer_id',
    'BodyAnswer': 'answer',
    'WrongAnswer': 'wrong_answer',
    'AnswerType': 'answer_type',
    'WrongAnswerType': 'wrong_answer_type'
})

# Preprocess all stackexchange

In [18]:
def process_qa_dataframes(qdf, adf, site):
    """
    given a question and an answer dataframes, we process them 
    and combine them, then add negative answers, and format it.
    """
    accepted_answer_ids = qdf.AcceptedAnswerId.dropna().unique()

    best_answers = (
        adf
        .sort_values(by=['Score'], ascending=False)
        .drop_duplicates('ParentId')
        .query("Score > 1")
    )

    bad_answers = (
        adf
        .loc[~adf.Id.isin(accepted_answer_ids)] # An accepted answer can't be bad
        .sort_values(by=['Score'], ascending=True) # Take the one with the worst score
        .drop_duplicates('ParentId') # Drop every other options
        .query("Score < 1")
        .loc[:, ['ParentId', 'Body']]
        .rename(columns={'Body': 'WrongAnswer'})   
    )
    bad_answers['WrongAnswerType'] = 'Bad Answers'

    # Merge with Accepted answers
    accepted = (
        qdf.merge(
            adf, how='inner', 
            left_on='AcceptedAnswerId', right_on='Id', 
            suffixes=("Question", "Answer"))
        .drop(columns=['ParentId', 'AcceptedAnswerId'])
    )
    accepted['AnswerType'] = 'Accepted'

    # Merge with Reasonable Answers
    reasonable = (
        qdf
        .loc[qdf.AcceptedAnswerId.isna()]
        .merge(
            best_answers, how='inner', 
            left_on='Id', right_on='ParentId', 
            suffixes=("Question", "Answer"))
        .drop(columns=['ParentId', 'AcceptedAnswerId'])
    )
    reasonable['AnswerType'] = 'Reasonable'

    # Combine results
    combined = pd.concat([accepted, reasonable]).reset_index(drop=True)
    combined = (
        combined
        .merge(
            bad_answers, how='left',
            left_on='IdQuestion', right_on='ParentId')
        .drop(columns=['ParentId'])
    )
    
    # Random sampling for rows that do not have "bad" answers
    unique_ans = adf.drop_duplicates('ParentId')
    negative_ids = negative_sampling(combined.IdQuestion.values, unique_ans.ParentId.values)

    wrong_answers = (
        unique_ans.set_index("ParentId")
        .loc[negative_ids, 'Body']
        .reset_index(drop=True)
    )

    combined.WrongAnswerType.fillna("Random", inplace=True)
    combined.WrongAnswer.fillna(wrong_answers, inplace=True)

    # Post processing
    combined['site'] = site

    combined.drop(
        columns=[
            'CreationDateQuestion', 'ScoreQuestion', 
            'OwnerUserIdQuestion', 'CreationDateAnswer', 
            'ScoreAnswer', 'OwnerUserIdAnswer'
        ], inplace=True
    )

    combined.rename(inplace=True, columns={
        'IdQuestion': 'question_id',
        'Title': 'title',
        'BodyQuestion': 'question',
        'IdAnswer': 'answer_id',
        'BodyAnswer': 'answer',
        'WrongAnswer': 'wrong_answer',
        'AnswerType': 'answer_type',
        'WrongAnswerType': 'wrong_answer_type'
    })
    
    return combined

# Process each group

In [19]:
dfs = {}
for group in ['biomedical', 'expert', 'general']:
    sites = list(set([x.split("_")[0] for x in os.listdir(f'/kaggle/input/stackexchange-raw-qa/{group}/')]))

    combined_ls = []

    for site in tqdm(sites, desc=group):
        qdf = pd.read_csv(f'/kaggle/input/stackexchange-raw-qa/{group}/{site}_questions.csv')
        adf = pd.read_csv(f'/kaggle/input/stackexchange-raw-qa/{group}/{site}_answers.csv')

        qdf.Body = qdf.Body.apply(clean)
        adf.Body = adf.Body.apply(clean)

        combined = process_qa_dataframes(qdf, adf, site)
        combined_ls.append(combined)

    all_df = pd.concat(combined_ls)
    dfs[group] = all_df

HBox(children=(FloatProgress(value=0.0, description='biomedical', max=3.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, max=15554.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3367.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1976.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='expert', max=12.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, max=6587.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=16405.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=10649.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=8794.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4385.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=22656.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=22318.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=9922.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=7980.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=14371.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=5532.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=10419.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='general', max=16.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, max=6739.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4142.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=5503.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3334.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3380.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=18551.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3531.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=6303.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=16783.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=10665.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4624.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=10125.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=8316.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=9188.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=12537.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=9473.0), HTML(value='')))




In [20]:
for group in ['biomedical', 'expert', 'general']:
    df = dfs[group]
    text = df.question.str.lower() + df.title.str.lower() + df.answer.str.lower()

    mask = (
        text.str.contains('covid') |
        text.str.contains('coronavirus') |
        text.str.contains('corona virus') |
        text.str.contains('2019-ncov') |
        text.str.contains('2019 ncov') |
        text.str.contains('sars-cov') |
        text.str.contains('sars cov')
    )
    
    df['group'] = group
    df[mask].to_csv(f"covid/{group}.csv")
    df[~mask].to_csv(f"pre_covid/{group}.csv")
    print(df[mask].shape)

(196, 10)
(146, 10)
(300, 10)


In [21]:
dfs

{'biomedical':       question_id                                              title  \
 0           13408  Why are interspecies hybrids so often larger t...   
 1           13412                               Body's decomposition   
 2           13414                 A weird-looking fish with a shield   
 3           13421  Why does caffeine consumption cause one to uri...   
 4           13425             Telomere shortening during replication   
 ...           ...                                                ...   
 1971         7292  Pipeline for extracting gene from multiple gen...   
 1972         7305  Differential expression analysis when nested e...   
 1973         7311  How to extract gene expression tables from thi...   
 1974         7318  RNA-seq analysis of mixed viral/host reads wit...   
 1975        11345  How to translate amino acid sequences to Nucle...   
 
                                                question  answer_id  \
 0     Recently doing a little readin