# Import questions 

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import logging
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from datetime import datetime
from pathlib import Path

# Notebook-wide display and logging configuration.
# Render matplotlib figures inline, below the cell that creates them.
%matplotlib inline
# Use seaborn's plain 'white' style for every figure.
sns.set_style('white')
# Suppress log records of severity WARNING and below, for all loggers.
logging.disable(logging.WARNING)

In [18]:
dirpath = Path("data")
# The top-level JSON keys are numeric question ids. With its default
# `convert_axes=True`, `read_json` may parse such labels as epoch timestamps,
# which then have to be converted back to integers (and which breaks as soon
# as pandas decides NOT to treat them as dates). Disabling axis conversion
# keeps the labels as the original strings, so they can be cast directly.
df = pd.read_json(dirpath / "questions.json", convert_axes=False).T
# After the transpose the question ids form the row index; store them as ints.
df.index = df.index.astype("int64")

In [19]:
# Preview the first rows to check the loaded index and columns.
df.head()

Unnamed: 0,sentence,tags
70730831,What&#39;s the mathematical reason behind Pyth...,"[python, c++, python-3.x, rounding, integer-di..."
70793490,How do I calculate square root in Python?,"[python, math, sqrt]"
70837397,"Good alternative to Pandas .append() method, n...","[python, pandas, dataframe, data-wrangling, da..."
70851048,Does it make sense to use Conda + Poetry?,"[python, machine-learning, package, conda, pyt..."
70882092,Can we make 1 == 2 true?,"[python, cpython, python-internals]"


# Preprocessing

In [81]:
import re
import spacy

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

In [112]:
def clean_text(text: str) -> str:
    """
    Replace every character that is not an ASCII letter or digit with a space.

    The substitution is one-for-one, so the result has the same length as the
    input and a run of several rejected characters becomes a run of spaces.

    Args:
        text (str): the string to be cleaned.

    Returns:
        str: the input with all non-alphanumeric characters turned into spaces.
    """
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')
    return non_alphanumeric.sub(' ', text)

# Loaded spaCy pipelines, keyed by package name. `preprocessing` is applied
# row by row, so reloading the model on every call would dominate the runtime.
_NLP_CACHE: dict[str, object] = {}


def _get_nlp(package: str):
    """Return the spaCy pipeline for `package`, loading it on first use only."""
    if package not in _NLP_CACHE:
        _NLP_CACHE[package] = spacy.load(package, disable=["parser", "ner"])
    return _NLP_CACHE[package]


def preprocessing(
        text: str,
        *,
        pos: list[str] = ["NOUN", "ADJ", "VERB", "ADV"],
        kind: str = "bow", # "bow" or "dl"
        lemma: bool = False,
        package: str = "en_core_web_sm",
) -> str:
    """
    Preprocess a text for either a bag-of-words or a deep-learning model.

    The text is always converted to lower case first. Then:

    - kind="dl": every non-alphanumeric character is replaced by a space
      (see `clean_text`) and runs of whitespace are collapsed to a single space.
    - kind="bow": the text is tokenized with spaCy and only the tokens whose
      part-of-speech tag is listed in `pos` are kept, optionally lemmatized.

    Args:
        text (str): the string to be preprocessed
        pos (list[str], optional): the postags to keep in the text (only used when kind is "bow"). Defaults to ["NOUN", "ADJ", "VERB", "ADV"].
        kind (str, optional): the kind of preprocessing, either "bow" (bag-of-words) or "dl" (deep learning). Defaults to "bow".
        lemma (bool, optional): whether to output the lemma of each kept token instead of its text (only valid when kind is "bow"). Defaults to False.
        package (str, optional): the spaCy package to import for NLP (only loaded when kind is "bow"). Defaults to "en_core_web_sm".

    Raises:
        ValueError: unknown kind argument.
        ValueError: incompatible values for lemma and kind.

    Returns:
        str: the preprocessed version of the input text.
    """
    # Manage incorrect arguments before doing any work
    if kind not in ("bow", "dl"):
        raise ValueError(f"Unknown argument: {kind}")
    if kind == "dl" and lemma:
        raise ValueError('Incompatible values for lemma and kind.')

    # For deep learning, only remove the non-alphanumeric characters.
    # split()/join collapses runs of any length and strips both ends; a single
    # replace("  ", " ") would leave double spaces behind for longer runs.
    if kind == "dl":
        return " ".join(clean_text(text.lower()).split())

    # For bag of words, keep only the requested parts of speech and lemmatize
    # if indicated. The `pos` default list is only read, never mutated.
    attribute = "lemma_" if lemma else "text"
    nlp = _get_nlp(package)
    return " ".join(
        getattr(token, attribute)
        for token in nlp(text.lower())
        if token.pos_ in pos
    )

In [None]:
# Build the three preprocessed variants of each question: bag-of-words,
# lemmatized bag-of-words, and the lighter deep-learning cleanup.
# NOTE(review): the previewed sentences contain HTML entities (e.g. `&#39;`);
# confirm whether they should be unescaped before preprocessing.
df['sentence_bow'] = df['sentence'].apply(preprocessing)
df['sentence_bow_lem'] = df['sentence'].apply(preprocessing, lemma=True)
df['sentence_dl'] = df['sentence'].apply(preprocessing, kind="dl")

# Number of whitespace-separated tokens left in each variant.
df['length_bow'] = df['sentence_bow'].str.split().str.len()
df['length_dl'] = df['sentence_dl'].str.split().str.len()