In [None]:
import pandas as pd

# Load the raw CSV (path is relative to the notebook's working directory)
# and preview the first rows.
DATA_PATH = "data/mental_health.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Work on a reproducible random subset of at most 1000 rows (fixed random_state).
# The sample size is capped at the number of available rows because
# DataFrame.sample raises a ValueError when n exceeds the population and
# replace=False (the default).
# NOTE: reset_index() keeps the previous row labels in a new "index" column.
df = df.sample(n=min(1000, len(df)), random_state=1).reset_index()

In [None]:
import spacy

# Load the spaCy pipeline once; the model name is kept in one place so it is
# easy to swap.
SPACY_MODEL = "en_core_web_lg"
nlp = spacy.load(SPACY_MODEL)
# Show the components that make up the loaded pipeline.
nlp.pipeline

Runtime on an i711 at 4 GHz: about 6 minutes.

In [None]:
# Parse every text with spaCy and store the resulting Doc next to its row.
# nlp.pipe streams the texts through the pipeline in batches, which is the
# recommended way to process many texts and avoids one nlp() call per row via
# Series.apply. It yields exactly one Doc per input text, in input order, so
# the list lines up positionally with the frame's rows.
df["nlp"] = list(nlp.pipe(df["text"]))
df.head()

### spaCy Basic NLP

- **.text**	The original word text
- **.lemma_**	The base form of the word
- **.pos_**	The simple part-of-speech tag
- **.tag_**	The detailed part-of-speech tag
- **.shape_**	The word shape – capitalization, punctuation, digits
- **.is_alpha**	Does the token consist only of alphabetic characters?
- **.is_stop**	Is the token part of a stop list, i.e. the most common words of the language?

(Source: https://towardsdatascience.com/hands-on-implementation-of-basic-nlp-techniques-nltk-or-spacy-687099e02816)

In [None]:
# Use the parsed document stored in the row labelled 0 as the running example.
document = df.at[0, "nlp"]
print(document.text)

In [None]:
# List every token with its fine-grained tag and spaCy's explanation of it.
for tok in document:
    fine_tag = tok.tag_
    print(tok.text, ": ", fine_tag, " (", spacy.explain(fine_tag), ")")

In [None]:
# Tally the POS attribute over the document; the keys of the result are
# integer IDs, so each one is looked up in the vocab to print a readable name.
pos_counts = document.count_by(spacy.attrs.POS)
print(pos_counts)
for pos_id in sorted(pos_counts):
    print(pos_id, document.vocab[pos_id].text, pos_counts[pos_id])

In [None]:
# List every named entity with its label and spaCy's explanation of the label.
for ent in document.ents:
    ent_label = ent.label_
    print(ent.text, ": ", ent_label, " (", spacy.explain(ent_label), ")")

In [None]:
from spacy import displacy

# Draw the dependency parse of the example document inline in the notebook.
displacy.render(document, style="dep", jupyter=True)

### Statistics per Label

In [None]:
from tqdm import tqdm

def create_histogram(df_X):
    """Count named-entity labels over all documents in ``df_X["nlp"]``.

    Args:
        df_X: DataFrame with an ``"nlp"`` column. Each value must expose
            ``.ents``, an iterable of entities that carry a ``label_``
            attribute (in this notebook: parsed spaCy documents).

    Returns:
        dict mapping each entity label to the number of times it occurs.
        Labels that never occur are absent; an empty frame yields ``{}``.
    """
    result = {}
    # Iterate over the column itself instead of ``df_X.iterrows()``: the
    # column has a length, so tqdm can display a total and an ETA, and no
    # throw-away row Series is built for every document.
    for doc in tqdm(df_X["nlp"]):
        for entity in doc.ents:
            result[entity.label_] = result.get(entity.label_, 0) + 1

    return result

def _sorted_entity_counts(class_label):
    """Return (entity label, count) pairs for the rows of ``df`` whose
    ``"label"`` equals ``class_label``, sorted by entity label."""
    counts = create_histogram(df[df["label"] == class_label])
    return sorted(counts.items(), key=lambda pair: pair[0])


histogram_0 = _sorted_entity_counts(0)
histogram_1 = _sorted_entity_counts(1)

In [None]:
from matplotlib import pyplot as plt

# Bar chart of entity-label frequencies for the rows with label == 0.
figure = plt.figure()
figure.set_figwidth(20)

plt.bar(
    [entity_label for entity_label, _ in histogram_0],
    [count for _, count in histogram_0],
)
# A title and axis labels make the figure readable on its own.
plt.title("Named-entity counts for label 0")
plt.xlabel("Entity label")
plt.ylabel("Occurrences")
plt.show()

In [None]:
# Bar chart of entity-label frequencies for the rows with label == 1.
figure = plt.figure()
figure.set_figwidth(20)

plt.bar(
    [entity_label for entity_label, _ in histogram_1],
    [count for _, count in histogram_1],
)
# A title and axis labels make the figure readable on its own.
plt.title("Named-entity counts for label 1")
plt.xlabel("Entity label")
plt.ylabel("Occurrences")
plt.show()

### Fine-Tuning Dataset for Question Answering

In [None]:
# Parse a toy sentence and show which entities the pipeline finds in it.
# NOTE: this rebinds `document`, which earlier cells used for a dataset row.
document = nlp("bradley cooper started the car and drove away")
print(document.text)
for ent in document.ents:
    print(ent, " - ", ent.label_)

In [None]:
# Question word that stands in for an entity of the given entity label.
question_words = dict(PERSON="Who", TIME="When", GPE="Where")

In [None]:
# Turn each recognised entity into a question/answer item: the question is the
# sentence with the entity swapped for its question word, the answer is the
# entity text. Entities whose label has no question word are skipped.
for entity in document.ents:
    question_word = question_words.get(entity.label_)
    if question_word is not None:
        # Splice by character offsets so that only this entity mention is
        # replaced. str.replace would also rewrite every other occurrence of
        # the same text, including occurrences inside longer words.
        question_sentence = (
            document.text[:entity.start_char]
            + question_word
            + document.text[entity.end_char:]
        )
        answer_sentence = entity.text
        print("<item><q>", question_sentence, "</q><a>", answer_sentence, "</a></item>")