# Classification Using Word Embedding

- Obtaining and loading data
- Exploring the data
- Machine learning
    - Split data
    - Create word embedding
    - Vectorisation: average word embedding per document
    - Model fitting
    - Model evaluation
- Apply model (do one prediction)

In [None]:
import pandas as pd

# Read the labelled corpus from disk and preview its first rows.
data_path = "data/mental_health.csv"
df = pd.read_csv(data_path)
df.head()

### Exploration

In [None]:
import matplotlib.pyplot as plt
# import seaborn as sns

# Number of rows per label, drawn as a horizontal bar chart.
label_counts = df["label"].reset_index().groupby("label").count()

fig, ax = plt.subplots()
bars = label_counts.plot(kind="barh", legend=False, ax=ax)
bars.grid(axis='x')  # vertical grid lines make the counts easier to read
plt.show()

### Cleaning

In [None]:
import re
import nltk
# nltk.download("stopwords")
# nltk.download("wordnet")
# nltk.download("punkt")

def clean(text, stopwords):
    """Normalise one document before it is used for the embedding.

    The text is lower-cased, stripped of every character that is neither a
    word character nor whitespace, split on whitespace, filtered against
    ``stopwords`` and lemmatised with NLTK's WordNet lemmatizer (called
    without a part-of-speech argument).

    Args:
        text: Raw document. It is coerced with ``str()``, so non-string
            values do not raise.
        stopwords: Collection of words to drop; compared against the
            lower-cased, punctuation-free tokens.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    # Membership is tested once per token: use a set so each test is a hash
    # lookup instead of a scan over the whole stop-word list.
    if isinstance(stopwords, (set, frozenset)):
        stopword_set = stopwords
    else:
        stopword_set = set(stopwords)

    # NOTE(review): punctuation is removed before this filter, so a stop word
    # that itself contains punctuation (e.g. "don't") can never match its
    # stripped form ("dont") -- confirm whether that is intended.
    tokens = [word for word in text.split() if word not in stopword_set]

    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(word) for word in tokens)

# Keep the stop words in a set: ``clean`` tests membership once per token,
# which is a hash lookup on a set but a full scan on a list.
stopwords = set(nltk.corpus.stopwords.words("english"))

# ``args`` forwards the stop words as the second positional argument of
# ``clean`` for every row of the "text" column.
df["text_clean"] = df["text"].apply(clean, args=(stopwords,))

df.head()

### Modelling

In [None]:
from sklearn.model_selection import train_test_split

# A fixed seed makes the split reproducible. This matters because the
# embedding is trained on the train split and saved to disk: with a random
# split, re-running the notebook against the saved embedding could place
# documents the embedding was trained on into the test set.
RANDOM_STATE = 42

# reset_index() gives both frames a fresh 0..n-1 index (later cells read row 0
# via .loc[0]); the original row label is kept in the new "index" column.
df_train, df_test = [
    part.reset_index()
    for part in train_test_split(
        df, test_size=0.2, random_state=RANDOM_STATE)
]

y_train = df_train["label"].values
y_test = df_test["label"].values

#### Word Embedding: Do the following only to save embedding to disk

In [None]:
import gensim.models.phrases

def create_list_corpus(column):
    """Tokenise every document of ``column`` into a list of unigrams.

    Only unigrams are returned; no phrase (bigram/trigram) merging is
    applied to the tokens.

    Args:
        column: Iterable of strings whose words are separated by whitespace.

    Returns:
        A list holding one list of tokens per document, in input order. A
        document without any word yields an empty list.
    """
    return [string.split() for string in column]

list_corpus_train = create_list_corpus(df_train["text_clean"])
list_corpus_test = create_list_corpus(df_test["text_clean"])

# just to get a useful window size for the embedding model
# Token counts are computed once and reused; the defaults keep max/min from
# raising on an empty corpus, matching the guard on the average.
doc_lengths = [len(member) for member in list_corpus_train]
avg_length = sum(doc_lengths) / len(doc_lengths) if doc_lengths else 0
print(
    "Nof: ", len(list_corpus_train),
    "Max: ", max(doc_lengths, default=0),
    "Min: ", min(doc_lengths, default=0),
    "Avg: ", avg_length
    )

In [None]:
import os

from gensim.models.word2vec import Word2Vec

# Create the output folder before training, so a missing directory cannot
# make save() fail after all training epochs have already run.
os.makedirs("results", exist_ok=True)

# sg=1 selects skip-gram training.
# NOTE(review): window=72 was presumably picked from the document length
# statistics printed above -- confirm.
embedding = Word2Vec(window=72, workers=6, epochs=20, sg=1)
embedding.build_vocab(list_corpus_train, progress_per=100)
embedding.train(
    list_corpus_train,
    total_examples=embedding.corpus_count,
    epochs=embedding.epochs)

embedding.save("results/embedding.w2v")

In [None]:
import matplotlib.pyplot as plt
from sklearn import manifold

word = "happy"

fig = plt.figure(figsize=(12, 12))
## the query word followed by its 21 most similar words in the embedding
neighbours = embedding.wv.most_similar(word, topn=21)
tot_words = [word, *(neighbour for neighbour, _ in neighbours)]
vectors = embedding.wv[tot_words]
## t-SNE (initialised with PCA) projects the word vectors to 3 components
tsne = manifold.TSNE(perplexity=12, n_components=3, init='pca')
coordinates = tsne.fit_transform(vectors)
## one row per word, one column per plot axis
dtf_ = pd.DataFrame(coordinates, index=tot_words, columns=["x", "y", "z"])
## plot 3d
## NOTE(review): Axes3D is not referenced below; the import is presumably kept
## to register the '3d' projection on older matplotlib versions -- confirm.
from mpl_toolkits.mplot3d import Axes3D
ax = fig.add_subplot(111, projection='3d')
ax.scatter(dtf_['x'], dtf_['y'], dtf_['z'], c="black")

## write each word next to its point
for label, (x, y, z) in zip(dtf_.index, dtf_[["x", "y", "z"]].to_numpy()):
    ax.text(x, y, z, s=label)

### From here on: work with saved embedding

In [None]:
from gensim.models.word2vec import Word2Vec

# Load the Word2Vec model stored under results/ instead of retraining it.
embedding_path = "results/embedding.w2v"
embedding_from_disk = Word2Vec.load(embedding_path)

In [None]:
import numpy as np

def average_embedding(embedding, individual_text):
    """Return the mean word vector of one document.

    Args:
        embedding: Trained model exposing ``wv`` with ``key_to_index``,
            ``vector_size`` and vector lookup by a list of words.
        individual_text: The document, either as a list of tokens or as a
            whitespace-separated string. A string is split into tokens
            first.

    Returns:
        The element-wise mean of the vectors of all tokens known to the
        embedding, or a zero vector of length ``embedding.wv.vector_size``
        when no token is known.
    """
    # Iterating a str yields single characters, so a raw string would be
    # averaged over its letters instead of its words. Tokenise it first.
    if isinstance(individual_text, str):
        tokens = individual_text.split()
    else:
        tokens = individual_text

    words = [word for word in tokens if word in embedding.wv.key_to_index]
    if words:
        return np.mean(embedding.wv[words], axis=0)

    # No known word: report the document and fall back to a zero vector.
    print("> Empty :-( ", individual_text)
    return np.zeros(embedding.wv.vector_size)  # TODO useful behaviour?

# Show one cleaned document, its averaged word vector and its label.
first_document = df_train.at[0, "text_clean"]
print(first_document)
# Pass tokens, not the raw string: iterating a string yields characters, so
# the average would be taken over single letters instead of words.
print(average_embedding(embedding_from_disk, first_document.split()))
print(y_train[:1])

In [None]:
# One averaged word embedding per document. Each cleaned text is split into
# tokens first: iterating the raw string would yield single characters, and
# the features would be averages over letters instead of words.
X_train_awes = [average_embedding(embedding_from_disk, current.split())
                for current in df_train["text_clean"]]
X_test_awes = [average_embedding(embedding_from_disk, current.split())
               for current in df_test["text_clean"]]

print(X_train_awes[:1])

#### Actual Training

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes classifier on the averaged document vectors.
model = GaussianNB()
model.fit(X=X_train_awes, y=y_train)

### Evaluation

In [None]:
from sklearn import metrics

# Score the classifier on the held-out documents.
predictions = model.predict(X_test_awes)
accuracy = metrics.accuracy_score(y_test, predictions)
confusion = metrics.confusion_matrix(y_test, predictions)
# Accuracy on the first line, the confusion matrix below it.
print(accuracy, confusion, sep="\n")

### Application

In [None]:
# Predict the label of a single document (here: the first training row).
# The text is split into tokens before averaging; iterating the raw string
# would average over single characters instead of words.
single_document = df_train.at[0, "text_clean"]
average_of_text = [average_embedding(
    embedding_from_disk, single_document.split())]
print(average_of_text)
a = model.predict(average_of_text)
print(a)