## Preface
Basic notebook configuration: imports, autoreload, and the working directory.

In [None]:
import random

from topfin.util import do_spacy_stuff, load_spacy
%load_ext autoreload
%autoreload 2
import os

# Show the directory the kernel started in, then move one level up
# (presumably so that relative paths such as 'data/...' below resolve - confirm).
# NOTE(review): the chdir is relative, so re-running this cell moves up one more
# level each time.
print(os.getcwd())
os.chdir('../')

In [None]:
# Run the project's spaCy set-up helper, then load the serialised documents.
# NOTE(review): presumably do_spacy_stuff() registers the custom `won` extension
# that later cells read as `d._.won` - confirm in topfin.util.
do_spacy_stuff()
docs = load_spacy('data/spacy2.bin')


The following code is taken from [here](https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py).
It is used to make the topic analysis plots below.

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import matplotlib.pyplot as plt


def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart of the highest-weighted terms per topic.

    Args:
        model: Fitted topic model exposing ``components_`` (one row of term
            weights per topic).
        feature_names: Sequence mapping a column index of ``components_`` to
            the term it stands for.
        n_top_words: Number of terms shown per topic.
        title: Figure-level title.

    The grid is five panels wide and at least two rows high (the layout the
    notebook has always used); further rows are added when the model has more
    than ten topics, which previously raised ``IndexError``.
    """
    n_cols = 5
    n_panels = len(model.components_)
    # Ceiling division, but never fewer than the two rows of the fixed layout.
    n_rows = max(2, (n_panels + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(30, 7.5 * n_rows), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk it backwards to get the largest weights first.
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx + 1}',
                     fontdict={'fontsize': 30})
        # Put the heaviest term at the top of each panel.
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for side in 'top right left'.split():
            ax.spines[side].set_visible(False)

    # The title does not depend on the topic, so set it once.
    fig.suptitle(title, fontsize=40)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

These are parameters for the LDA algorithm, to make sure that the topic distribution looks
somewhat sensible.

In [None]:
n_features = 1000  # Vocabulary cap: keep at most this many terms (passed as max_features).
max_df = 0.60  # Ignore terms that occur in more than 60% of the documents.
min_df = 5  # Ignore terms that occur in fewer than 5 documents (a document count, not a total count).
n_topics = 10  # Number of topics.
n_top_words = 10  # Number of words to display per topic.
max_iter = 5  # Maximum number of passes over the data when fitting LDA.
# One space-joined string of token lemmas per document, split by the `won` flag.
texts_won = [' '.join(t.lemma_ for t in d) for d in docs if d._.won]
texts_lost = [' '.join(t.lemma_ for t in d) for d in docs if not d._.won]

## LDA
This is the code for the actual LDA algorithm.

In [None]:
def do_lda(texts, max_df, min_df, n_features, n_topics, max_iter, idf=False):
    """Vectorise ``texts`` and fit an online LDA topic model on the result.

    Args:
        texts: Iterable of documents, one string each.
        max_df: Forwarded to the vectoriser as ``max_df``.
        min_df: Forwarded to the vectoriser as ``min_df``.
        n_features: Forwarded to the vectoriser as ``max_features``.
        n_topics: Number of LDA components.
        max_iter: Forwarded to ``LatentDirichletAllocation`` as ``max_iter``.
        idf: Selects the vectoriser. NOTE(review): the flag is inverted
            relative to its name - ``True`` selects raw term counts
            (``CountVectorizer``) and ``False`` selects ``TfidfVectorizer``.
            It is kept that way because the calls in this notebook pass
            ``idf=True`` and rely on getting raw counts.

    Returns:
        Tuple ``(lda, feature_names)``: the fitted model and the list of
        vocabulary terms in column order.
    """
    vectorizer_cls = CountVectorizer if idf else TfidfVectorizer
    tf_vectorizer = vectorizer_cls(max_df=max_df, min_df=min_df,
                                   max_features=n_features,
                                   stop_words='english')

    tf_matrix = tf_vectorizer.fit_transform(texts)
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter,
                                    learning_method='online',
                                    learning_offset=30.,
                                    random_state=0)
    lda.fit(tf_matrix)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    names_getter = (getattr(tf_vectorizer, 'get_feature_names_out', None)
                    or tf_vectorizer.get_feature_names)
    tf_feature_names = list(names_getter())
    return lda, tf_feature_names

This is the result of applying LDA to the won contracts:

In [None]:
# Fit LDA on the won contracts only and plot the top words of every topic.
# NOTE: in do_lda, idf=True selects the plain CountVectorizer branch.
lda, words = do_lda(texts_won, max_df, min_df, n_features, n_topics, max_iter, idf=True)
plot_top_words(lda, words, n_top_words, 'LDA on won contracts')

This is the result of applying LDA to the lost contracts:

In [None]:
# Same as the previous cell, but for the lost contracts. `lda` and `words` are rebound.
lda, words = do_lda(texts_lost, max_df, min_df, n_features, n_topics, max_iter, idf=True)
plot_top_words(lda, words, n_top_words, 'LDA on lost contracts')

We see a slight difference in topics. Won contracts seem to mention topics related to golf,
water and fishing, and the local community.

## Word-level analysis

This is some pre-processing code.

In [None]:
import re

# Entity labels that are left out of the analysis (numeric and temporal types).
exclude = {"CARDINAL", "ORDINAL", "PERCENT", "TIME", "QUANTITY", "DATE"}

# One string per document: the texts of its remaining entities joined with '|',
# the separator that my_tokenizer splits on.
ners_won = ['|'.join(str(e) for e in d.ents if e.label_ not in exclude) for d in docs if d._.won]
ners_lost = ['|'.join(str(e) for e in d.ents if e.label_ not in exclude) for d in docs if not d._.won]


def my_tokenizer(text):
    """Split a '|'-joined string back into its items.

    Empty items are dropped, so a document without any items yields no tokens
    instead of a single spurious ``''`` token.

    Args:
        text: String whose items are separated by ``'|'``.

    Returns:
        List of the non-empty items, in their original order.
    """
    # Plain str.split replaces re.split("\|", ...), whose non-raw pattern
    # contained an invalid escape sequence.
    return [token for token in text.split('|') if token]


def do_lda_on_ner(texts, max_df, min_df, n_features, n_topics, max_iter, idf=False):
    """Fit an online LDA topic model on '|'-separated item strings.

    Works like ``do_lda`` but tokenises with ``my_tokenizer`` and applies no
    stop-word list.

    Args:
        texts: Iterable of documents, each a '|'-joined string of items.
        max_df: Forwarded to the vectoriser as ``max_df``.
        min_df: Forwarded to the vectoriser as ``min_df``.
        n_features: Forwarded to the vectoriser as ``max_features``.
        n_topics: Number of LDA components.
        max_iter: Forwarded to ``LatentDirichletAllocation`` as ``max_iter``.
        idf: Selects the vectoriser. NOTE(review): the flag is inverted
            relative to its name - ``True`` selects ``CountVectorizer`` and
            ``False`` selects ``TfidfVectorizer``. Left unchanged so existing
            calls keep their behaviour.

    Returns:
        Tuple ``(lda, feature_names)``: the fitted model and the list of
        vocabulary terms in column order.
    """
    vectorizer_cls = CountVectorizer if idf else TfidfVectorizer
    tf_vectorizer = vectorizer_cls(max_df=max_df, min_df=min_df,
                                   max_features=n_features,
                                   tokenizer=my_tokenizer)

    tf_matrix = tf_vectorizer.fit_transform(texts)
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter,
                                    learning_method='online',
                                    learning_offset=30.,
                                    random_state=0)
    lda.fit(tf_matrix)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    names_getter = (getattr(tf_vectorizer, 'get_feature_names_out', None)
                    or tf_vectorizer.get_feature_names)
    tf_feature_names = list(names_getter())
    return lda, tf_feature_names

What follows is an application of the LDA algorithm that considers only named entities.
The results are rather inconclusive.

In [None]:
# Entity-only LDA on won contracts.
# Positional arguments: max_df=1.0, min_df=5, n_features=100, n_topics=3.
# NOTE: in do_lda_on_ner, idf=False selects the TfidfVectorizer branch.
lda, words = do_lda_on_ner(ners_won, 1.0, 5, 100, 3, max_iter, idf=False)
plot_top_words(lda, words, n_top_words, 'Won contracts (NER)')

In [None]:
# Entity-only LDA on lost contracts, with the same settings as for the won contracts.
lda, words = do_lda_on_ner(ners_lost, 1.0, 5, 100, 3, max_iter, idf=False)
plot_top_words(lda, words, n_top_words, 'Lost contracts (NER)')

More processing/visualisation code.

In [None]:
def print_barchart_won_lost(labels_won, labels_lost, vals_won, vals_lost, title):
    """Show two stacked horizontal bar charts, "won" above "lost".

    Args:
        labels_won: Bar labels for the upper ("won") panel.
        labels_lost: Bar labels for the lower ("lost") panel.
        vals_won: Bar lengths for the upper panel, aligned with ``labels_won``.
        vals_lost: Bar lengths for the lower panel, aligned with ``labels_lost``.
        title: Figure-level title.
    """
    # subplots(2, 1) already returns a 1-D array of axes, so it can be unpacked
    # directly; the former `axes.flatten()` call discarded its result.
    fig, (ax_won, ax_lost) = plt.subplots(2, 1, figsize=(10, 10), sharex=True)
    ax_won.barh(labels_won, vals_won)
    ax_won.set_title("won")
    ax_lost.barh(labels_lost, vals_lost)
    ax_lost.set_title("lost")
    fig.suptitle(title)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.01)
    plt.show()



In [None]:
def normalise(d: dict, ratio):
    """Return a ``Counter`` holding every value of ``d`` multiplied by ``ratio``.

    Args:
        d: Mapping from key to numeric count.
        ratio: Factor applied to each count.

    Returns:
        A new ``Counter`` with the same keys, in the same order, and scaled values.
    """
    scaled = Counter()
    for key, count in d.items():
        scaled[key] = count * ratio
    return scaled


import math

# Count each outcome once instead of re-scanning `docs` for every use.
n_won = sum(1 for d in docs if d._.won)
n_lost = sum(1 for d in docs if not d._.won)
# Factor that rescales counts over the lost contracts to the size of the won set.
won_lost_ratio = n_won / n_lost
# Sanity check with a tolerance: exact float equality can fail for some counts
# (for example 49 * (1 / 49) != 1.0).
assert math.isclose(n_lost * won_lost_ratio, n_won)

In [None]:
from collections import Counter
import numpy as np


def get_most_common(doc_repr, n_features=1000, df_ratio=1.):
    """Count how often each '|'-separated item occurs across all documents.

    Args:
        doc_repr: Iterable of documents, each a '|'-joined string of items
            (split with ``my_tokenizer``).
        n_features: Forwarded to ``CountVectorizer`` as ``max_features``.
        df_ratio: Forwarded to ``CountVectorizer`` as ``max_df``.

    Returns:
        ``Counter`` mapping each retained item to its total count over all
        documents.
    """
    tf_vectorizer = CountVectorizer(max_features=n_features, tokenizer=my_tokenizer, max_df=df_ratio)
    tf_counts = tf_vectorizer.fit_transform(doc_repr)
    # Sum the columns on the sparse matrix directly; there is no need to build
    # a dense documents-by-terms array just to add it up.
    totals = np.asarray(tf_counts.sum(axis=0)).ravel()
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    names_getter = (getattr(tf_vectorizer, 'get_feature_names_out', None)
                    or tf_vectorizer.get_feature_names)
    ctr = Counter(dict(zip(names_getter(), totals)))
    return ctr

What follows is a named-entity analysis, followed by a word-class (part-of-speech) analysis.

In [None]:
# 20 most frequent entities in won contracts (raw counts).
ctr_won = get_most_common(ners_won)
labels, vals = zip(*ctr_won.most_common(20))

# Same for lost contracts, with counts scaled by won_lost_ratio before ranking
# so the two panels are comparable despite different numbers of documents.
ctr_lost = get_most_common(ners_lost)
labels_lost, vals_lost = zip(*normalise(ctr_lost, won_lost_ratio).most_common(20))
print_barchart_won_lost(labels, labels_lost, vals, vals_lost, "entities in contracts")

In [None]:
# One '|'-joined string of adjective lemmas per document; tokens whose text is '-' are skipped.
adjs_won = ['|'.join(str(e.lemma_) for e in d if e.pos_ == 'ADJ' and str(e) != '-') for d in docs if d._.won]
adjs_lost = ['|'.join(str(e.lemma_) for e in d if e.pos_ == 'ADJ' and str(e) != '-') for d in docs if not d._.won]

In [None]:
# 20 most frequent adjectives per outcome; df_ratio=0.5 is passed on as max_df,
# so adjectives occurring in more than half of the documents are ignored.
ctr_adjs_won = get_most_common(adjs_won, df_ratio=0.5)
labels, vals = zip(*ctr_adjs_won.most_common(20))

# Lost counts are scaled by won_lost_ratio before ranking.
ctr_adjs_lost = get_most_common(adjs_lost, df_ratio=0.5)
labels_lost, vals_lost = zip(*normalise(ctr_adjs_lost, won_lost_ratio).most_common(20))
print_barchart_won_lost(labels, labels_lost, vals, vals_lost, "adjectives")

In [None]:
# One '|'-joined string of verb lemmas per document; tokens whose text is '-' are skipped.
verbs_won = ['|'.join(str(e.lemma_) for e in d if e.pos_ == 'VERB' and str(e) != '-') for d in docs if d._.won]
verbs_lost = ['|'.join(str(e.lemma_) for e in d if e.pos_ == 'VERB' and str(e) != '-') for d in docs if not d._.won]

In [None]:
# 20 most frequent verbs per outcome; df_ratio=0.6 is passed on as max_df.
ctr_verbs_won = get_most_common(verbs_won, df_ratio=0.6)
labels, vals = zip(*ctr_verbs_won.most_common(20))

# Lost counts are scaled by won_lost_ratio before ranking.
ctr_verbs_lost = get_most_common(verbs_lost, df_ratio=0.6)
labels_lost, vals_lost = zip(*normalise(ctr_verbs_lost, won_lost_ratio).most_common(20))
print_barchart_won_lost(labels, labels_lost, vals, vals_lost, "verbs")

In [None]:
# One '|'-joined string of adverb lemmas per document; tokens whose text is '-' are skipped.
adverbs_won = ['|'.join(str(e.lemma_) for e in d if e.pos_ == 'ADV' and str(e) != '-') for d in docs if d._.won]
adverbs_lost = ['|'.join(str(e.lemma_) for e in d if e.pos_ == 'ADV' and str(e) != '-') for d in docs if not d._.won]

In [None]:
# 20 most frequent adverbs per outcome; df_ratio=.3 is passed on as max_df.
ctr_adverbs_won = get_most_common(adverbs_won, df_ratio=.3)
labels, vals = zip(*ctr_adverbs_won.most_common(20))

# Lost counts are scaled by won_lost_ratio before ranking.
ctr_adverbs_lost = get_most_common(adverbs_lost, df_ratio=.3)
labels_lost, vals_lost = zip(*normalise(ctr_adverbs_lost, won_lost_ratio).most_common(20))
# The title used to be the copy-pasted "verbs"; this chart shows adverbs.
print_barchart_won_lost(labels, labels_lost, vals, vals_lost, "adverbs")


## Lengths

We see that won contracts seem to be slightly longer on average.
However, as mentioned before, this might be an artefact of the form, as later forms contain more
fields to fill with content.

In [None]:
import random

# Document length in tokens, split by outcome.
lengths_won = [len(d) for d in docs if d._.won]
lengths_lost = [len(d) for d in docs if not d._.won]
# subplots(2, 1) already returns a 1-D array of axes; the former
# `axes.flatten()` call discarded its result and has been removed.
fig, axes = plt.subplots(2, 1, figsize=(10, 10), sharey=True, sharex=True)
axes[0].hist(lengths_won, bins=50)
axes[0].set_title("# of Token distribution for won contracts")
# Weight every lost document by won_lost_ratio so both histograms share a scale.
axes[1].hist(lengths_lost, bins=50, weights=[won_lost_ratio] * len(lengths_lost))
axes[1].set_title("# of Token distribution for lost contracts")

plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
plt.show()

In [None]:
import nltk

# English stop-word list from NLTK (requires the NLTK stopwords corpus to be available).
stopwords = nltk.corpus.stopwords.words("english")

# One '|'-joined string of noun chunks per document; chunks whose lower-cased,
# stripped text is itself a stop word are dropped.
noun_chunks_won = ['|'.join(str(e) for e in d.noun_chunks if str(e).lower().strip() not in stopwords) for d in docs if
                   d._.won]
noun_chunks_lost = ['|'.join(str(e) for e in d.noun_chunks if str(e).lower().strip() not in stopwords) for d in docs if
                    not d._.won]

In [None]:
# 20 most frequent noun chunks per outcome; df_ratio=0.3 is passed on as max_df.
ctr_np_won = get_most_common(noun_chunks_won, df_ratio=0.3)
labels, vals = zip(*ctr_np_won.most_common(20))

# Lost counts are scaled by won_lost_ratio before ranking.
ctr_np_lost = get_most_common(noun_chunks_lost, df_ratio=0.3)
labels_lost, vals_lost = zip(*normalise(ctr_np_lost, won_lost_ratio).most_common(20))
print_barchart_won_lost(labels, labels_lost, vals, vals_lost, "Noun Phrases")

In [None]:
# Re-state the LDA settings (same values as in the earlier parameter cell).
n_features = 1000
max_df = 0.60
min_df = 5
n_topics = 10
n_top_words = 10
max_iter = 5
# Fit a single LDA model on won and lost contracts together; `lda` and `words`
# are rebound and used by the following cells.
lda, words = do_lda(texts_won + texts_lost, max_df, min_df, n_features, n_topics, max_iter, idf=True)
plot_top_words(lda, words, n_top_words, 'LDA on joint contracts')

## LDA on won/lost contracts (jointly)
Here we model the topics jointly on won/lost contracts and observe whether won/lost contracts
differ in topic.

In [None]:
# The joint LDA model was trained on the vocabulary of texts_won + texts_lost.
# Fitting separate vectorisers per subset (as this cell used to do) yields
# different vocabularies, so column i would mean a different word in each
# matrix and lda.transform would give meaningless results or a shape error.
# Fit ONE vectoriser on the joint corpus with the same settings as the
# do_lda(..., idf=True) call that trained `lda`, then only transform the subsets.
joint_vectorizer = CountVectorizer(max_df=max_df, min_df=min_df,
                                   max_features=n_features,
                                   stop_words='english')
joint_vectorizer.fit(texts_won + texts_lost)
wons_features = joint_vectorizer.transform(texts_won)
losts_features = joint_vectorizer.transform(texts_lost)
# Per-document topic distributions, one row per document.
topics_won = lda.transform(wons_features)
topics_lost = lda.transform(losts_features)

In [None]:
# Mean topic weight per topic over all documents of each outcome (axis 0 = documents).
average_won = np.average(topics_won, axis=0)
average_lost = np.average(topics_lost, axis=0)

In [None]:
# Derive the x positions from the data instead of hard-coding 10, so the plot
# keeps working when the number of topics changes.
topic_ids = range(len(average_won))
plt.plot(topic_ids, average_won, label="Wons")
plt.plot(topic_ids, average_lost, label="Losts")
plt.xticks(topic_ids)
plt.legend(loc='upper right')
plt.show()

In [None]:
import scipy

# Total topic weight per topic; the lost totals are scaled by won_lost_ratio to
# compensate for the different number of documents.
summed_won = np.sum(topics_won, axis=0)
summed_lost = np.sum(topics_lost, axis=0) * won_lost_ratio

In [None]:
# Derive the x positions from the data instead of hard-coding 10, so the plot
# keeps working when the number of topics changes.
topic_ids = range(len(summed_won))
plt.plot(topic_ids, summed_won, label="Wons")
plt.plot(topic_ids, summed_lost, label="Losts")
plt.xticks(topic_ids)
plt.legend(loc='upper right')
plt.show()

In [None]:
import nltk


def get_ngrams(docs, n_features, df_ratio, ngrams=2):
    """Count word n-grams of one fixed length over a collection of texts.

    Args:
        docs: Iterable of documents, one string each.
        n_features: Forwarded to ``CountVectorizer`` as ``max_features``.
        df_ratio: Forwarded to ``CountVectorizer`` as ``max_df``.
        ngrams: n-gram length; used as both bounds of ``ngram_range``.

    Returns:
        Tuple ``(ctr, tf_counts, feature_names)``: a ``Counter`` of total
        counts per n-gram, the dense documents-by-n-grams count array, and the
        list of n-grams in column order.
    """
    tf_vectorizer = CountVectorizer(max_features=n_features, max_df=df_ratio, ngram_range=(ngrams, ngrams),
                                    stop_words=nltk.corpus.stopwords.words('english'))
    tf_counts = tf_vectorizer.fit_transform(docs).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it. The names are
    # fetched once and reused for both the counter and the return value.
    names_getter = (getattr(tf_vectorizer, 'get_feature_names_out', None)
                    or tf_vectorizer.get_feature_names)
    feature_names = list(names_getter())
    ctr = Counter(dict(zip(feature_names, np.sum(tf_counts, axis=0))))
    return ctr, tf_counts, feature_names

In [None]:
# Fixed phrase removed from every document below
# (presumably boilerplate text of the form - confirm against the raw data).
stuff = 'non confidential information include as much information as possible on the request description of sought after technology/information features field of use'

# Keep only the text after the last 'BACKGROUND' marker (the whole text if the
# marker is absent) and strip the phrase above.
docs_won = [str(d).split('BACKGROUND')[-1].replace(stuff, '') for d in docs if d._.won]

docs_lost = [str(d).split('BACKGROUND')[-1].replace(stuff, '') for d in docs if not d._.won]

In [None]:
# 20 most frequent bigrams (get_ngrams defaults to ngrams=2) per outcome.
# NOTE: ctr_np_won / ctr_np_lost are rebound here and now hold n-gram counts.
ctr_np_won, *_ = get_ngrams(docs_won, 1000, df_ratio=0.3)
labels, vals = zip(*ctr_np_won.most_common(20))

# Lost counts are scaled by won_lost_ratio before ranking.
ctr_np_lost, *_ = get_ngrams(docs_lost, 1000, df_ratio=0.3)
labels_lost, vals_lost = zip(*normalise(ctr_np_lost, won_lost_ratio).most_common(20))
print_barchart_won_lost(labels, labels_lost, vals, vals_lost, "Bigrams")

In [None]:
# 20 most frequent trigrams per outcome.
ctr_np_won, *_ = get_ngrams(docs_won, 1000, df_ratio=0.3, ngrams=3)
labels, vals = zip(*ctr_np_won.most_common(20))

# Lost counts are scaled by won_lost_ratio before ranking.
ctr_np_lost, *_ = get_ngrams(docs_lost, 1000, df_ratio=0.3, ngrams=3)
labels_lost, vals_lost = zip(*normalise(ctr_np_lost, won_lost_ratio).most_common(20))
print_barchart_won_lost(labels, labels_lost, vals, vals_lost, "Trigrams")


In [None]:
# 20 most frequent 4-grams per outcome.
ctr_np_won, *_ = get_ngrams(docs_won, 1000, df_ratio=0.3, ngrams=4)
labels, vals = zip(*ctr_np_won.most_common(20))

# Lost counts are scaled by won_lost_ratio before ranking.
ctr_np_lost, *_ = get_ngrams(docs_lost, 1000, df_ratio=0.3, ngrams=4)
labels_lost, vals_lost = zip(*normalise(ctr_np_lost, won_lost_ratio).most_common(20))
print_barchart_won_lost(labels, labels_lost, vals, vals_lost, "4-grams")

In [None]:
from math import log


def pmi(x, y, joint, totals):
    """Return the pointwise mutual information of two events, from counts.

    Computes ``log(p(x, y) / (p(x) * p(y)))`` with natural logarithm, where
    every probability is the corresponding count divided by ``totals``.

    Args:
        x: Number of observations in which the first event occurs.
        y: Number of observations in which the second event occurs.
        joint: Number of observations in which both events occur.
        totals: Total number of observations.

    Returns:
        The PMI as a float, or ``-inf`` when the events never co-occur
        (``joint == 0``), where the logarithm would otherwise be undefined
        and ``math.log`` would raise ``ValueError``.
    """
    if joint == 0:
        # log(0) is the limit case: the events never occur together.
        return float('-inf')
    p_xy = joint / totals
    p_x = x / totals
    p_y = y / totals
    return log(p_xy / (p_x * p_y))

In [None]:
# 4-gram counts over won followed by lost documents; this row order is what
# allows later slicing of tf_counts by len(docs_won).
ctr_ngrams, tf_counts, feature_names = get_ngrams(docs_won + docs_lost, 1000, df_ratio=0.3, ngrams=4)
print(tf_counts.shape)

In [None]:
# Report, for the 20 most frequent 4-grams, the PMI between "document contains
# the n-gram" and each outcome (won / lost).
most_common = {term for term, _ in ctr_ngrams.most_common(20)}
assert len(tf_counts.T) == len(feature_names)
y_won = len(docs_won)
y_lost = len(docs_lost)
total = y_won + y_lost
for counts, features in zip(tf_counts.T, feature_names):
    assert len(counts) == total
    if features not in most_common:
        continue
    # Binarise: only whether a document contains the n-gram matters, not how often.
    counts = counts.clip(0, 1)
    x = counts.sum()
    # Rows are ordered won first, then lost (tf_counts was built from docs_won + docs_lost).
    xy_won = counts[:y_won].sum()
    xy_lost = counts[y_won:].sum()
    # The second value used to be printed without a label.
    print(f"For `{features}`. PMI won: {pmi(x, y_won, xy_won, total):.3f}. "
          f"PMI lost: {pmi(x, y_lost, xy_lost, total):.3f}")