In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import reuters
from sklearn.decomposition import TruncatedSVD

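**Example: Co-Occurrence with Fixed Window of n=1**:

Each document is wrapped in `<START>` and `<END>` tokens, and each cell counts how many times the column word appears directly next to the row word across both documents.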

Document 1: "all that glitters is not gold"

Document 2: "all is well that ends well"


|     *    | `<START>` | all | that | glitters | is   | not  | gold  | well | ends | `<END>` |
|----------|-------|-----|------|----------|------|------|-------|------|------|-----|
| `<START>`    | 0     | 2   | 0    | 0        | 0    | 0    | 0     | 0    | 0    | 0   |
| all      | 2     | 0   | 1    | 0        | 1    | 0    | 0     | 0    | 0    | 0   |
| that     | 0     | 1   | 0    | 1        | 0    | 0    | 0     | 1    | 1    | 0   |
| glitters | 0     | 0   | 1    | 0        | 1    | 0    | 0     | 0    | 0    | 0   |
| is       | 0     | 1   | 0    | 1        | 0    | 1    | 0     | 1    | 0    | 0   |
| not      | 0     | 0   | 0    | 0        | 1    | 0    | 1     | 0    | 0    | 0   |
| gold     | 0     | 0   | 0    | 0        | 0    | 1    | 0     | 0    | 0    | 1   |
| well     | 0     | 0   | 1    | 0        | 1    | 0    | 0     | 0    | 1    | 1   |
| ends     | 0     | 0   | 1    | 0        | 0    | 0    | 0     | 1    | 0    | 0   |
| `<END>`      | 0     | 0   | 0    | 0        | 0    | 0    | 1     | 1    | 0    | 0   |

In [None]:
START_TOKEN = "<START>"
END_TOKEN = "<END>"


def build_co_occurrence(documents, vocabulary, window_size=1):
    """Count how often each pair of words appears within `window_size` positions.

    Parameters
    ----------
    documents : iterable of sequences of str
        Tokenised documents. Boundary tokens, if wanted, must already be included.
    vocabulary : list of str
        Labels (and their order) for the rows and columns of the result.
    window_size : int, default 1
        Number of positions inspected on each side of the centre word.

    Returns
    -------
    pandas.DataFrame
        Square integer frame where ``result.loc[w, c]`` is the number of times
        ``c`` occurs inside the window around an occurrence of ``w``.

    Raises
    ------
    KeyError
        If a document contains a token that is not in `vocabulary`.
    """
    counts = pd.DataFrame(0, index=vocabulary, columns=vocabulary)
    for doc in documents:
        tokens = list(doc)
        for position, center_word in enumerate(tokens):
            left = tokens[max(0, position - window_size) : position]
            right = tokens[position + 1 : position + window_size + 1]
            # Increment one neighbour at a time: selecting both neighbours with a
            # single list-based .loc collapses duplicate labels, so a word that
            # sits on both sides of the centre ("a b a") would be counted once.
            for context_word in left + right:
                counts.loc[center_word, context_word] += 1
    return counts


documents = [
    [START_TOKEN] + "all that glitters is not gold".split() + [END_TOKEN],
    [START_TOKEN] + "all is well that ends well".split() + [END_TOKEN],
]

# Column order is fixed by hand so the frame lines up with the worked example table.
columns = [START_TOKEN] + "all that glitters is not gold well ends".split() + [END_TOKEN]
co_df = build_co_occurrence(documents, columns, window_size=1)
co_df

In [None]:
# File ids returned by the Reuters reader for the "gold" argument
# (presumably a category filter; confirm against the NLTK reader docs).
files = reuters.fileids("gold")
print(f"Number of files: {len(files)}")

# One lower-cased, space-joined string per file. The boundary tokens are added
# after lower-casing so they keep their original spelling.
joined_documents = pd.Series([" ".join(reuters.words(file_id)) for file_id in files])
corpus = START_TOKEN + " " + joined_documents.str.lower() + " " + END_TOKEN
corpus.head()

In [None]:
# Flatten the corpus to one token per row. `str.split().explode()` avoids building
# a wide frame padded with missing values up to the longest document, and does not
# depend on `DataFrame.stack()` silently dropping that padding (the replacement
# stack implementation is documented as keeping NA values).
# `dropna()` discards the placeholder that `explode()` emits for an empty token list.
total_words = corpus.str.split().explode().dropna()
print(f"Number of total words: {len(total_words)}")

# Sorted vocabulary: its order defines the row/column order of anything indexed by it.
distinct_words = sorted(total_words.unique())
print(f"Number of distinct words: {len(distinct_words)}")

# Ten most frequent tokens.
total_words.value_counts().head(10)

In [None]:
window_size = 4
num_distinct_words = len(distinct_words)
# Row/column position of every vocabulary word.
word2index = dict(zip(distinct_words, range(num_distinct_words)))

# counts[r, c] = number of times word c appears within `window_size` tokens of word r.
counts = np.zeros((num_distinct_words, num_distinct_words))
for tokens in corpus.str.split():
    for position, center in enumerate(tokens):
        row = word2index[center]
        left_context = tokens[max(0, position - window_size) : position]
        right_context = tokens[position + 1 : position + window_size + 1]
        for neighbour in left_context + right_context:
            counts[row, word2index[neighbour]] += 1

# Label the raw counts with the vocabulary and store them as integers.
co_occurrence_matrix = pd.DataFrame(counts, index=distinct_words, columns=distinct_words, dtype=int)
co_occurrence_matrix.iloc[:, :10].head()

In [None]:
# Project every word's co-occurrence row down to two dimensions.
# TruncatedSVD's default solver is randomized, so the seed is pinned to make the
# layout reproducible across Restart & Run All.
reduced_co_occurrence_matrix = TruncatedSVD(n_components=2, n_iter=10, random_state=0).fit_transform(
    co_occurrence_matrix.values
)
# Scale each row to unit length so the plot compares directions rather than
# magnitudes (frequent words would otherwise dominate the axes).
reduced_co_occurrence_matrix = reduced_co_occurrence_matrix / np.linalg.norm(
    reduced_co_occurrence_matrix, axis=1, keepdims=True
)

# Words to display; each must be present in the vocabulary or the lookup raises KeyError.
words = [
    "value",
    "gold",
    "platinum",
    "reserves",
    "silver",
    "metals",
    "copper",
    "belgium",
    "australia",
    "china",
    "grammes",
    "mine",
]
indices = [word2index[word] for word in words]

fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(reduced_co_occurrence_matrix[indices, 0], reduced_co_occurrence_matrix[indices, 1])
for i, word in zip(indices, words):
    ax.annotate(word, xy=(reduced_co_occurrence_matrix[i, 0], reduced_co_occurrence_matrix[i, 1]))
ax.set(
    title="Co-occurrence word embeddings (2-D truncated SVD, unit-normalised)",
    xlabel="SVD component 1",
    ylabel="SVD component 2",
)
plt.show()