Introduction to NLP course (2017-2018).

Notebook 6: Co-occurrence matrices. DISSECT.

by Venelin Kovatchev, University of Barcelona


In [1]:
# Import section
import nltk
from nltk.corpus import gutenberg
from nltk import FreqDist
from nltk.collocations import *
import re
from collections import Counter
import numpy as np
import operator
from scipy import spatial

We will explore a simple co-occurrence Distributional Semantic Model.

It is based on surface co-occurrence within a window of 3 in the gutenberg corpus (see also Notebook 3).

First, we obtain the co-occurrence statistics from the corpus. Then we convert them and save them in a format that is native to the DISSECT library. Finally, we use the DISSECT library to load the files, generate a co-occurrence matrix and perform simple operations.

In [None]:
## Load the corpus as a flat sequence of tokens
corpus = gutenberg.words()

## Raw co-occurrence counts within a window of 3
## (expressed to NLTK as window_size=4)
finder = BigramCollocationFinder.from_words(corpus, window_size=4)
cooc = finder.ngram_fd.items()

## Convert the (pair, frequency) collection into a symmetric nested dict:
## cooc_dict[a][b] accumulates the counts of a and b seen in either order

cooc_dict = {}

for (left, right), count in cooc:
    # setdefault creates the per-word entry the first time a word is seen
    # and returns the existing one afterwards
    left_row = cooc_dict.setdefault(left, {})
    right_row = cooc_dict.setdefault(right, {})
    # Add the count in both directions; a missing cell starts from 0
    left_row[right] = left_row.get(right, 0) + count
    right_row[left] = right_row.get(left, 0) + count

## Generate the row, col and data variables for DISSECT
##   rows: every word that has an entry in cooc_dict
##   cols: every word that appears as a co-occurring word, in first-seen order
##   data: one "<row word> <col word> <count>" string per stored cell

# Initialize the variables
rows = []
cols = []
data = []

# Words already placed in cols. Testing membership against this set is O(1),
# whereas testing against the cols list is a linear scan repeated for every
# cell of the matrix.
seen_cols = set()

# Loop through the dictionary
for word_1 in cooc_dict:
    # Dictionary keys are unique, so no duplicate check is needed for rows
    rows.append(word_1)
    # Loop through the words that co-occur with word_1
    for word_2 in cooc_dict[word_1]:
        # Register the column the first time it is seen
        if word_2 not in seen_cols:
            seen_cols.add(word_2)
            cols.append(word_2)
        # Add the cell to the data
        data.append(word_1 + " " + word_2 + " " + str(cooc_dict[word_1][word_2]))
        
## Write rows, cols and data to disk, one entry per line

# Base name shared by the three output files
fname = "gutenberg_surface_3"

# (file name, entries) pairs: <fname>.rows, <fname>.cols and <fname>.sm
out = [
    (fname + ".rows", rows),
    (fname + ".cols", cols),
    (fname + ".sm", data),
]

for filename, content in out:
    with open(filename, "w") as out_file:
        for entry in content:
            # Encode the entry as UTF-8 before writing it
            # NOTE(review): writing an encoded byte string to a file opened
            # with "w" relies on Python 2 file semantics -- revisit if porting
            out_file.write(entry.encode('utf8', 'replace'))
            # Terminate the line
            out_file.write("\n")


In [2]:
# Import section for dissect
from composes.semantic_space.space import Space
from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.dim_reduction.svd import Svd
from composes.similarity.cos import CosSimilarity
from composes.utils import scoring_utils

Load the three input files and create the raw space.

In [3]:
# Folder that holds the data files; it is prepended verbatim to the file names,
# so "" resolves them relative to the current working directory
my_path = ""

# Common prefix of the three input files (.sm, .rows, .cols)
space_files = my_path + "gutenberg_surface_3"

# Build the raw space from the "sm" data file plus its row and column lists
my_space = Space.build(data=space_files + ".sm",
                       rows=space_files + ".rows",
                       cols=space_files + ".cols",
                       format="sm")

Progress...1000000
Progress...2000000
Progress...3000000


Transform the space using PPMI and SVD.

In [4]:
# Value passed to every Svd reduction below
svd_dims = 50

# Apply PPMI weighting to the raw space
my_ppmi_space = my_space.apply(PpmiWeighting())

# Reduce dimensions of both the raw space and the PPMI-weighted space
my_svd_space = my_space.apply(Svd(svd_dims))
my_ppmi_svd_space = my_ppmi_space.apply(Svd(svd_dims))

Basic operations with the co-occurrence vector space.

In [6]:
# Comparing similarity between "man" and "woman"
print "Calcilating similarity between man and woman"
print "PPMI and SVD matrix",my_ppmi_svd_space.get_sim("man", "woman", CosSimilarity())

# Comparing the 5 most similar words to "car"
print "Obtaining the 5 most similar words to 'car'"
print "PPMI and SVD matrix\n",my_ppmi_svd_space.get_neighbours("car", 5, CosSimilarity())

# Comparing the similarity with "gold standard"
print "Comparing similarity with 'gold standard'"
fname = my_path + "synonyms.txt"
# Load the pairs
word_pairs = io_utils.read_tuple_list(fname, fields=[0,1])
# Load the score
gold = io_utils.read_list(fname, field=2)
# Predict similarity
predicted_ppmi_svd = [round(sim,2) for sim in my_ppmi_svd_space.get_sims(word_pairs, CosSimilarity())]
print "Pairs:",word_pairs
print "Gold scores",gold
print "\n PPMI and SVD matrix:"
print "Predicted scores",predicted_ppmi_svd
print "Spearman correlation:",scoring_utils.score(gold, predicted_ppmi_svd, "spearman")
print "Pearson correlation:",scoring_utils.score(gold, predicted_ppmi_svd, "pearson")


Calcilating similarity between man and woman
PPMI and SVD matrix 0.783663262385
Obtaining the 5 most similar words to 'car'
PPMI and SVD matrix
[('car', 1.0), ('stick', 0.88739669849128044), ('window', 0.88371037927544349), ('lawn', 0.8766521928187585), ('corner', 0.8755986796085643)]
Comparing similarity with 'gold standard'
Pairs: [('awful', 'terrible'), ('awful', 'great'), ('awful', 'fast'), ('chop', 'cut'), ('chop', 'bake'), ('chop', 'smile'), ('material', 'fabric'), ('material', 'car'), ('material', 'stone')]
Gold scores ['10', '8', '5', '10', '7', '2', '10', '3', '7']

 PPMI and SVD matrix:
Predicted scores [0.85, 0.48, 0.49, 0.42, 0.33, 0.21, 0.31, 0.32, 0.33]
Spearman correlation: 0.418864620531
Pearson correlation: 0.518846556776
