In [41]:
from __future__ import division
import itertools
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import re
import scipy.sparse
import time
import string
from nltk.tokenize import regexp_tokenize, word_tokenize, sent_tokenize
from nltk.probability import FreqDist

from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE as tsne

In [32]:
# Read input data from files
# Load the English CONcreTEXT trial set; the file is tab-separated, hence sep='\t'.
# The frame printed in the next cell shows the columns TARGET, POS, INDEX, TEXT and MEAN.
inp_eng = pd.read_csv('../data/CONcreTEXT_trial_EN.tsv',sep ='\t')


In [33]:
print(inp_eng)

         TARGET POS  INDEX                                               TEXT  \
0   achievement   N      3  Bring up academic achievements , awards , and ...   
1   achievement   N      9  Please list people you have helped , your pers...   
2      activate   V      1     Add activated carbon straight to your vodka .    
3      activate   V     15  Place sensors around your garden , and when a ...   
4     adventure   N      9  Look for a partner that shares your level of a...   
..          ...  ..    ...                                                ...   
95        water   N      5  Rinse your face with warm water and pat it dry .    
96          win   V      4  Staying mentally strong means winning half the...   
97          win   V      7  The person who has the highest score wins the ...   
98        woman   N      7  For the most part , men and women wear the sam...   
99        woman   N      3  Look at the woman whom you are listening to fo...   

    MEAN  
0   3.06  
1   3

In [42]:
# Single-character punctuation marks that are dropped from tokenized text.
punct = list(string.punctuation)


def tokenize_speech(text):
    """Split `text` into word tokens and discard bare punctuation tokens.

    Args:
      text: (string) the sentence to tokenize

    Returns:
      list(string) of the tokens produced by `word_tokenize`, in order, minus
      every token that is exactly one of the entries in `punct`.
    """
    kept = []
    for token in word_tokenize(text):
        if token in punct:
            continue
        kept.append(token)
    return kept

In [43]:
# Tokenize the TEXT column directly. Series.apply calls tokenize_speech once per
# sentence and always yields one list per row, without building a row object for
# every record as DataFrame.apply(axis=1) does.
inp_eng['Tokens'] = inp_eng['TEXT'].apply(tokenize_speech)

In [45]:
corpus = inp_eng['Tokens']
corpus

0     [Bring, up, academic, achievements, awards, an...
1     [Please, list, people, you, have, helped, your...
2     [Add, activated, carbon, straight, to, your, v...
3     [Place, sensors, around, your, garden, and, wh...
4     [Look, for, a, partner, that, shares, your, le...
                            ...                        
95    [Rinse, your, face, with, warm, water, and, pa...
96    [Staying, mentally, strong, means, winning, ha...
97    [The, person, who, has, the, highest, score, w...
98    [For, the, most, part, men, and, women, wear, ...
99    [Look, at, the, woman, whom, you, are, listeni...
Name: Tokens, Length: 100, dtype: object

In [54]:
#@title Utilities
import re
import time
import itertools
import numpy as np

# For pretty-printing
import pandas as pd
from IPython.display import display, HTML

UNK_TOKEN   = u"<unk>"

def flatten(list_of_lists):
    """Concatenate the items of every inner iterable into one flat list."""
    flat = []
    for inner in list_of_lists:
        flat.extend(inner)
    return flat

def pretty_print_matrix(M, rows=None, cols=None, dtype=float, float_fmt="{0:.04f}"):
    """Pretty-print a matrix using Pandas.

    The matrix is wrapped in a DataFrame and shown with `display`. While it is
    being shown, Pandas' float formatter is temporarily replaced by
    `float_fmt`; the previous formatter is put back afterwards, even if
    `display` raises.

    Args:
      M : 2D numpy array
      rows : list of row labels
      cols : list of column labels
      dtype : data type (float or int)
      float_fmt : format specifier for floats
    """
    df = pd.DataFrame(M, index=rows, columns=cols, dtype=dtype)
    # option_context restores the previous 'display.float_format' on exit,
    # including on error, so a failed display cannot leak the formatter into
    # later cells.
    with pd.option_context('display.float_format', float_fmt.format):
        display(df)

def pretty_timedelta(fmt="%d:%02d:%02d", since=None, until=None):
    """Pretty-print a timedelta, using the given format string.

    Args:
      fmt : %-style format string taking (hours, minutes, seconds)
      since : start time in seconds; defaults to the current time
      until : end time in seconds; defaults to the current time

    Returns:
      (string) `fmt` filled in with the elapsed hours, minutes and seconds.
    """
    # Compare against None explicitly: a timestamp of 0 is a valid value and
    # must not be replaced by the current time.
    now = time.time()
    if since is None:
        since = now
    if until is None:
        until = now
    delta_s = until - since
    hours, remainder = divmod(delta_s, 3600)
    minutes, seconds = divmod(remainder, 60)
    return fmt % (hours, minutes, seconds)


##
# Word processing functions
def canonicalize_digits(word):
    """Replace every digit of a purely non-alphabetic token with "DG".

    Tokens containing at least one alphabetic character are returned
    unchanged. If the rewritten token starts with "DG", commas are removed so
    that "1,234" and "1234" map to the same string.
    """
    if any(c.isalpha() for c in word):
        return word
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    word = re.sub(r"\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "")  # remove thousands separator
    return word

def canonicalize_word(word, wordset=None, digits=True):
    """Lowercase a word, optionally canonicalize digits, and map unknowns.

    Args:
      word : (string) the token to canonicalize
      wordset : optional container of known words; when given, a word that is
          not in it (after canonicalization) is replaced by UNK_TOKEN
      digits : if true, run canonicalize_digits on words that are not already
          in `wordset`

    Returns:
      (string) the canonical form of `word`, or UNK_TOKEN.
    """
    word = word.lower()
    if digits:
        # A word that is already known is returned as-is, before its digits
        # are rewritten.
        if (wordset is not None) and (word in wordset):
            return word
        word = canonicalize_digits(word)  # try to canonicalize numbers
    # Identity checks against None: `==` / `!=` would be evaluated element-wise
    # by containers such as numpy arrays and fail in a boolean context.
    if (wordset is None) or (word in wordset):
        return word
    return UNK_TOKEN

def canonicalize_words(words, **kw):
    """Apply canonicalize_word to each word, forwarding `kw` unchanged.

    Returns:
      list(string) with one canonicalized entry per input word, in order.
    """
    canonical = []
    for token in words:
        canonical.append(canonicalize_word(token, **kw))
    return canonical

##
# Data loading functions
def get_corpus(name="brown"):
    """Download the named NLTK corpus and return its reader.

    Args:
      name : (string) attribute name of the corpus under `nltk.corpus`

    Returns:
      The object found at `nltk.corpus.<name>`.

    Raises:
      AssertionError: if `nltk.download(name)` returns a falsy value.
    """
    import nltk
    # The download is a required side effect, so it must not live inside an
    # `assert` statement: those are removed entirely when Python runs with -O.
    # AssertionError is kept so the failure type seen by callers is unchanged.
    if not nltk.download(name):
        raise AssertionError("nltk.download(%r) failed" % (name,))
    return getattr(nltk.corpus, name)

def build_vocab(corpus, V=10000):
    """Build a size-limited vocabulary from the canonicalized words of a corpus.

    Args:
      corpus : object exposing a words() method that yields tokens
      V : (int) vocabulary size handed to the Vocabulary constructor

    Returns:
      vocabulary.Vocabulary built from the canonicalized token stream.
    """
    # NOTE(review): relies on an importable `vocabulary` module; this notebook
    # also defines a Vocabulary class in a later cell -- confirm which is meant.
    import vocabulary
    return vocabulary.Vocabulary(
        (canonicalize_word(word) for word in corpus.words()), size=V)

def get_train_test_sents(corpus, split=0.8, shuffle=True):
    """Generate train/test split for unsupervised tasks.

    Args:
      corpus: nltk.corpus that supports sents() function
      split (double): fraction to use as training set
      shuffle (int or bool): seed for shuffle of input data, or False to just
      take the training data as the first xx% contiguously.

    Returns:
      train_sentences, test_sentences ( list(list(string)) ): the train and test
      splits
    """
    sentences = np.array(list(corpus.sents()), dtype=object)

    if shuffle:
        # `shuffle` doubles as the random seed, so the same value always
        # produces the same ordering.
        rng = np.random.RandomState(shuffle)
        rng.shuffle(sentences)  # in-place

    # Honor the caller's `split` rather than a fixed 0.8.
    split_idx = int(split * len(sentences))
    train_sentences = sentences[:split_idx]
    test_sentences = sentences[split_idx:]

    return train_sentences, test_sentences

def preprocess_sentences(sentences, vocab, use_eos=False, emit_ids=True):
    """Canonicalize sentences, pad them, and flatten into one array.

    Args:
      sentences ( list(list(string)) ): input sentences
      vocab: Vocabulary object, already initialized
      use_eos: if true, each sentence is padded with an end token as well;
          otherwise one extra start token is appended after the last sentence.
      emit_ids: if true, the output holds word ids (int32); otherwise it holds
          the preprocessed tokens themselves (object dtype).

    Returns:
      1D np.array with all padded sentences laid end to end.
    """
    known_words = vocab.word_to_id
    flat = []
    for sentence in sentences:
        # Words outside the vocabulary become the unknown token here.
        cleaned = [canonicalize_word(tok, wordset=known_words)
                   for tok in sentence]
        padded = vocab.pad_sentence(cleaned, use_eos=use_eos)
        if emit_ids:
            flat.extend(vocab.words_to_ids(padded))
        else:
            flat.extend(padded)

    if not use_eos:
        # Close the final sentence with one more start marker.
        flat.append(vocab.START_ID if emit_ids else vocab.START_TOKEN)

    out_dtype = np.int32 if emit_ids else object
    return np.array(flat, dtype=out_dtype)


def load_corpus(corpus, split=0.8, V=10000, shuffle=0):
    """Build a vocabulary for a corpus and return id-encoded train/test data.

    Convenience wrapper that chains get_corpus, build_vocab,
    get_train_test_sents and preprocess_sentences. Sentences are
    canonicalized, converted to ids with the constructed vocabulary, and
    interspersed with <s> tokens that mark sentence boundaries.

    Args:
        corpus: (string | corpus reader) If a string, the NLTK corpus of that
            name is fetched first.
        split: (float) fraction of sentences for the train split; forwarded to
            get_train_test_sents.
        V: (int) vocabulary size (including special tokens)
        shuffle: (int) forwarded to get_train_test_sents, where a truthy value
            is used as the random seed for shuffling sentences before the
            split.

    Returns:
        (vocab, train_ids, test_ids)
        vocab: the vocabulary object returned by build_vocab
        train_ids: flat (1D) np.array(int) of ids
        test_ids: flat (1D) np.array(int) of ids
    """
    reader = get_corpus(corpus) if isinstance(corpus, str) else corpus
    vocab = build_vocab(reader, V)
    train_sents, test_sents = get_train_test_sents(reader, split, shuffle)
    encoded_train = preprocess_sentences(train_sents, vocab)
    encoded_test = preprocess_sentences(test_sents, vocab)
    return vocab, encoded_train, encoded_test

##
# Window and batch functions
def rnnlm_batch_generator(ids, batch_size, max_time):
    """Convert ids to data-matrix form for RNN language modeling.

    Args:
      ids: sequence of ids supporting slicing and .reshape (e.g. np.array)
      batch_size: int, number of rows in each yielded matrix
      max_time: int, maximum number of columns in each yielded matrix

    Yields:
      (input_w, target_y) pairs of matrices with batch_size rows and at most
      max_time columns; target_y holds the id that follows each id of input_w.
    """
    # Clip to a multiple of batch_size so the ids reshape into batch_size rows.
    # Floor division keeps clip_len an int, as required for slicing.
    clip_len = ((len(ids) - 1) // batch_size) * batch_size
    input_w = ids[:clip_len]       # current word
    target_y = ids[1:clip_len+1]   # next word
    # Reshape so we can select columns
    input_w = input_w.reshape([batch_size, -1])
    target_y = target_y.reshape([batch_size, -1])

    # Yield batches
    for i in range(0, input_w.shape[1], max_time):
        yield input_w[:, i:i+max_time], target_y[:, i:i+max_time]


def build_windows(ids, N, shuffle=True):
    """Build window input to the window model.

    Takes a sequence of ids, and returns a data matrix where each row
    is a window and target for the window model. For N=3:
        windows[i] = [w_3, w_2, w_1, w_0]

    For language modeling, N is the context size and you can use y = windows[:,-1]
    as the target words and x = windows[:,:-1] as the contexts.

    For CBOW, N is the window size and you can use y = windows[:,N//2] as the target words
    and x = np.hstack([windows[:,:N//2], windows[:,N//2+1:]]) as the contexts.

    For skip-gram, you can use x = windows[:,N//2] as the input words and y = windows[:,i]
    where i != N//2 as the target words.

    Args:
      ids: np.array(int32) of input ids
      N: int, each window holds N+1 consecutive ids
      shuffle: if true, will randomly shuffle the rows

    Returns:
      windows: integer np.array of shape [len(ids)-N, N+1]
        i.e. each row is a window, of length N+1
    """
    windows = np.zeros((len(ids)-N, N+1), dtype=int)
    for i in range(N+1):
        # First column: first word, etc.
        windows[:, i] = ids[i:len(ids)-(N-i)]
    if shuffle:
        # Shuffle rows
        np.random.shuffle(windows)
    return windows


def batch_generator(data, batch_size):
    """Generate minibatches from data.

    Args:
      data: array-like, supporting slicing along first dimension
      batch_size: int, batch size

    Yields:
      minibatches of maximum size batch_size
    """
    # `range` replaces `xrange`, which does not exist in Python 3.
    for i in range(0, len(data), batch_size):
        yield data[i:i+batch_size]

In [55]:
# Check issue with following code
#    print "Loaded {:,} sentences ({:g} tokens)".format(*fmt)

In [56]:
#@title Vocabulary helper functions
import collections
from collections import defaultdict

class Vocabulary(object):
  """Mapping between words and integer ids, with unigram and bigram counts.

  Ids 0, 1 and 2 are reserved for the special tokens <s>, </s> and <unk>;
  the remaining ids are assigned to words in order of decreasing frequency.
  """

  START_TOKEN = u"<s>"
  END_TOKEN   = u"</s>"
  UNK_TOKEN   = u"<unk>"

  def __init__(self, tokens, size=None):
    """Create a Vocabulary object.

    Args:
        tokens: iterator( string ); it is read exactly once, so one-shot
              generators are supported.
        size: None for unlimited, or int > 0 for a fixed-size vocab.
              Vocabulary size includes special tokens <s>, </s>, and <unk>
    """
    # Count unigrams and bigrams in a single pass. Counting unigrams first with
    # Counter(tokens) would exhaust a generator and leave the bigram table empty.
    self.unigram_counts = collections.Counter()
    self.bigram_counts = defaultdict(lambda: defaultdict(lambda: 0))
    word1 = None
    for word in tokens:
        self.unigram_counts[word] += 1
        # The very first token is recorded with None as its predecessor.
        self.bigram_counts[word1][word] += 1
        word1 = word
    self.bigram_counts.default_factory = None  # make into a normal dict

    # Leave space for "<s>", "</s>", and "<unk>"
    top_counts = self.unigram_counts.most_common(None if size is None else (size - 3))
    vocab = ([self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN] +
             [w for w,c in top_counts])

    # Assign an id to each word, by frequency
    self.id_to_word = dict(enumerate(vocab))
    self.word_to_id = {v:k for k,v in self.id_to_word.items()}
    self.size = len(self.id_to_word)
    if size is not None:
        assert(self.size <= size)

    # For convenience
    self.wordset = set(self.word_to_id.keys())

    # Store special IDs
    self.START_ID = self.word_to_id[self.START_TOKEN]
    self.END_ID = self.word_to_id[self.END_TOKEN]
    self.UNK_ID = self.word_to_id[self.UNK_TOKEN]

  def words_to_ids(self, words):
    """Map words to ids; words not in the vocabulary map to UNK_ID."""
    return [self.word_to_id.get(w, self.UNK_ID) for w in words]

  def ids_to_words(self, ids):
    """Map ids back to words; raises KeyError for an id not in the vocabulary."""
    return [self.id_to_word[i] for i in ids]

  def pad_sentence(self, words, use_eos=True):
    """Return a new list: START_TOKEN, the words, and END_TOKEN if use_eos."""
    ret = [self.START_TOKEN] + words
    if use_eos:
      ret.append(self.END_TOKEN)
    return ret

  def sentence_to_ids(self, words, use_eos=True):
    """Pad a sentence with boundary tokens and convert it to ids."""
    return self.words_to_ids(self.pad_sentence(words, use_eos))

  def ordered_words(self):
    """Return a list of words, ordered by id."""
    return self.ids_to_words(range(self.size))

In [57]:
#@title TSV Corpus Reader
import sys, os

class TSVCorpusReader(object):
    """Corpus reader for TSV files.

    Each line of the input file is one sentence whose tokens are separated by
    tab characters. For example, the file

    foo[tab]bar[tab]baz
    spam[tab]eggs

    is read as the two-sentence corpus:
        ["foo", "bar", "baz"],
        ["spam", "eggs"]

    """

    def __init__(self, sentence_file, preload=True, file_reader=open):
        """Construct a corpus reader for the given file.

        Args:
            sentence_file: (string) path to a TSV file with one sentence per
                line.
            preload: (bool) If true, the whole corpus is read into memory
                now; otherwise the file is read each time it is iterated.
            file_reader: (function string -> fd) replacement for Python's
                built-in open(...), for reading from other file-like
                objects. Its result is used as a context manager.
        """
        self._open = file_reader
        self._sentence_file = sentence_file
        # The cache stays empty until a preload fills it.
        self._sentence_cache = []

        if preload:
            self._sentence_cache = list(self.sents())

    def _line_iterator(self):
        """Yield each line of the file with surrounding whitespace stripped."""
        with self._open(self._sentence_file) as handle:
            for raw_line in handle:
                yield raw_line.strip()

    def sents(self):
        """Iterator over sentences in the corpus.

        Yields:
            list(string) of tokens
        """
        if self._sentence_cache:
            # Serve preloaded sentences without touching the file.
            yield from self._sentence_cache
            return
        for line in self._line_iterator():
            yield line.split("\t")

    def words(self):
        """Iterator over words in the corpus.

        Yields:
            (string) tokens
        """
        for sentence in self.sents():
            yield from sentence

## Creating the vocabulary

Let's now get started with creating the vocabulary. We'll use some of the functions defined in the utility classes we just loaded above.

(Note: the following code cell may take 20-30 seconds to complete running.)

In [58]:
# Create a vocabulary by first canonicalizing all the words -- lowercasing
# and converting all digits to a single string ("DG" per digit). The
# vocabulary maintains a mapping between words and integer ids.
# No size limit is passed, so every distinct canonical word gets an id.
vocab = Vocabulary(canonicalize_word(w)
                   for w in flatten(corpus))
print("Vocabulary: {:,} words".format(vocab.size))

# Turn the corpus into a single flattened array of tokens, where each sentence
# begins with a special marker <s>. With use_eos=False one extra <s> is
# appended after the last sentence; emit_ids=False keeps the tokens as strings.
tokens = preprocess_sentences(corpus, vocab, use_eos=False, emit_ids=False)
print("Corpus: {:,} tokens (counting <s>)".format(len(tokens)))

# Retrieve the ids corresponding to the tokens (above). This is the data
# we'll actually use.
token_ids = vocab.words_to_ids(tokens)
print('Sample words:', tokens[:10])
print('Sample ids:', token_ids[:5])

Vocabulary: 648 words
Corpus: 1,416 tokens (counting <s>)
Sample words: ['<s>' 'bring' 'up' 'academic' 'achievements' 'awards' 'and' 'other'
 'milestones' 'in']
Sample ids: [0, 79, 80, 171, 81]


In [None]:
# A function that produces a sparse co-occurrence matrix given a corpus,
# a vocabulary size V, and K (the context window is +-K).


In [59]:
# A function that produces a sparse co-occurrence matrix given a corpus,
# a vocabulary size V, and K (the context window is +-K).
def co_occurrence_matrix(token_ids, V, K=2):
    """Count how often each pair of ids occurs within K positions of each other.

    Args:
      token_ids: sequence of integer ids, each in [0, V)
      V: vocabulary size; the result has shape (V, V)
      K: half-width of the context window (pairs at distance 1..K are counted)

    Returns:
      Sparse float32 matrix whose entry (a, b) is the number of times id b
      appears within K positions before or after id a.
    """
    counts = scipy.sparse.csc_matrix((V, V), dtype=np.float32)

    for offset in range(1, K + 1):
        print(u'Counting pairs (i, i \u00B1 %d) ...' % offset)
        current = token_ids[:-offset]   # word at position i
        ahead = token_ids[offset:]      # word at position i + offset
        # One entry of 1 per (current, ahead) pair; repeated pairs are summed
        # when the COO matrix is converted to CSC.
        pairs_ahead = scipy.sparse.csc_matrix(
            scipy.sparse.coo_matrix((np.ones_like(current), (current, ahead)),
                                    shape=counts.shape, dtype=np.float32))
        # The transpose supplies the same pairs seen from the later word.
        counts += pairs_ahead + pairs_ahead.T

    print("Co-occurrence matrix: %d words x %d words" % counts.shape)
    print("%.02g nonzero elements" % counts.nnz)
    return counts

In [67]:
# Build the co-occurrence matrix for the English tokens, counting pairs of
# words that are at most K=3 positions apart.
vec_C = co_occurrence_matrix(token_ids, vocab.size, K=3)

# Display a table with the counts. The .toarray() function converts the
# sparse matrix into a dense one. Rows and columns are labelled with the
# vocabulary words in id order, so label k names row/column k.
vec_labels = vocab.ordered_words()
pretty_print_matrix(vec_C.toarray(), rows=vec_labels,
                    cols=vec_labels, dtype=int)

Counting pairs (i, i ± 1) ...
Counting pairs (i, i ± 2) ...
Counting pairs (i, i ± 3) ...
Co-occurrence matrix: 648 words x 648 words
7.1e+03 nonzero elements


Unnamed: 0,<s>,</s>,<unk>,the,you,a,to,your,and,of,...,wins,men,women,wear,same,shoes,woman,whom,she,speaking
<s>,0,0,0,31,22,20,9,19,9,17,...,1,0,0,0,0,1,0,0,1,1
</s>,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
<unk>,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
the,31,0,0,8,11,4,13,10,4,14,...,2,1,1,1,1,0,1,1,0,0
you,22,0,0,11,2,5,6,2,4,2,...,0,0,0,0,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
shoes,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
woman,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
whom,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
she,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


#### Q-1 Write a function to compute the PPMI matrix, which is an n-by-n matrix where each element is the PPMI value between two distinct words. 
#### Test your function using the first 100 sentences of the English language data from our class data files. 

In [68]:
def PPMI(C):
    """Transform a counts matrix to PPMI.

    For every non-zero count C_ij the pointwise mutual information is
        PMI(i, j) = log( C_ij * Z / (Zr_i * Zc_j) )
    where Z is the sum of all counts, Zr_i the sum of row i and Zc_j the sum
    of column j. PPMI keeps max(0, PMI) and drops everything else.

    Args:
      C: scipy.sparse.csc_matrix of counts C_ij

    Returns:
      (scipy.sparse.csc_matrix) float64 matrix of the same shape as C holding
      the positive PMI values; all other entries are (implicit) zeros.
    """
    # Total count.
    Z = float(C.sum())

    # Marginals: sum of each row and of each column. For a symmetric C the two
    # are equal; using the column sums for index j keeps the result correct
    # for non-symmetric count matrices as well.
    Zr = np.array(C.sum(axis=1), dtype=np.float64).flatten()
    Zc = np.array(C.sum(axis=0), dtype=np.float64).flatten()

    # Get indices of relevant elements.
    ii, jj = C.nonzero()  # row, column indices
    Cij = np.array(C[ii,jj], dtype=np.float64).flatten()

    # PMI equation.
    pmi = np.log(Cij * Z / (Zr[ii] * Zc[jj]))

    # Truncate to positive only.
    ppmi = np.maximum(0, pmi)  # take positive only

    # Re-format as sparse matrix.
    ret = scipy.sparse.csc_matrix((ppmi, (ii,jj)), shape=C.shape,
                                  dtype=np.float64)
    ret.eliminate_zeros()  # remove zeros
    return ret

# Display the PPMI'd version of the co-occurrence matrix.
# vec_C and vec_labels come from the English co-occurrence cell above; the
# sparse result is converted to a dense array for display only.
pretty_print_matrix(PPMI(vec_C).toarray(), rows=vec_labels, 
                    cols=vec_labels, dtype=float)

Unnamed: 0,<s>,</s>,<unk>,the,you,a,to,your,and,of,...,wins,men,women,wear,same,shoes,woman,whom,she,speaking
<s>,0.0000,0.0000,0.0000,0.1481,0.1416,0.1894,0.0000,0.2463,0.0000,0.3232,...,0.8572,0.0000,0.0000,0.0000,0.0000,0.8572,0.0000,0.0000,0.8572,1.2627
</s>,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
<unk>,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
the,0.1481,0.0000,0.0000,0.0000,0.0000,0.0000,0.3007,0.0665,0.0000,0.5910,...,2.0124,1.3193,1.3193,1.3193,1.3193,0.0000,1.3193,1.3193,0.0000,0.0000
you,0.1416,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.6558,1.6558,1.6558,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
shoes,0.8572,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,2.0951,...,0.0000,0.0000,0.0000,0.0000,5.4624,0.0000,0.0000,0.0000,0.0000,0.0000
woman,0.0000,0.0000,0.0000,1.3193,1.6558,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,5.4624,0.0000,0.0000
whom,0.0000,0.0000,0.0000,1.3193,1.6558,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,5.4624,0.0000,0.0000,0.0000
she,0.8572,0.0000,0.0000,0.0000,1.6558,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,5.8679


#### Q-2) Briefly describe the algorithm for forming the PPMI matrix. What is the time complexity of your algorithm? Write at least 50 words.

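Ans :- First, I built a word-by-word co-occurrence matrix $C$ that counts how often two words appear within a distance of K tokens of each other. In the above solution, I have used K=3. The PPMI matrix is then computed from $C$ as follows:
       1) Calculate the total of all co-occurrence counts in the matrix, $Z = \sum_{i,j} C_{ij}$.<br>
       2) Calculate the sum of each row across all columns, $Z_i = \sum_{j} C_{ij}$. (The matrix is symmetric, so the row sums and the column sums are equal.) <br>
       3) Get the indices of the non-zero elements; only these pairs were ever observed together. <br>
       4) Compute the PMI for each of those elements using the following formula :- <br>
          $\mathrm{PMI}(i,j) = \log \dfrac{P(i,j)}{P(i)\,P(j)} = \log \dfrac{C_{ij} \cdot Z}{Z_i \cdot Z_j}$ <br>
       5) Keep only the positive values. This is done with a max function, i.e. if the PMI is greater than 0 we keep the PMI value, whereas if the 
          calculated PMI is less than 0 we use 0 for that element. <br>
          $\mathrm{PPMI}(i,j) = \max(0, \mathrm{PMI}(i,j))$  --- Logic for ignoring negatively-correlated pairs <br>
          
Time complexity describes how the number of operations performed by the PPMI algorithm grows with the size of its input.
In the above solution vec_C is a 2-D matrix of 648 rows × 648 columns, hence in the worst case (a matrix with no zero entries) the PMI formula is evaluated 648*648 times, i.e. $O(V^2)$ for a vocabulary of $V$ words.

Hence, the worst-case time complexity can be calculated as O(648 * 648). Because the function only visits the non-zero entries of the sparse matrix (about 7.1e+03 here) plus the $V$ row sums, the work actually done is $O(\mathrm{nnz} + V)$, which is much smaller than $V^2$ for this corpus.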

### Q-3) How would you test if the representation of the words in your PPMI matrix reflects some fact about the relationship between words in the real world? 
###      For example, if two words are expected to co-occur together a lot, the PPMI value should be high (and vice versa). 
###      Write at least 50 words in your answer and give at least 2 pairs of examples from your PPMI matrix.

#### Ans:- In the English language, subjects and verbs are usually placed close to each other. <br>
#### In English, a sentence generally starts with a noun/pronoun or with an article like "The". <br>

#### Now I will compare the PPMI values for various pairs of words. <br>
#### Firstly, I look at the PPMI value for pairs made of a noun and another part of speech, and I observe that the PPMI value is high:- <br>
PPMI value for (shoes,of) is ::  2.0951 <br>
PPMI value for (shoes,same) is ::  5.4624 <br>
PPMI value for (she,speaking) is ::  5.8679 <br>

#### Next, I look at the PPMI value for pairs of subjects, or of a subject and an object, and I observe that the PPMI value is low :- <br>
PPMI value for (shoes,men) is ::  0 <br>
PPMI value for (shoes,she) is ::  0 <br>

#### Now I will compare the PPMI value for pairs of "start of line" and a noun/pronoun, and I observe that the PPMI value is high <br>
PPMI value for ("start of line",she) is :: 0.8572 <br>
PPMI value for ("start of line",shoes) is :: 0.8572 <br>

#### Now I will compare the PPMI value for pairs of "start of line" and a conjunction/preposition, and I observe that the PPMI value is low <br> 
PPMI value for ("start of line",and) is :: 0 <br>
PPMI value for ("start of line",to) is :: 0 <br>

#### From the above examples I observe that the PPMI value is high between words which tend to be close to each other, and low between words which tend to be far away from each other, as per the general rules of English. <br>

#### With this I can conclude that PPMI matrix generated reflects the relationship between words in english language.

### Q-4) Repeat Question 1 for the first 100 sentences in the Italian language data file.

In [69]:
# Load the Italian CONcreTEXT trial set (tab-separated) and tokenize its TEXT
# column. Series.apply calls tokenize_speech once per sentence and always
# yields one list per row, without building a row object for every record as
# DataFrame.apply(axis=1) does.
inp_itl = pd.read_csv('../data/CONcreTEXT_trial_IT.tsv',sep='\t')
inp_itl['Tokens'] = inp_itl['TEXT'].apply(tokenize_speech)
corpus_itl = inp_itl['Tokens']

In [71]:
# Create a vocabulary by first canonicalizing all the words -- lowercasing
# and converting all digits to a single string. The vocabulary maintains a
# mapping between words and integer ids.
vocab_itl = Vocabulary(canonicalize_word(w)
                   for w in flatten(corpus_itl))
print("Vocabulary: {:,} words".format(vocab_itl.size))

# Turn the corpus into a single flattened list of tokens, where each sentence
# begins with a special marker <s>.
# NOTE(review): `tokens` (and `token_ids` below) reuse the names assigned for
# the English corpus, so after this cell runs the English values are gone and
# earlier English cells must be re-run before their outputs can be reproduced.
tokens = preprocess_sentences(corpus_itl, vocab_itl, use_eos=False, emit_ids=False)
print("Corpus: {:,} tokens (counting <s>)".format(len(tokens)))

# Retrieve the ids corresponding to the tokens (above). This is the data
# we'll actually use.
token_ids = vocab_itl.words_to_ids(tokens)
print('Sample words:', tokens[:10])
print('Sample ids:', token_ids[:5])

Vocabulary: 714 words
Corpus: 1,411 tokens (counting <s>)
Sample words: ['<s>' 'guardati' 'i' 'piedi' 'o' 'fai' 'finta' 'di' 'essere'
 'affascinata']
Sample ids: [0, 151, 11, 152, 20]


In [72]:
# Build the co-occurrence matrix for the Italian tokens (window of K=3).
# `token_ids` here must be the Italian ids produced by the previous cell.
vec_itl = co_occurrence_matrix(token_ids, vocab_itl.size, K=3)

# Display a table with the counts. The .toarray() function converts the
# sparse matrix into a dense one.
# NOTE(review): `vec_labels` is reassigned to the Italian vocabulary here, so
# it no longer matches the English matrix `vec_C` from the earlier cell.
vec_labels = vocab_itl.ordered_words()
pretty_print_matrix(vec_itl.toarray(), rows=vec_labels,
                    cols=vec_labels, dtype=int)

Counting pairs (i, i ± 1) ...
Counting pairs (i, i ± 2) ...
Counting pairs (i, i ± 3) ...
Co-occurrence matrix: 714 words x 714 words
7.4e+03 nonzero elements


Unnamed: 0,<s>,</s>,<unk>,di,e,la,un,in,il,per,...,propria,felicità,conigli,hanno,ottimo,udito,ottima,individuare,predatori,facilmente
<s>,0,0,0,12,11,20,13,10,11,7,...,1,1,1,1,0,0,0,0,1,1
</s>,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
<unk>,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
di,12,0,0,8,9,8,8,6,4,4,...,1,0,0,0,0,0,0,0,0,0
e,11,0,0,9,0,5,4,1,0,0,...,0,0,0,0,1,1,2,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
udito,0,0,0,0,1,0,2,0,0,0,...,0,0,0,1,1,0,1,0,0,0
ottima,0,0,0,0,2,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
individuare,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
predatori,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


In [73]:
# Display the PPMI'd version of the co-occurrence matrix.
# Uses the Italian matrix `vec_itl` with the Italian labels assigned to
# `vec_labels` in the previous cell.
pretty_print_matrix(PPMI(vec_itl).toarray(), rows=vec_labels, 
                    cols=vec_labels, dtype=float)

Unnamed: 0,<s>,</s>,<unk>,di,e,la,un,in,il,per,...,propria,felicità,conigli,hanno,ottimo,udito,ottima,individuare,predatori,facilmente
<s>,0.0000,0.0000,0.0000,0.0000,0.0000,0.4155,0.0865,0.1118,0.2071,0.0000,...,0.8537,0.8537,0.8537,0.8537,0.0000,0.0000,0.0000,0.0000,1.0360,1.2592
</s>,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
<unk>,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
di,0.0000,0.0000,0.0000,0.0000,0.2278,0.1725,0.2743,0.2743,0.0000,0.0000,...,1.5271,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
e,0.0000,0.0000,0.0000,0.2278,0.0000,0.1378,0.0165,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,1.9624,1.9624,2.6555,1.9624,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
udito,0.0000,0.0000,0.0000,0.0000,1.9624,0.0000,2.8198,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,5.4589,5.4589,0.0000,5.4589,0.0000,0.0000,0.0000
ottima,0.0000,0.0000,0.0000,0.0000,2.6555,0.0000,2.1267,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,5.4589,0.0000,0.0000,0.0000,0.0000
individuare,0.0000,0.0000,0.0000,0.0000,1.9624,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,5.6412,5.8643
predatori,1.0360,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,5.6412,0.0000,6.0467
