#### Importing the libraries

In [87]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
import nltk
from collections import defaultdict

----------------------------

#### Processing the data

In [164]:
# Here I have removed the space occured in the words having apostrophe using replace
english = pd.read_csv("CONcreTEXT_trial_EN.tsv", sep='\t').replace(" '","'", regex=True).replace(" ’","’", regex=True)
tokenizer = RegexpTokenizer(r"\w+")
englistText=[]
for str in english["TEXT"]:
    temp=tokenizer.tokenize(str.lower())
    temp1=' '.join(temp)
    englistText.append(temp1)

#### Question 1: Write a function to compute the PPMI matrix, which is a n-by-n matrix where each element is the PPMI value between two distinct words. Test your function using the first 100 sentences of the English language data from our class data files. 

In [103]:
def co_occurance_matrix(sent, windowSize):
    d = defaultdict(int)
    vocab = set()
    for text in sent:
        text = text.lower().split()
        for i in range(len(text)):
            token = text[i]
            vocab.add(token)
            next_token = text[i+1 : i+1+windowSize]
            for t in next_token:
                key = tuple( sorted([t, token]) )
                d[key] += 1

    vocab = sorted(vocab)
    df = pd.DataFrame(data=np.zeros((len(vocab), len(vocab)), dtype=np.int16),
                      index=vocab,
                      columns=vocab)
    for key, value in d.items():
        df.at[key[0], key[1]] = value
        df.at[key[1], key[0]] = value
    return df

In [151]:
english_df = co_occurance_matrix(englishText, 3)

In [116]:
def generate_ppmi(df, positive=True):
    column_totals = df.sum(axis=0)
    total = column_totals.sum()
    row_totals = df.sum(axis=1)
    expected = np.outer(row_totals, column_totals) / total
    df = df / expected
    with np.errstate(divide='ignore'):
        df = np.log(df)
    df[np.isinf(df)] = 0.0
    if positive:
        df[df < 0] = 0.0
    return df

In [155]:
english_ppmi = generate_ppmi(english_df, positive=True)

In [162]:
english_ppmi

Unnamed: 0,",",-,.,30,:,a,about,academic,achievements,across,...,wrist,years,you,you'll,you're,you've,your,yourself,you’re,you’ve
",",0.000000,0.0,0.000000,0.0,1.573577,0.037905,0.000000,1.573577,2.084402,0.0,...,0.000000,0.00000,0.000000,1.391255,0.0,0.698108,0.075070,0.00000,0.0,1.391255
-,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.00000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000
.,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,1.606367,1.82951,0.129825,0.000000,0.0,0.000000,0.225643,1.82951,0.0,0.000000
30,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.00000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000
:,1.573577,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.00000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
you've,0.698108,0.0,0.000000,0.0,0.000000,0.000000,3.138843,0.000000,0.000000,0.0,...,0.000000,0.00000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000
your,0.075070,0.0,0.225643,0.0,0.000000,0.000000,1.534977,0.000000,1.247294,0.0,...,0.000000,0.00000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000
yourself,0.000000,0.0,1.829510,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.00000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000
you’re,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.00000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000


#### Question 2:  Briefly describe the algorithm for forming the PPMI matrix. What is the time complexity of your algorithm? Write at least 50 words.

We first create a function which generates cooccurance matrix by taking the dataset and windowsize as the parameters. Then we initialize a dict and vocab to store the token counts and distinct tokens respectively. Then we iterate over the sentences(received as a parameter) and store the tokens and vocab. Then we formulate the dictionary in the dataframe using "DataFrame" function and generate the dataframe and return it back as the answer to the function created.
After we generate the cooccurance matric, we then pass it to a new function to generate the ppmi. First we get the rows and columns required for the ppmi, and generate the expected count using "Numpy.outer" function. Then we check if the value if infinity using "Numpy.isinf" function and result it to 0 if found.
Then we print the obtained PPMI matrix. 
The time complexity of the algorithm is O(n^m) where n is the number of sentences and m is the maximum length of a sentence

#### Question 3:  How would you test if the representation of the words in your PPMI matrix reflects some fact about the relationship between words in the real world? For example, if two words are expected to co-occur together a lot, the PPMI value should be high (and vice versa). 

I would like to take two sets of words "Big" and "Car", as both of them appear a lot, their ppmi is high. 
On the opposite side of it I would like to take set of words "your" and "academic", both have the lowst ppmi as both of them does not co-occur in the corpus. 
Even though the words likely tend to occur in real world, the training data set doesn't have this sentences with this two words as expected. So may be by taking enormous amount of data, we might get the output as expected, rather that what we are obtaining results now.

#### Question 4:  Repeat Question 1 for the first 100 sentences in the Italian language data file.

In [145]:
italian = pd.read_csv("CONcreTEXT_trial_IT.tsv", sep='\t').replace(" '","'", regex=True).replace(" ’","’", regex=True)
tokenizer = RegexpTokenizer(r"\w+")
italianText=[]
for str in italian["TEXT"]:
    temp=tokenizer.tokenize(str.lower())
    temp1=' '.join(temp)
    italianText.append(temp1)

In [146]:
def co_occurance_matrix(sent, windowSize):
    d = defaultdict(int)
    vocab = set()
    for text in sent:
        text = text.lower().split()
        for i in range(len(text)):
            token = text[i]
            vocab.add(token)
            next_token = text[i+1 : i+1+windowSize]
            for t in next_token:
                key = tuple( sorted([t, token]) )
                d[key] += 1

    vocab = sorted(vocab)
    df = pd.DataFrame(data=np.zeros((len(vocab), len(vocab)), dtype=np.int16),
                      index=vocab,
                      columns=vocab)
    for key, value in d.items():
        df.at[key[0], key[1]] = value
        df.at[key[1], key[0]] = value
    return df



In [157]:
italian_df = co_occurance_matrix(italianText, 3)

In [148]:
def generate_ppmi(df, positive=True):
    column_totals = df.sum(axis=0)
    total = column_totals.sum()
    row_totals = df.sum(axis=1)
    expected = np.outer(row_totals, column_totals) / total
    df = df / expected
    with np.errstate(divide='ignore'):
        df = np.log(df)
    df[np.isinf(df)] = 0.0
    if positive:
        df[df < 0] = 0.0
    return df

In [158]:
italian_ppmi = generate_ppmi(italian_df, positive=True)

In [159]:
italian_ppmi

Unnamed: 0,125,250,30,a,abilità,abito,abitudinario,accendi,accesso,accettare,...,vita,vivi,vodka,volpe,volta,volte,vuoi,vuole,zentangle,è
125,0.000000,5.214936,0.0,2.529358,0.0,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,2.495836
250,5.214936,0.000000,0.0,2.529358,0.0,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
30,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
a,2.529358,2.529358,0.0,0.000000,0.0,0.000000,0.000000,0.0,2.529358,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
abilità,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,4.991792,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
volte,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
vuoi,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
vuole,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
zentangle,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
