#### 1. Generate a word-context matrix for a given corpus and window size.

In [49]:
# Numerical/tabular libraries, NLTK, and the regex module used by the cells below.
import numpy as np 
import pandas as pd 
import nltk
import re
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK 'stopwords' and 'wordnet' resources
# (the log below reports them as already up to date when present locally).
nltk.download('stopwords')
nltk.download('wordnet')

# Corpus reader that exposes the stopword lists.
from nltk.corpus import stopwords

# English stopwords; preprocessing() drops any token found in this list.
stoplist = stopwords.words('english')

# Shared WordNet lemmatizer instance; preprocessing() uses it on each kept token.
lemmatizer=WordNetLemmatizer()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vicky_gupta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vicky_gupta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [50]:
# Read the whole corpus into memory. The context manager closes the file
# handle even if read() raises, which the manual open()/close() pair did not.
with open("corpus.txt","r") as file:
    corpus=file.read()
print(corpus)

The cat sat on the mat.
She opened the door and let the dog in.
Birds chirped in the trees.
The sun set behind the mountains.
He walked along the beach and listened to the waves.
Children played in the park.
She cooked dinner in the kitchen.
The rain pattered against the window.
The old house creaked in the wind.
He read a book by the fireplace.


#### Preprocessing

In [51]:
def preprocessing(x):
    """Turn raw text into a list of lemmatized, stopword-free tokens.

    Steps, in order:
      1. lower-case the text;
      2. replace every character that is not an ASCII letter with a space;
      3. split on whitespace;
      4. drop tokens found in the module-level ``stoplist``;
      5. lemmatize each remaining token with the module-level ``lemmatizer``.

    Note that the stopword check is applied to the token *before* it is
    lemmatized.

    Parameters
    ----------
    x : str
        Text to process.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", x.lower())

    lemmas = []
    for token in letters_only.split():
        if token in stoplist:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [52]:
def generate_word_context_matrix(corpus, window_size):
    """Count how often each word co-occurs with its neighbours.

    For every position in ``corpus`` the tokens at most ``window_size``
    positions to the left and to the right (excluding the position itself)
    are treated as context, and each one increments the count stored under
    ``matrix[word][context_word]``.

    Parameters
    ----------
    corpus : sequence
        Indexable sequence of tokens (e.g. the list returned by preprocessing).
    window_size : int
        Number of positions to look at on each side of the current token.

    Returns
    -------
    dict
        Nested mapping ``{word: {context_word: count}}``. A word only gets an
        entry once at least one context token has been seen for it, so an
        empty or single-token corpus yields an empty dict.
    """
    matrix = {}
    total = len(corpus)

    for center, word in enumerate(corpus):
        # Clamp the window to the bounds of the corpus.
        start = max(0, center - window_size)
        stop = min(total, center + window_size + 1)

        for position in range(start, stop):
            if position == center:
                continue  # a token is not its own context

            neighbour = corpus[position]
            row = matrix.setdefault(word, {})
            row[neighbour] = row.get(neighbour, 0) + 1

    return matrix




In [53]:
# NOTE: `corpus` is rebound from the raw text to its token list, so this cell
# cannot be re-run on its own (preprocessing() calls .lower() on its argument);
# re-run the cell that reads corpus.txt first.
corpus = preprocessing(corpus)
word_context_matrix = generate_word_context_matrix(corpus, 2)

# Build a DataFrame from the nested dict: each word becomes a column and its
# context words become the row labels. Pairs that never co-occur come out as
# NaN and are replaced with 0.
df = pd.DataFrame(word_context_matrix).fillna(0)

print(df)

           cat  sat  mat  opened  door  let  dog  bird  chirped  tree  ...  \
sat        1.0  0.0  1.0     1.0   0.0  0.0  0.0   0.0      0.0   0.0  ...   
mat        1.0  1.0  0.0     1.0   1.0  0.0  0.0   0.0      0.0   0.0  ...   
cat        0.0  1.0  1.0     0.0   0.0  0.0  0.0   0.0      0.0   0.0  ...   
opened     0.0  1.0  1.0     0.0   1.0  1.0  0.0   0.0      0.0   0.0  ...   
door       0.0  0.0  1.0     1.0   0.0  1.0  1.0   0.0      0.0   0.0  ...   
let        0.0  0.0  0.0     1.0   1.0  0.0  1.0   1.0      0.0   0.0  ...   
dog        0.0  0.0  0.0     0.0   1.0  1.0  0.0   1.0      1.0   0.0  ...   
bird       0.0  0.0  0.0     0.0   0.0  1.0  1.0   0.0      1.0   1.0  ...   
chirped    0.0  0.0  0.0     0.0   0.0  0.0  1.0   1.0      0.0   1.0  ...   
tree       0.0  0.0  0.0     0.0   0.0  0.0  0.0   1.0      1.0   0.0  ...   
sun        0.0  0.0  0.0     0.0   0.0  0.0  0.0   0.0      1.0   1.0  ...   
set        0.0  0.0  0.0     0.0   0.0  0.0  0.0   0.0      0.0 

#### 2. Find the similarity between two words using the word-context matrix.

In [54]:
def cosine_similarity(word1, word2, word_context_matrix):
    """Return the cosine similarity between the context vectors of two words.

    Parameters
    ----------
    word1, word2 : hashable
        The words to compare.
    word_context_matrix : dict of dict, or pandas.DataFrame
        Either the nested ``{word: {context_word: count}}`` mapping produced by
        ``generate_word_context_matrix`` or a DataFrame whose columns are words
        and whose index holds the context words.

    Returns
    -------
    float
        Cosine of the angle between the two context-count vectors. ``0.0`` is
        returned when either word is absent from the matrix or when either
        vector has zero length (instead of dividing by zero and yielding NaN).
    """
    if word1 not in word_context_matrix or word2 not in word_context_matrix:
        return 0.0

    # .items() yields (context_word, count) pairs for a plain dict as well as
    # for a pandas Series (a DataFrame column), so both inputs are supported.
    counts1 = dict(word_context_matrix[word1].items())
    counts2 = dict(word_context_matrix[word2].items())

    # Align both vectors on the same, deterministically ordered set of context
    # words; a context word missing for one of the words counts as 0.
    contexts = list(counts1) + [c for c in counts2 if c not in counts1]
    vector1 = np.array([counts1.get(c, 0) for c in contexts], dtype=float)
    vector2 = np.array([counts2.get(c, 0) for c in contexts], dtype=float)

    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if norm_product == 0:
        # At least one all-zero (or empty) vector: similarity is undefined.
        return 0.0

    return float(np.dot(vector1, vector2) / norm_product)


In [55]:
# Cosine similarity of the "cat" and "mat" columns of the zero-filled DataFrame `df`.
print(cosine_similarity("cat", "mat", df))

0.35355339059327373
