In [1]:
import os
import glob
import gensim
import re
import nltk
from nltk.corpus import stopwords
import numpy as np
from tqdm import tqdm
import networkx as nx
from pyvis.network import Network
from gensim.models.callbacks import PerplexityMetric, CallbackAny2Vec
import matplotlib.pyplot as plt

In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/gpu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/gpu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Extract sentences containing the word 'pavement' to create new corpus files

In [8]:
def extract_sentences_with_word(file_path, target_word):
    """Return the lines of a UTF-8 text file that contain ``target_word``.

    Each line is whitespace-normalised first (leading/trailing whitespace
    dropped, inner runs collapsed to a single space). The match is a plain,
    case-sensitive substring test, so ``'pavement'`` also matches
    ``'pavements'``.

    Args:
        file_path: Path of the text file to scan.
        target_word: Substring to look for in each normalised line.

    Returns:
        List of matching normalised lines, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # str.split() with no argument already ignores surrounding whitespace.
        normalised_lines = (' '.join(raw_line.split()) for raw_line in handle)
        return [text for text in normalised_lines if target_word in text]

def create_new_files(input_folder, output_folder, target_word):
    """Write, for every ``*.txt`` file in ``input_folder``, a filtered copy
    holding only the lines that contain ``target_word``.

    Output files are named ``<original stem>_containing_<target_word>.txt``
    and placed in ``output_folder`` (created if missing). Input files with no
    matching line produce no output file.

    Args:
        input_folder: Directory scanned (non-recursively) for ``*.txt`` files.
        output_folder: Directory that receives the filtered files.
        target_word: Substring passed to ``extract_sentences_with_word``.
    """
    files = glob.glob(os.path.join(input_folder, '*.txt'))

    # exist_ok=True replaces the racy "check, then create" pair.
    os.makedirs(output_folder, exist_ok=True)

    for file_path in files:
        sentences_with_word = extract_sentences_with_word(file_path, target_word)
        if not sentences_with_word:
            continue

        # splitext strips the extension without relying on its length.
        stem = os.path.splitext(os.path.basename(file_path))[0]
        new_file_path = os.path.join(output_folder, f'{stem}_containing_{target_word}.txt')

        with open(new_file_path, 'w', encoding='utf-8') as new_file:
            new_file.writelines(sentence + '\n' for sentence in sentences_with_word)

# Driver: build the 'pavement' sub-corpus from the source corpus folder.
# In a notebook kernel __name__ is "__main__", so this guard always passes here.
if __name__ == "__main__":
    # Folder scanned for source *.txt files (relative to the working directory).
    input_corpus_folder = "Corpus_Thesaurus_textfiles/corpus_number_removed"
    # Destination folder; created by create_new_files when it does not exist.
    output_corpus_folder = "Val_Textfiles/val_corpus/corpus_containing_pavement"
    # Matched as a plain substring of each line.
    target_word = "pavement"  

    create_new_files(input_corpus_folder, output_corpus_folder, target_word)


In [6]:
# Train a skip-gram Word2Vec model on the 'pavement' sub-corpus and write, for
# every vocabulary word, its 10 nearest neighbours to a plain-text thesaurus.


class _EpochProgress(CallbackAny2Vec):
    """Advance a tqdm progress bar once per completed training epoch."""

    def __init__(self, total_epochs):
        self._bar = tqdm(total=total_epochs, desc="Training Progress")

    def on_epoch_end(self, model):
        self._bar.update(1)

    def on_train_end(self, model):
        self._bar.close()


stop_words = set(stopwords.words('english'))

synonyms = {}

corpus_folder_path = 'Val_Textfiles/val_corpus/corpus_containing_pavement'

corpus_sentences = []

# sorted() gives a stable file order regardless of the filesystem's listing order.
for file_name in sorted(os.listdir(corpus_folder_path)):
    if not file_name.endswith('.txt'):
        continue
    file_path = os.path.join(corpus_folder_path, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    for sentence in nltk.sent_tokenize(text):
        words = nltk.word_tokenize(sentence.lower())
        # Only English stop words are removed; punctuation tokens are kept as-is.
        corpus_sentences.append([word for word in words if word not in stop_words])

epochs = 300

# Build the model without a corpus so that training happens exactly once.
# Passing the corpus to the constructor already trains for `epochs` passes;
# an extra loop of train(..., epochs=1) calls would double the training and
# restart the learning-rate decay on every call.
model = gensim.models.Word2Vec(vector_size=200, window=5,
                               min_count=3, workers=4, sg=1, epochs=epochs)
model.build_vocab(corpus_sentences)
model.train(corpus_sentences, total_examples=model.corpus_count,
            epochs=model.epochs, callbacks=[_EpochProgress(epochs)])

for word in model.wv.index_to_key:
    # most_similar yields (neighbour, cosine similarity) pairs, best first.
    synonyms[word] = [(neighbour, score)
                      for neighbour, score in model.wv.most_similar(word, topn=10)]

# NOTE(review): the file name says "_500" while `epochs` is 300, and the
# filtering cell below reads a "_300" file from Val_Textfiles/val_thesaurus/ --
# confirm which name/location is intended.
output_file = 'd_r_thesaurus_containing_pavement_500.txt'
with open(output_file, 'w', encoding='utf-8') as file:
    for word, similar_words in synonyms.items():
        # Line format: "<word>: <neighbour>(<score>), <neighbour>(<score>), ..."
        line = f'{word}: ' + ', '.join(f'{neighbour}({score:.2f})'
                                       for neighbour, score in similar_words)
        file.write(line + '\n')

Training Progress: 100%|██████████| 300/300 [04:12<00:00,  1.19it/s]


In [14]:
# Keep only the thesaurus neighbours whose similarity reaches the threshold and
# write the surviving entries to a new thesaurus file.
embedding_thesaurus_file = 'Val_Textfiles/val_thesaurus/d_r_thesaurus_containing_pavement_300.txt'

new_embedding_thesaurus_file = 'd_r_thesaurus_containing_pavement_300_0.5.txt'
similarity_threshold = 0.5  # minimum similarity for a neighbour to be kept
selected_entries = []

with open(embedding_thesaurus_file, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        # Expected line format: "<word>: <neighbour>(<score>), <neighbour>(<score>), ..."
        word, separator, synonyms_str = line.partition(': ')
        if not separator or not synonyms_str:
            # Blank line or a head word with no neighbours: nothing to filter.
            continue
        selected_synonyms = []
        for synonym in synonyms_str.split(', '):
            # Split on the LAST '(' so a term that itself contains '('
            # (e.g. a bare "(" token) still parses.
            synonym_word, similarity = synonym.rsplit('(', 1)
            similarity = float(similarity[:-1])  # drop the trailing ')'
            if similarity >= similarity_threshold:
                selected_synonyms.append(f"{synonym_word}({similarity})")
        if selected_synonyms:
            selected_entries.append(f"{word}: {', '.join(selected_synonyms)}")

with open(new_embedding_thesaurus_file, 'w', encoding='utf-8') as file:
    file.writelines(entry + '\n' for entry in selected_entries)

In [16]:
# Turn the filtered thesaurus into an undirected weighted graph and render it
# as an interactive pyvis HTML page.
thesaurus_file = 'd_r_thesaurus_containing_pavement_300_0.5.txt'

G = nx.Graph()

with open(thesaurus_file, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        # Expected line format: "<word>: <neighbour>(<score>), <neighbour>(<score>), ..."
        word, separator, synonyms_str = line.partition(': ')
        if not separator or not synonyms_str:
            # Blank line or entry without neighbours: no node/edges to add.
            continue
        G.add_node(word)
        for synonym in synonyms_str.split(', '):
            # Split on the LAST '(' so a term that itself contains '(' still parses.
            synonym_word, similarity = synonym.rsplit('(', 1)
            similarity = float(similarity[:-1])  # drop the trailing ')'
            # Undirected graph: a repeated pair overwrites the earlier weight.
            G.add_edge(word, synonym_word, weight=similarity)

# NOTE(review): `layout` is not used by the pyvis rendering below; kept in case
# a later cell reads it -- confirm, and drop it otherwise.
layout = nx.spring_layout(G, seed=42)

nt = Network(notebook=True, height='800px', width='100%')

for node in G.nodes():
    nt.add_node(node, label=node, title=node)

for source, target, weight in G.edges(data='weight'):
    nt.add_edge(source, target, value=weight, title=f'Weight: {weight}')

output_file = "d_r_thesaurus_300_0.5_network.html"

nt.show(output_file)

d_r_thesaurus_300_0.5_network.html
