In [2]:
import os
import glob
import gensim
import re
import nltk
from nltk.corpus import stopwords
import numpy as np
import networkx as nx
from pyvis.network import Network
from gensim.models.callbacks import PerplexityMetric, CallbackAny2Vec
import matplotlib.pyplot as plt

In [6]:
import os
import glob

def extract_sentences_with_word(file_path, target_word):
    sentences_with_word = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            sentences = line.split('.') 
            for sentence in sentences:
                if target_word in sentence:
                    sentences_with_word.append(sentence.strip())
    return sentences_with_word

def create_new_files(corpus_folder, target_word):
    files = glob.glob(os.path.join(corpus_folder, '*.txt'))

    for file_path in files:
        sentences_with_word = extract_sentences_with_word(file_path, target_word)
        
        if sentences_with_word:
            new_file_path = os.path.splitext(file_path)[0] + f'_containing_{target_word}.txt'
            
            with open(new_file_path, 'w', encoding='utf-8') as new_file:
                for sentence in sentences_with_word:
                    new_file.write(sentence + '\n')

if __name__ == "__main__":
    corpus_folder = "Corpus_Thesaurus_textfiles\corpus_containing_pavement"
    target_word = "pavement" 

    create_new_files(corpus_folder, target_word)


In [15]:
def extract_sentences_with_word(file_path, target_word):
    sentences_with_word = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            tokens = line.strip().split()  
            sentence = ' '.join(tokens) 
            if target_word in sentence:
                sentences_with_word.append(sentence)
    return sentences_with_word

def create_new_files(input_folder, output_folder, target_word):
    files = glob.glob(os.path.join(input_folder, '*.txt'))

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for file_path in files:
        sentences_with_word = extract_sentences_with_word(file_path, target_word)
        
        if sentences_with_word:
            new_file_path = os.path.join(output_folder, os.path.basename(file_path)[:-4] + f'_containing_{target_word}.txt')
            
            with open(new_file_path, 'w', encoding='utf-8') as new_file:
                for sentence in sentences_with_word:
                    new_file.write(sentence + '\n')

if __name__ == "__main__":
    input_corpus_folder = "Corpus_Thesaurus_textfiles\corpus_number_removed"
    output_corpus_folder = "Corpus_Thesaurus_textfiles\corpus_containing_pavement"
    target_word = "pavement"  

    create_new_files(input_corpus_folder, output_corpus_folder, target_word)


In [16]:
import os
import gensim
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from gensim.models.callbacks import PerplexityMetric, CallbackAny2Vec

class PerplexityCallback(CallbackAny2Vec):
    def __init__(self, corpus, *args, **kwargs):
        super().__init__(self, *args, **kwargs)
        self.corpus = corpus
        self.perplexity_values = []

    def on_epoch_end(self, model):
        perplexity = PerplexityMetric(corpus=self.corpus)
        perplexity_value = perplexity.get_perplexity()
        self.perplexity_values.append(perplexity_value)
        print(f"Epoch #{self.epoch} - Perplexity: {perplexity_value:.2f}")

stop_words = set(stopwords.words('english'))

synonyms = {}

corpus_folder_path = 'Corpus_Thesaurus_textfiles\corpus_containing_pavement'
corpus_sentences = []

for file_name in os.listdir(corpus_folder_path):
    if file_name.endswith('.txt'):
        file_path = os.path.join(corpus_folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            sentences = nltk.sent_tokenize(text) 
            for sentence in sentences:
                words = nltk.word_tokenize(sentence.lower())
                words = [word for word in words if word not in stop_words] 
                corpus_sentences.append(words)

model = gensim.models.Word2Vec(
    corpus_sentences, vector_size=200, window=5,
    min_count=3, workers=4, sg=1, epochs=100,
    callbacks=[PerplexityCallback(corpus=corpus_sentences, progress_per=10)]
)

for word in model.wv.index_to_key:
    similar_words = model.wv.most_similar(word, topn=10)
    synonyms[word] = [(similar_word[0], similar_word[1]) for similar_word in similar_words]

output_file = 'd_r_thesaurus_containing_pavement.txt'
with open(output_file, 'w', encoding='utf-8') as file:
    for word in synonyms:
        similar_words = synonyms[word]
        line = f'{word}: ' + ', '.join([f'{similar_word[0]}({similar_word[1]:.2f})' for similar_word in similar_words])
        file.write(line + '\n')

print("Distributed Representation-based Thesaurus Construction is completed.")

epochs = range(1, len(model.callbacks[0].perplexity_values) + 1)
perplexity_values = model.callbacks[0].perplexity_values

plt.plot(epochs, perplexity_values, marker='o')
plt.title('Perplexity over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Perplexity')
plt.show()


TypeError: object.__init__() takes exactly one argument (the instance to initialize)

In [19]:
stop_words = set(stopwords.words('english'))

synonyms = {}

corpus_folder_path = 'Corpus_Thesaurus_textfiles\corpus_containing_pavement'

corpus_sentences = []

for file_name in os.listdir(corpus_folder_path):
    if file_name.endswith('.txt'):
        file_path = os.path.join(corpus_folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            sentences = nltk.sent_tokenize(text) 
            for sentence in sentences:
                words = nltk.word_tokenize(sentence.lower())
                words = [word for word in words if word not in stop_words] 
                corpus_sentences.append(words)

model = gensim.models.Word2Vec(corpus_sentences, vector_size=200, window=5, min_count=3, workers=4, sg=1, epochs=100)


for word in model.wv.index_to_key:
    similar_words = model.wv.most_similar(word, topn=10)
    synonyms[word] = [(similar_word[0], similar_word[1]) for similar_word in similar_words]

output_file = 'd_r_thesaurus_containing_pavement.txt'
with open(output_file, 'w', encoding='utf-8') as file:
    for word in synonyms:
        similar_words = synonyms[word]
        line = f'{word}: ' + ', '.join([f'{similar_word[0]}({similar_word[1]:.2f})' for similar_word in similar_words])
        file.write(line + '\n')


KeyboardInterrupt: 