In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [4]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.set_device(0)
print('Device:', device)
print('Current cuda device:', torch.cuda.current_device())
print('Count of using GPUs:', torch.cuda.device_count())

Device: cuda
Current cuda device: 0
Count of using GPUs: 1


In [6]:
import os
import glob
import gensim
import re
import nltk
from nltk.corpus import stopwords
import numpy as np
from tqdm import tqdm
import networkx as nx
from pyvis.network import Network
from gensim.models.callbacks import PerplexityMetric, CallbackAny2Vec
import matplotlib.pyplot as plt
from nltk.corpus import wordnet
from nltk import word_tokenize, pos_tag
from sentence_transformers import SentenceTransformer, util


In [None]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
def extract_sentences_with_word(file_path, target_word):
    sentences_with_word = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            tokens = line.strip().split()  
            sentence = ' '.join(tokens) 
            if target_word in sentence:
                sentences_with_word.append(sentence)
    return sentences_with_word

def create_new_files(input_folder, output_folder, target_word):
    files = glob.glob(os.path.join(input_folder, '*.txt'))

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for file_path in files:
        sentences_with_word = extract_sentences_with_word(file_path, target_word)
        
        if sentences_with_word:
            new_file_path = os.path.join(output_folder, os.path.basename(file_path)[:-4] + f'_containing_{target_word}.txt')
            
            with open(new_file_path, 'w', encoding='utf-8') as new_file:
                for sentence in sentences_with_word:
                    new_file.write(sentence + '\n')

if __name__ == "__main__":
    input_corpus_folder = "Corpus_Thesaurus_textfiles/corpus_number_removed"
    output_corpus_folder = "Val_Textfiles/val_corpus/corpus_containing_definition"
    target_word = "definition"  

    create_new_files(input_corpus_folder, output_corpus_folder, target_word)

Use first files in each corpus(OG) for calculating sentence similarity score

In [7]:
from tqdm import tqdm

def get_sentences_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
        return [line.strip() for line in lines if line.strip()]

def calculate_and_save_similarity(file_path_a, file_path_b, output_file_path):
    sentences_a = get_sentences_from_file(file_path_a)
    sentences_b = get_sentences_from_file(file_path_b)

    model = SentenceTransformer('paraphrase-MiniLM-L6-v2').cuda()

    # exit()
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        total_similarities = len(sentences_a) * len(sentences_b)
        pbar = tqdm(total=total_similarities, desc="Calculating Similarities")

        for sentence_a in sentences_a:
            for sentence_b in sentences_b:
                embeddings_a = model.encode(sentence_a, convert_to_tensor=True)
                embeddings_b = model.encode(sentence_b, convert_to_tensor=True)

                similarity_score = util.pytorch_cos_sim(embeddings_a, embeddings_b).item()

                output_file.write(f"Similarity between '{sentence_a}' and '{sentence_b}': {similarity_score:.4f}\n")
                pbar.update(1)

        pbar.close()

file_path_a = 'Val_Textfiles/val_corpus/corpus_definition_onlyfirstfile/Australia_Northern Territory_2017_containing_definition.txt'
file_path_b = 'Val_Textfiles/val_corpus/corpus_og_onlyfirstfile/Australia_Northern Territory_2017.txt'
output_file_path = '0_evaluation_with_S-BERT_sentences.txt'

calculate_and_save_similarity(file_path_a, file_path_b, output_file_path)

Calculating Similarities: 100%|██████████| 80283/80283 [20:36<00:00, 64.93it/s]



Replace each word with a similar word

In [None]:
import os

thesaurus_path = 'Corpus_Thesaurus_textfiles/d_r_based_thesaurus_files/d_r_based_thesaurus_20015_epoch500.txt'
with open(thesaurus_path, 'r', encoding='utf-8') as thesaurus_file:
    thesaurus_data = {}
    for line in thesaurus_file:
        parts = line.split(':')
        base_word = parts[0].strip()
        synonyms_with_similarity = [syn.strip().split('(') for syn in parts[1].split(',')]
        synonyms = [(syn[0], float(syn[1].replace(')', ''))) for syn in synonyms_with_similarity]
        thesaurus_data[base_word] = synonyms

def replace_words_with_thesaurus(word, thesaurus, similarity_threshold=0):
    best_synonym = word
    best_similarity = similarity_threshold 

    for synonym, similarity in thesaurus:
        if similarity > best_similarity:
            best_similarity = similarity
            best_synonym = synonym

    return best_synonym

def process_file(file_path, thesaurus):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read().lower()
        sentences = text.split('\n')  
        replaced_sentences = []

        for sentence in sentences:
            replaced_words = [replace_words_with_thesaurus(word, thesaurus.get(word, [(word, 1.0)])) for word in sentence.split()]
            replaced_sentence = ' '.join(replaced_words).strip()
            if replaced_sentence:
                replaced_sentences.append(replaced_sentence)

        new_file_path = os.path.join(new_folder_path, 'Replaced_' + os.path.basename(file_path)) 
        with open(new_file_path, 'w', encoding='utf-8') as new_file:
            new_file.write('\n'.join(replaced_sentences))

new_folder_path = 'Val_Textfiles/val_corpus/switched_corpus_only_definition'
folder_path = 'Val_Textfiles/val_corpus/corpus_definition_onlyfirstfile'

for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)
    if os.path.isfile(file_path):
        process_file(file_path, thesaurus_data)

In [None]:
import os

thesaurus_path = 'Corpus_Thesaurus_textfiles/d_r_based_thesaurus_files/d_r_based_thesaurus_20015_epoch500.txt'
with open(thesaurus_path, 'r', encoding='utf-8') as thesaurus_file:
    thesaurus_data = {}
    for line in thesaurus_file:
        parts = line.split(':')
        base_word = parts[0].strip()
        synonyms_with_similarity = [syn.strip().split('(') for syn in parts[1].split(',')]
        synonyms = [(syn[0], float(syn[1].replace(')', ''))) for syn in synonyms_with_similarity]
        thesaurus_data[base_word] = synonyms

def replace_words_with_thesaurus(word, thesaurus, similarity_threshold=0):
    best_synonym = word
    best_similarity = similarity_threshold 

    for synonym, similarity in thesaurus:
        if similarity > best_similarity:
            best_similarity = similarity
            best_synonym = synonym

    return best_synonym

def process_file(file_path, thesaurus):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read().lower()
        sentences = text.split('\n')  
        replaced_sentences = []

        for sentence in sentences:
            replaced_words = [replace_words_with_thesaurus(word, thesaurus.get(word, [(word, 1.0)])) for word in sentence.split()]
            replaced_sentence = ' '.join(replaced_words).strip()
            if replaced_sentence:
                replaced_sentences.append(replaced_sentence)

        new_file_path = os.path.join(new_folder_path, 'Replaced_' + os.path.basename(file_path)) 
        with open(new_file_path, 'w', encoding='utf-8') as new_file:
            new_file.write('\n'.join(replaced_sentences))

new_folder_path = 'Val_Textfiles/val_corpus/switched_corpus_og'
folder_path = 'Val_Textfiles/val_corpus/corpus_og_onlyfirstfile'

for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)
    if os.path.isfile(file_path):
        process_file(file_path, thesaurus_data)

Use first files in each corpus(Replaced) for calculating sentence similarity score

In [8]:
def get_sentences_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
        return [line.strip() for line in lines if line.strip()]

def calculate_and_save_similarity(file_path_a, file_path_b, output_file_path):
    sentences_a = get_sentences_from_file(file_path_a)
    sentences_b = get_sentences_from_file(file_path_b)

    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        total_similarities = len(sentences_a) * len(sentences_b)
        pbar = tqdm(total=total_similarities, desc="Calculating Similarities")

        for sentence_a in sentences_a:
            for sentence_b in sentences_b:
                embeddings_a = model.encode(sentence_a, convert_to_tensor=True)
                embeddings_b = model.encode(sentence_b, convert_to_tensor=True)

                similarity_score = util.pytorch_cos_sim(embeddings_a, embeddings_b).item()

                output_file.write(f"Similarity between '{sentence_a}' and '{sentence_b}': {similarity_score:.4f}\n")
                pbar.update(1)

        pbar.close()

file_path_a = 'Val_Textfiles/val_corpus/switched_corpus_only_definition/Replaced_Australia_Northern Territory_2017_containing_definition.txt'
file_path_b = 'Val_Textfiles/val_corpus/switched_corpus_og/Replaced_Australia_Northern Territory_2017.txt'
output_file_path = '1_evaluation_with_S-BERT_sentences.txt'

calculate_and_save_similarity(file_path_a, file_path_b, output_file_path)

Calculating Similarities: 100%|██████████| 80283/80283 [21:26<00:00, 62.41it/s]


Comparison of similarity between the original corpus and the word-substituted corpus

In [9]:
file_path_a = '0_evaluation_with_S-BERT_sentences.txt'
file_path_b = '1_evaluation_with_S-BERT_sentences.txt'
output_merged_file_path = 'output_merged_lines.txt'

with open(file_path_a, 'r', encoding='utf-8') as file_a, \
     open(file_path_b, 'r', encoding='utf-8') as file_b, \
     open(output_merged_file_path, 'w', encoding='utf-8') as output_file:
    
    lines_a = file_a.readlines()
    lines_b = file_b.readlines()

    for line_a, line_b in zip(lines_a, lines_b):
        output_file.write(f"{line_a.strip()}\n{line_b.strip()}\n")


In [16]:
def filter_lines_by_similarity(lines):
    i = 0
    while i < len(lines):
        line_a = lines[i].strip()
        line_b = lines[i + 1].strip()

        similarity_score = float(line_b[-6:])

        if similarity_score < 0.5:
            del lines[i:i + 2]
        else:
            i += 2  

    return lines

output_merged_file_path = 'output_merged_lines.txt'

with open(output_merged_file_path, 'r', encoding='utf-8') as output_file:
    merged_lines = output_file.readlines()

    filtered_lines = filter_lines_by_similarity(merged_lines)

with open(output_merged_file_path, 'w', encoding='utf-8') as output_file:
    output_file.writelines(filtered_lines)


In [None]:
def compare_and_remove_lines(lines):
    i = 0
    while i < len(lines) - 1:  # Ensure there is a pair of lines to compare
        line_a = lines[i].strip()
        line_b = lines[i + 1].strip()

        similarity_a = float(line_a[-6:])
        similarity_b = float(line_b[-6:])

        if similarity_b < similarity_a:
            # Remove the pair of lines
            del lines[i:i + 2]
        else:
            i += 2  # Move to the next pair of lines

    return lines

# Example usage:
output_merged_file_path = 'output_merged_lines_th0.5.txt'

with open(output_merged_file_path, 'r', encoding='utf-8') as output_file:
    merged_lines = output_file.readlines()

    # Filter and remove lines based on similarity
    filtered_lines = filter_lines_by_similarity(merged_lines)

    # Compare and remove lines based on the new criteria
    final_filtered_lines = compare_and_remove_lines(filtered_lines)

# Write the final filtered lines back to the merged file
with open(output_merged_file_path, 'w', encoding='utf-8') as output_file:
    output_file.writelines(final_filtered_lines)


Extract sentences containing the word 'pavement' to create new corpus files

In [None]:
def extract_sentences_with_word(file_path, target_word):
    sentences_with_word = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            tokens = line.strip().split()  
            sentence = ' '.join(tokens) 
            if target_word in sentence:
                sentences_with_word.append(sentence)
    return sentences_with_word

def create_new_files(input_folder, output_folder, target_word):
    files = glob.glob(os.path.join(input_folder, '*.txt'))

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for file_path in files:
        sentences_with_word = extract_sentences_with_word(file_path, target_word)
        
        if sentences_with_word:
            new_file_path = os.path.join(output_folder, os.path.basename(file_path)[:-4] + f'_containing_{target_word}.txt')
            
            with open(new_file_path, 'w', encoding='utf-8') as new_file:
                for sentence in sentences_with_word:
                    new_file.write(sentence + '\n')

if __name__ == "__main__":
    input_corpus_folder = "Corpus_Thesaurus_textfiles/corpus_number_removed"
    output_corpus_folder = "Val_Textfiles/val_corpus/corpus_containing_pavement"
    target_word = "pavement"  

    create_new_files(input_corpus_folder, output_corpus_folder, target_word)


In [None]:
stop_words = set(stopwords.words('english'))

synonyms = {}

corpus_folder_path = 'Val_Textfiles/val_corpus/corpus_containing_pavement'

corpus_sentences = []

for file_name in os.listdir(corpus_folder_path):
    if file_name.endswith('.txt'):
        file_path = os.path.join(corpus_folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            sentences = nltk.sent_tokenize(text) 
            for sentence in sentences:
                words = nltk.word_tokenize(sentence.lower())
                words = [word for word in words if word not in stop_words] 
                corpus_sentences.append(words)

model = gensim.models.Word2Vec(corpus_sentences, vector_size=200, window=10, 
                               min_count=5, workers=5, sg=1, epochs=300) 
                                #corpus_sentences, vector_size=200, window=5, min_count=3, workers=4, sg=1, epochs=500
                                #corpus_sentences, vector_size=300, window=5, min_count=5, workers=4, sg=1, epochs=500
epochs = 500
for epoch in tqdm(range(epochs), desc="Training Progress"):
    model.train(corpus_sentences, total_examples=model.corpus_count, epochs=1)

for word in model.wv.index_to_key:
    similar_words = model.wv.most_similar(word, topn=10)
    synonyms[word] = [(similar_word[0], similar_word[1]) for similar_word in similar_words]

output_file = 'd_r_thesaurus_containing_pavement_300_5_500.txt'
with open(output_file, 'w', encoding='utf-8') as file:
    for word in synonyms:
        similar_words = synonyms[word]
        line = f'{word}: ' + ', '.join([f'{similar_word[0]}({similar_word[1]:.2f})' for similar_word in similar_words])
        file.write(line + '\n')

In [None]:
embedding_thesaurus_file = 'Val_Textfiles/val_thesaurus/d_r_thesaurus_containing_pavement_500.txt'

new_embedding_thesaurus_file = 'd_r_thesaurus_containing_pavement_500_0.5.txt'
selected_entries = [] 

with open(embedding_thesaurus_file, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        word, synonyms_str = line.split(': ', 1)
        synonyms_list = synonyms_str.split(', ')
        selected_synonyms = []
        for synonym in synonyms_list:
            synonym_word, similarity = synonym.split('(')
            similarity = float(similarity[:-1])
            if similarity >= 0.5:
                selected_synonyms.append(f"{synonym_word}({similarity})")
        if selected_synonyms:
            selected_entries.append(f"{word}: {', '.join(selected_synonyms)}")

with open(new_embedding_thesaurus_file, 'w', encoding='utf-8') as file:
    for entry in selected_entries:
        file.write(entry + '\n')

In [None]:
thesaurus_file = 'd_r_thesaurus_containing_pavement_500_0.5.txt'

G = nx.Graph()

with open(thesaurus_file, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        word, synonyms_str = line.split(': ', 1)
        synonyms_list = synonyms_str.split(', ')
        G.add_node(word)  
        for synonym in synonyms_list:
            synonym_word, similarity = synonym.split('(')
            similarity = float(similarity[:-1])
            G.add_edge(word, synonym_word, weight=similarity)  

layout = nx.spring_layout(G, seed=42)

nt = Network(notebook=True, height='800px', width='100%')

for node in G.nodes():
    nt.add_node(node, label=node, title=node)

for edge in G.edges():
    source, target, weight = edge[0], edge[1], G.edges[edge]['weight']
    nt.add_edge(source, target, value=weight, title=f'Weight: {weight}')

output_file = "d_r_thesaurus_500_0.5_network.html"

nt.show(output_file)

Extract "definition of pavement" in corpus

In [None]:
import os

input_folder = 'Val_Textfiles/val_corpus/corpus_containing_pavement'
output_folder = 'Val_Textfiles/val_corpus/definition_of_pavement_in_corpus'

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

def extract_definition_sentences(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        sentences = content.split('.')
        
        definition_sentences = []
        for i, sentence in enumerate(sentences):
            if 'definition' in sentence.lower():
                start_idx = max(0, i - 2)
                end_idx = min(len(sentences), i + 3)
                extracted_sentences = sentences[start_idx:end_idx]
                definition_sentences.append(' '.join(extracted_sentences).strip())
                
        return definition_sentences

for file_name in os.listdir(input_folder):
    if file_name.endswith('.txt'):
        file_path = os.path.join(input_folder, file_name)
        definition_sentences = extract_definition_sentences(file_path)
        output_file_path = os.path.join(output_folder, f'{file_name}_definition.txt')
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            for sentence in definition_sentences:
                output_file.write(sentence + '\n')


In [None]:
def get_sentences_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read().lower()  
        sentences = text.split('.')  
        return [sentence.strip() for sentence in sentences if sentence.strip()]

folder_path = 'Val_Textfiles/val_corpus/definition_of_pavement_in_corpus'

file_paths = [os.path.join(folder_path, file_name) for file_name in os.listdir(folder_path) if file_name.endswith('.txt')]

sentences_per_file = {file_path: get_sentences_from_file(file_path) for file_path in file_paths}

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

for file1, sentences1 in sentences_per_file.items():
    for file2, sentences2 in sentences_per_file.items():
        if file1 != file2: 
            embeddings1 = model.encode(sentences1, convert_to_tensor=True)
            embeddings2 = model.encode(sentences2, convert_to_tensor=True)

            file1_name = os.path.basename(file1)
            file2_name = os.path.basename(file2)

            similarity_matrix = util.pytorch_cos_sim(embeddings1, embeddings2)
            average_similarity = similarity_matrix.mean().item()

            print(f"Similarity between {file1_name} and {file2_name}: {average_similarity:.4f}")


In [None]:
thesaurus_path = 'Val_Textfiles/val_thesaurus/d_r_thesaurus_containing_pavement_300_10.txt'
with open(thesaurus_path, 'r', encoding='utf-8') as thesaurus_file:
    thesaurus_data = {}
    for line in thesaurus_file:
        parts = line.split(':')
        base_word = parts[0].strip()
        synonyms_with_similarity = [syn.strip().split('(') for syn in parts[1].split(',')]
        synonyms = [(syn[0], float(syn[1].replace(')', ''))) for syn in synonyms_with_similarity]
        thesaurus_data[base_word] = synonyms

def replace_words_with_thesaurus(word, thesaurus, similarity_threshold=0.7): #수정
    best_synonym = word
    best_similarity = similarity_threshold 

    for synonym, similarity in thesaurus:
        if similarity > best_similarity:
            best_similarity = similarity
            best_synonym = synonym

    return best_synonym

def process_file(file_path, thesaurus):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read().lower()
        sentences = text.split('.')
        replaced_sentences = []

        for sentence in sentences:
            replaced_words = [replace_words_with_thesaurus(word, thesaurus.get(word, [(word, 1.0)])) for word in sentence.split()]
            replaced_sentence = ' '.join(replaced_words)
            replaced_sentences.append(replaced_sentence)

    new_file_path = os.path.join(new_folder_path, '0.7_' + os.path.basename(file_path)) #수정
    with open(new_file_path, 'w', encoding='utf-8') as new_file:
        new_file.write('. '.join(replaced_sentences))

new_folder_path = 'Val_Textfiles/val_corpus/Switched_definition_of_pavement_in_corpus'
folder_path = 'Val_Textfiles/val_corpus/definition_of_pavement_in_corpus'

for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)
    if os.path.isfile(file_path):
        process_file(file_path, thesaurus_data)


In [None]:
def get_sentences_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read().lower()  
        sentences = text.split('.')  
        return [sentence.strip() for sentence in sentences if sentence.strip()]

folder_path = 'Val_Textfiles/val_corpus/Switched_definition_of_pavement_in_corpus'

file_paths = [os.path.join(folder_path, file_name) for file_name in os.listdir(folder_path) if file_name.endswith('.txt')]

sentences_per_file = {file_path: get_sentences_from_file(file_path) for file_path in file_paths}

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

for file1, sentences1 in sentences_per_file.items():
    for file2, sentences2 in sentences_per_file.items():
        if file1 != file2: 
            embeddings1 = model.encode(sentences1, convert_to_tensor=True)
            embeddings2 = model.encode(sentences2, convert_to_tensor=True)

            file1_name = os.path.basename(file1)
            file2_name = os.path.basename(file2)

            similarity_matrix = util.pytorch_cos_sim(embeddings1, embeddings2)
            average_similarity = similarity_matrix.mean().item()

            print(f"Similarity between {file1_name} and {file2_name}: {average_similarity:.4f}")
