In [1]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

class TextPreprocessor:
    def preprocess_text(self, tokens, tokenize=True, normalize=True, remove_stopwords=True, stem=True):
        if tokenize:
            tokens = word_tokenize(tokens)
        else:
            tokens = re.findall(r'\b\w+\b', tokens)

        if normalize:
            tokens = [word.lower() for word in tokens if word.isalpha()]

        if remove_stopwords:
            stop_words = set(stopwords.words('english'))
            tokens = [word for word in tokens if word not in stop_words]

        if stem:
            stemmer = PorterStemmer()
            tokens = [stemmer.stem(word) for word in tokens]

        return tokens

# Charger les données depuis 'DATA.txt'
with open('C:\\Users\\poste\\Desktop\\build\\DATA-OF-ALTERNATIVE-MEDICINE-1.txt', 'r', encoding='utf-8') as file:
    data = file.read()

# Initialiser le préprocesseur de texte
preprocessor = TextPreprocessor()

# Diviser les documents à partir du mot-clé "D1:"
documents = re.split(r'(D[0-9]+:)', data)[1:]

# Liste pour stocker les résultats traités
processed_documents = []

# Traitement de chaque document individuellement
for i in range(0, len(documents), 2):
    document_name = documents[i]
    document_content = documents[i + 1]
    
    processed_document = preprocessor.preprocess_text(document_content)
    processed_documents.append((document_name, processed_document))

# Enregistrement des résultats dans un nouveau fichier texte
output_file_path = 'C:\\Users\\poste\\Desktop\\build\\processed_documents.txt'
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for document_name, processed_document in processed_documents:
        output_file.write(f"{document_name}\n")
        output_file.write(" ".join(processed_document))
        output_file.write("\n" + "="*50 + "\n")

print(f"Les résultats ont été enregistrés dans le fichier : {output_file_path}")



Les résultats ont été enregistrés dans le fichier : C:\Users\poste\Desktop\build\processed_documents.txt


In [2]:
from collections import defaultdict
import re
import json

# Function to preprocess a document (tokenization, lowercasing, etc.) and store word positions
def preprocess_with_positions(text):
    text = text.lower()  # Convert text to lowercase
    words = re.findall(r'\b\w+\b', text)  # Tokenize the text into words
    word_positions = defaultdict(list)
    for idx, word in enumerate(words):
        word_positions[word].append(idx)
    return word_positions

# Function to create inverted index from documents with word positions
def create_inverted_index_with_positions(documents):
    inverted_index = defaultdict(list)
    for doc_id, doc_text in documents.items():
        word_positions = preprocess_with_positions(doc_text)
        for word, positions in word_positions.items():
            inverted_index[word].append([doc_id[1:], positions])  # Exclude the first character 'D'
    return inverted_index

# Read the contents of the text file with preprocessed documents
file_path = 'C:\\Users\\poste\\Desktop\\build\\processed_documents.txt'  # Replace with your file path
documents = {}

# Read and parse the documents
with open(file_path, 'r') as file:
    lines = file.readlines()
    doc_id = None
    doc_text = ""
    for line in lines:
        line = line.strip()
        if line.startswith("D"):
            if doc_id:
                documents[doc_id] = doc_text
            doc_id = line
            doc_text = ""
        elif line.startswith("="):
            continue
        else:
            doc_text += line

    # Add the last document
    if doc_id:
        documents[doc_id] = doc_text

# Create inverted index with word positions
inverted_index_positions = create_inverted_index_with_positions(documents)

# Save the formatted inverted index to a JSON file
output_file_path = 'C:\\Users\\poste\\Desktop\\build\\inverted-index.json'
with open(output_file_path, 'w') as output_file:
    json.dump(inverted_index_positions, output_file, indent=None)

print(f"Inverted index saved to {output_file_path}")


Inverted index saved to C:\Users\poste\Desktop\build\inverted-index.json


In [10]:
import math

# Function to calculate TF-IDF for each word across all documents
def calculate_tfidf_for_words(inverted_index, total_docs, documents):
    tfidf = {}
    for term, doc_positions in inverted_index.items():
        doc_freq = len(doc_positions)
        idf = math.log(total_docs / doc_freq) if doc_freq > 0 else 0

        for doc_info in doc_positions:
            doc_id, positions = doc_info[0], doc_info[1]

            # Remove the colon from the document ID
            doc_id = doc_id[:-1]

            # Check if the document ID exists in the documents list
            if doc_id in documents:
                tf = len(positions) / len(documents[doc_id])
                tfidf.setdefault(term, {})[doc_id] = tf * idf
            
    return tfidf

# Example usage:
# Use the inverted_index_positions created previously

# Assuming documents is a dictionary
# Calculate TF-IDF for each word
total_documents = len(documents)
tfidf_scores_for_words = calculate_tfidf_for_words(inverted_index_positions, total_documents, documents)

# Save the TF-IDF scores to a text file in the desired format
tfidf_output_file_path = 'C:\\Users\\poste\\Desktop\\build\\tfidf-result.txt'
with open(tfidf_output_file_path, 'w') as tfidf_output_file:
    for term, doc_scores in tfidf_scores_for_words.items():
        tfidf_output_file.write(f"{term}:\n")
        for doc_id, score in doc_scores.items():
            tfidf_output_file.write(f"{doc_id}: {score}\n")

print(f"TF-IDF scores saved to {tfidf_output_file_path}")


TF-IDF scores saved to C:\Users\poste\Desktop\build\tfidf-result.txt


In [6]:
import numpy as np

# Function to calculate weights for each word using TF-IDF scores
def calculate_weights(tfidf_scores_for_words):
    word_weights = {}
    for word, doc_scores in tfidf_scores_for_words.items():
        scores_array = np.array(list(doc_scores.values()))
        normalized_scores = scores_array / np.linalg.norm(scores_array)
        word_weights[word] = normalized_scores
    return word_weights

# Example usage:
# Use the tfidf_scores_for_words dictionary calculated previously

# Calculate weights for each word
word_weights = calculate_weights(tfidf_scores_for_words)

# Display word weights
for word, weights in word_weights.items():
    print(f'{word}: {weights}')

# Save the word weights to a text file
weights_output_file_path = 'C:\\Users\\poste\\Desktop\\build\\weight-result.txt'
with open(weights_output_file_path, 'w') as weights_output_file:
    for word, weights in word_weights.items():
        weights_output_file.write(f'{word}: {weights}\n')

print(f"Word weights saved to {weights_output_file_path}")

scale: [1.]
human: [0.59071647 0.33712227 0.7330775 ]
immunodefici: [0.86851535 0.49566227]
viru: [0.59071647 0.33712227 0.7330775 ]
hiv: [0.46599092 0.47231659 0.26594131 0.22138275 0.15997236 0.57829344
 0.28287796]
epidem: [0.90324966 0.42911543]
exceed: [0.8830479  0.46928287]
expect: [0.86851535 0.49566227]
sinc: [1.]
identif: [1.]
year: [0.86851535 0.49566227]
ago: [1.]
global: [0.90324966 0.42911543]
estim: [1.]
million: [0.70232381 0.71185762]
peopl: [0.34566047 0.35035269 0.19726865 0.11866352 0.58880567 0.18369619
 0.54719396 0.15785514]
current: [0.70232381 0.71185762]
live: [0.32659268 0.33102607 0.11211764 0.40530062 0.55632518 0.1735629
 0.51700891]
alreadi: [0.70232381 0.71185762]
die: [1.]
worst: [1.]
center: [1.]
africa: [0.47671512 0.48318636 0.25334327 0.54621067 0.42041018]
spread: [0.59071647 0.33712227 0.7330775 ]
greater: [1.]
predict: [1.]
impact: [0.87013226 0.49281827]
social: [1.]
capit: [1.]
popul: [0.90324966 0.42911543]
structur: [1.]
econom: [1.]
growth: 

In [16]:
# Display word weights in a clearer format with document numbers
for word, weights in word_weights.items():
    print(f'{word}:')
    doc_scores = tfidf_scores_for_words[word]
    sorted_docs = sorted(doc_scores.items(), key=lambda x: x[0])
    for doc_id, weight in sorted_docs:
        print(f'    Document {doc_id}: {weight:.4f}')
    print()


scale:
    Document D1:: 0.0075

human:
    Document D1:: 0.0051
    Document D3:: 0.0029
    Document D6:: 0.0063

immunodefici:
    Document D1:: 0.0060
    Document D3:: 0.0034

viru:
    Document D1:: 0.0051
    Document D3:: 0.0029
    Document D6:: 0.0063

hiv:
    Document D1:: 0.0032
    Document D2:: 0.0032
    Document D3:: 0.0018
    Document D4:: 0.0015
    Document D5:: 0.0011
    Document D6:: 0.0039
    Document D7:: 0.0019

epidem:
    Document D1:: 0.0060
    Document D4:: 0.0028

exceed:
    Document D1:: 0.0060
    Document D9:: 0.0032

expect:
    Document D1:: 0.0060
    Document D3:: 0.0034

sinc:
    Document D1:: 0.0075

identif:
    Document D1:: 0.0075

year:
    Document D1:: 0.0060
    Document D3:: 0.0034

ago:
    Document D1:: 0.0075

global:
    Document D1:: 0.0060
    Document D4:: 0.0028

estim:
    Document D1:: 0.0075

million:
    Document D1:: 0.0060
    Document D2:: 0.0061

peopl:
    Document D11:: 0.0046
    Document D1:: 0.0029
    Document D