In [None]:
# Reuse the functions we wrote in TP2

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import glob
import os

# Fetch the NLTK resources used below: the stopword lists and the Punkt
# tokenizer data. Newer NLTK releases load the tokenizer from 'punkt_tab'
# instead of 'punkt', so both are requested; nltk.download() reports a
# resource it cannot fetch without raising.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

def read_corpus(file_path):
    """Read a text file and return its content normalised for tokenisation.

    The text is lower-cased, every character that is not a word character,
    whitespace or an apostrophe is replaced by a space, and each run of
    whitespace is collapsed into a single space.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The normalised content as a single string.
    """
    # Decode explicitly instead of relying on the platform default encoding,
    # which would garble or reject non-ASCII text on some systems.
    # NOTE(review): assumes the corpus files are UTF-8 encoded -- confirm.
    with open(file_path, "r", encoding="utf-8") as infile:
        content = infile.read()

    content = content.lower()
    # Keep apostrophes so that elided forms such as "l'été" stay in one piece.
    content = re.sub(r"[^\w\s']", ' ', content)
    return re.sub(r'\s+', ' ', content)

def split_corpus(content, lang):
    """Split ``content`` into tokens with NLTK's word tokenizer.

    Args:
        content: Text to tokenise.
        lang: Language name forwarded to ``nltk.tokenize.word_tokenize``.

    Returns:
        The tokens produced by ``nltk.tokenize.word_tokenize``.
    """
    return nltk.tokenize.word_tokenize(content, lang)

def content_to_dict(splitted_content):
    """Count how many times each token occurs.

    Args:
        splitted_content: Iterable of hashable tokens.

    Returns:
        A dict mapping each distinct token to its number of occurrences,
        with keys in order of first appearance.
    """
    counts = {}
    for token in splitted_content:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    return counts

def content_to_list(content_dict):
    """Return the (word, count) pairs ordered from most to least frequent.

    Pairs with equal counts keep the order they have in ``content_dict``.

    Args:
        content_dict: Mapping from word to occurrence count.

    Returns:
        A new list of ``(word, count)`` tuples sorted by descending count.
    """
    def occurrence_count(entry):
        return entry[1]

    ranked = list(content_dict.items())
    ranked.sort(key=occurrence_count, reverse=True)
    return ranked


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def compute_p_at_20(most_frequent_words, stopwords_list):
    """Compute the precision at 20 of a frequency ranking against stopwords.

    The score is the fraction of the 20 highest-ranked words that appear in
    the stopword list.

    Args:
        most_frequent_words: Words ordered from most to least frequent.
        stopwords_list: Stopwords of the candidate language.

    Returns:
        A float between 0.0 and 1.0. The denominator is always 20, so a
        ranking with fewer than 20 words cannot reach 1.0.
    """
    top_20_frequent = set(most_frequent_words[:20])
    # Compare against the whole stopword list: its order is not a frequency
    # ranking, so truncating it to 20 entries would make the score depend on
    # which stopwords happen to be listed first.
    known_stopwords = set(stopwords_list)
    return len(top_20_frequent & known_stopwords) / 20

def identify_language_for_folder(folder_path):
    """Guess the language of every ``.txt`` file in a folder and print a tally.

    For each file, the most frequent words are scored with
    ``compute_p_at_20`` against the NLTK stopword list of each candidate
    language; the language with the highest score is counted for that file.
    Per-file scores and a per-language count for the folder are printed.

    Args:
        folder_path: Folder containing the ``.txt`` files. Its last path
            component is used as the expected language and is passed to the
            tokenizer.

    Returns:
        None. All results are written to standard output.
    """
    # normpath drops a trailing separator, which would otherwise make
    # basename() return an empty string (e.g. for 'data/dutch/').
    expected_language = os.path.basename(os.path.normpath(folder_path))
    print(f"Processing folder: {folder_path} (expected language: {expected_language})")

    # Sorted so that files are processed and reported in a stable order.
    files = sorted(glob.glob(os.path.join(folder_path, "*.txt")))
    languages = ['english', 'french', 'spanish', 'greek', 'finnish', 'dutch']

    stopwords_count = {lang: 0 for lang in languages}

    if not files:
        # Make an empty or mistyped folder visible instead of silently
        # reporting zero for every language.
        print(f"No .txt files found in {folder_path}")

    # Load each stopword list once rather than once per file.
    stopwords_by_language = (
        {lang: stopwords.words(lang) for lang in languages} if files else {}
    )

    for file_path in files:
        print(f"\nFile: {file_path}")

        content = read_corpus(file_path)
        splitted_content = split_corpus(content, expected_language)
        content_dict = content_to_dict(splitted_content)
        most_frequent_words = [word for word, _ in content_to_list(content_dict)]

        best_p_at_20 = 0
        best_stopwords_language = None

        for stopwords_lang in languages:
            p_at_20 = compute_p_at_20(
                most_frequent_words, stopwords_by_language[stopwords_lang]
            )
            print(f"P@20 for stopwords in {stopwords_lang}: {p_at_20}")

            # Strict comparison: on a tie the language listed first wins,
            # and a score of 0 never selects a language.
            if p_at_20 > best_p_at_20:
                best_p_at_20 = p_at_20
                best_stopwords_language = stopwords_lang

        if best_stopwords_language:
            print(f"Best P@20 score for file {file_path} is with stopwords '{best_stopwords_language}': P@20 = {best_p_at_20}")
            stopwords_count[best_stopwords_language] += 1
        else:
            print(f"No stopword list had a significant P@20 score for file {file_path}")

    print("\nStopwords language count across all files in the folder:")
    for stopwords_lang, count in stopwords_count.items():
        print(f"{stopwords_lang}: {count}")

# Run the language identification on each corpus folder in turn.
for corpus_folder in (
    'data/dutch',
    'data/english',
    'data/finnish',
    'data/french',
    'data/greek',
    'data/spanish',
):
    identify_language_for_folder(corpus_folder)




Processing folder: data/dutch (expected language: dutch)

Stopwords language count across all files in the folder:
english: 0
french: 0
spanish: 0
greek: 0
finnish: 0
dutch: 0
Processing folder: data/english (expected language: english)

Stopwords language count across all files in the folder:
english: 0
french: 0
spanish: 0
greek: 0
finnish: 0
dutch: 0
Processing folder: data/finnish (expected language: finnish)

Stopwords language count across all files in the folder:
english: 0
french: 0
spanish: 0
greek: 0
finnish: 0
dutch: 0
Processing folder: data/french (expected language: french)

Stopwords language count across all files in the folder:
english: 0
french: 0
spanish: 0
greek: 0
finnish: 0
dutch: 0
Processing folder: data/greek (expected language: greek)

Stopwords language count across all files in the folder:
english: 0
french: 0
spanish: 0
greek: 0
finnish: 0
dutch: 0
Processing folder: data/spanish (expected language: spanish)

Stopwords language count across all files in the