In [2]:
import os
import json
import spacy
import csv
import numpy as np
import re
from datetime import datetime


# Load the fr_core_news_lg spaCy pipeline once at module level;
# tokenize_and_lemmatize runs every text through this shared instance.
nlp = spacy.load('fr_core_news_lg')

# Source identifiers iterated by the driver loop at the bottom of the file;
# each one is used as a sub-directory name under ./data/non_covid and ./data/tmp.
relevant_sources = ['lesoir.be', 'lalibre.be', 'rtbf.be', 'rtlinfo.be', 'dhnet.be', 'sudinfo.be', 'lavenir.net', 'lecho.be', 'levif.be']


# Maps each whitespace control character that could break the
# one-article-per-line output format to a plain space.
_FLATTEN_WHITESPACE = str.maketrans({char: " " for char in "\n\r\t\f\v"})


def _article_line(json_data):
    """Return the output line (without trailing newline) for one JSON record."""
    title = json_data.get("title")
    text = json_data.get("text")
    if not title:
        # Records without a title yield an empty line, even if they have text.
        return ""
    title = title.translate(_FLATTEN_WHITESPACE)
    if text:
        return title + ". " + text.translate(_FLATTEN_WHITESPACE)
    return title + "."


def extract_texts(source_path, tmp_path, start_date=None, end_date=None):
    """Flatten JSON-lines article dumps into one-article-per-line text files.

    Every ``*.json`` file in ``source_path`` whose date falls inside the
    requested window is converted to ``texts_<basename>.txt`` in ``tmp_path``.
    Each input line is parsed as a JSON object and produces exactly one output
    line: ``"<title>. <text>"``, ``"<title>."`` when there is no text, or an
    empty line when there is no title. Newlines, carriage returns, tabs, form
    feeds and vertical tabs inside the title or text are replaced by spaces.
    An empty (or unreadable-as-existing) input file yields a single empty line.

    Existing output files are never regenerated. Output is written to a
    ``.part`` file and renamed only once the whole input has been converted,
    so a failure cannot leave a truncated file that later runs would skip.

    Args:
        source_path: Directory containing the ``*.json`` inputs. The date is
            read from the second ``_``-separated field of the file name (up to
            its first ``.``) and must be formatted ``YYYYMMDD``.
        tmp_path: Output directory; created if missing.
        start_date: Optional ``date``; files dated before it are skipped
            (inclusive lower bound).
        end_date: Optional ``date``; files dated on or after it are skipped
            (exclusive upper bound).

    Raises:
        IndexError: A ``.json`` file name contains no ``_``.
        ValueError: The date field is not ``YYYYMMDD``, or an input line is
            not valid JSON (``json.JSONDecodeError``).
    """
    os.makedirs(tmp_path, exist_ok=True)

    for filename in os.listdir(source_path):
        if not filename.endswith(".json"):
            continue

        date_str = filename.split('_')[1].split('.')[0]
        file_date = datetime.strptime(date_str, '%Y%m%d').date()
        if start_date and file_date < start_date:
            continue
        if end_date and file_date >= end_date:
            continue

        output_filename = os.path.splitext(filename)[0]
        tmp_file_path = os.path.join(tmp_path, f"texts_{output_filename}.txt")
        if os.path.exists(tmp_file_path):
            # Already extracted by a previous run.
            continue

        json_file_path = os.path.join(source_path, filename)
        # Write to a side file first: the final name only appears once the
        # whole input was converted, keeping the skip-if-exists check honest.
        partial_path = tmp_file_path + ".part"
        with open(partial_path, "w", encoding="utf-8") as tmp_file:
            if os.path.exists(json_file_path) and os.path.getsize(json_file_path) > 0:
                with open(json_file_path, "r", encoding="utf-8") as file:
                    for line in file:
                        tmp_file.write(_article_line(json.loads(line)) + "\n")
            else:
                tmp_file.write("\n")
        os.replace(partial_path, tmp_file_path)


def tokenize_and_lemmatize(text):
    """Return the lower-cased lemmas of ``text``, leaving out punctuation.

    The text is run through the module-level ``nlp`` pipeline; every token
    that is not flagged as punctuation contributes its lemma, lower-cased,
    in document order.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_punct:
            continue
        lemmas.append(token.lemma_.lower())
    return lemmas

def preprocess_texts(input_folder, output_folder):
    """Lemmatise every ``*.txt`` file of ``input_folder`` into ``output_folder``.

    Each input line is stripped, passed to ``tokenize_and_lemmatize`` and
    written as its tokens joined by single spaces, so output line *i* always
    corresponds to input line *i* (a line with no tokens becomes an empty
    line). An empty input file produces a file holding one empty line.

    Files whose output already exists are skipped. Output is written to a
    ``.part`` file and renamed only after the whole input was processed, so an
    interrupted or failed run cannot leave an incomplete file that later runs
    would mistake for a finished one.

    Args:
        input_folder: Directory holding the extracted text files.
        output_folder: Destination directory; created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if not filename.endswith(".txt"):
            continue

        input_file_path = os.path.join(input_folder, filename)
        output_file_path = os.path.join(output_folder, filename)
        if os.path.exists(output_file_path):
            # Already processed by a previous run.
            continue

        partial_path = output_file_path + ".part"
        with open(input_file_path, "r", encoding="utf-8") as input_file, \
                open(partial_path, "w", encoding="utf-8") as output_file:
            wrote_line = False
            for line in input_file:
                tokens = tokenize_and_lemmatize(line.strip())
                output_file.write(" ".join(tokens) + "\n")
                wrote_line = True
            if not wrote_line:
                # Empty input: keep the convention of a single empty line.
                output_file.write("\n")
        # Publish under the final name only once the file is complete.
        os.replace(partial_path, output_file_path)


def load_dictionary(emotion_dict_path):
    """Load the ``;``-separated emotion lexicon and its matching patterns.

    The CSV must provide the columns ``word``, ``positive``, ``negative``,
    ``joy``, ``fear``, ``sadness``, ``anger``, ``surprise`` and ``disgust``;
    all but ``word`` are parsed as integers.

    Args:
        emotion_dict_path: Path of the UTF-8 encoded CSV lexicon.

    Returns:
        A pair ``(emotion_dict, regex_patterns)``:

        * ``emotion_dict`` maps each word to ``{'polarity': array([positive,
          negative]), 'emotions': array([joy, fear, sadness, anger, surprise,
          disgust])}``. When a word appears on several rows, its vectors are
          merged with an element-wise maximum.
        * ``regex_patterns`` maps each word to a compiled, case-insensitive
          pattern matching the escaped word between word boundaries.
    """
    polarity_columns = ('positive', 'negative')
    emotion_columns = ('joy', 'fear', 'sadness', 'anger', 'surprise', 'disgust')

    emotion_dict = {}
    regex_patterns = {}

    with open(emotion_dict_path, newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile, delimiter=';'):
            word = row['word']
            polarity = np.array([int(row[column]) for column in polarity_columns])
            emotions = np.array([int(row[column]) for column in emotion_columns])

            # Whole-word, case-insensitive match of the literal lexicon entry.
            regex_patterns[word] = re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE)

            entry = emotion_dict.get(word)
            if entry is None:
                emotion_dict[word] = {'polarity': polarity, 'emotions': emotions}
            else:
                # Duplicate row: keep the strongest value seen per component.
                entry['polarity'] = np.maximum(entry['polarity'], polarity)
                entry['emotions'] = np.maximum(entry['emotions'], emotions)

    return emotion_dict, regex_patterns


class Feel:
    """Lexicon-based polarity and emotion scorer for a single text.

    Attributes are plain and may be reassigned by callers before scoring:
    ``text`` (lower-cased copy of the input that the patterns are matched
    against), ``num_words`` (whitespace-separated word count of the input as
    given), and the result holders ``polarity`` (2 values: positive,
    negative), ``emotion`` (6 values, in the order of the lexicon's emotion
    vectors) and ``occurrences`` (lexicon entry -> match count).
    """

    def __init__(self, text, emotion_dict, regex_patterns):
        """Store the text and lexicon; no scoring happens here.

        Args:
            text: The text to score.
            emotion_dict: Maps each lexicon entry to a dict with a
                ``'polarity'`` vector (length 2) and an ``'emotions'`` vector
                (length 6).
            regex_patterns: Maps each lexicon entry to the compiled pattern
                used to count its occurrences; must contain every key of
                ``emotion_dict``.
        """
        self.text = text.lower()
        self.polarity = np.zeros(2)
        self.emotion = np.zeros(6)
        self.occurrences = {}
        self.emotion_dict = emotion_dict
        self.regex_patterns = regex_patterns
        self.num_words = len(text.split())

    def get_sentiment(self):
        """Score ``self.text`` against the lexicon.

        Every lexicon entry is counted with its pattern; each match adds the
        entry's polarity and emotion vectors to the totals, which are then
        divided by ``self.num_words`` (left as raw sums when the word count
        is zero).

        The scores are computed from scratch on every call, so calling the
        method repeatedly returns the same values instead of accumulating
        on top of the previous (already normalised) result.

        Returns:
            ``(polarity, emotion, occurrences)``; the same objects are also
            stored on the instance.

        Raises:
            KeyError: A lexicon entry has no pattern in ``regex_patterns``.
        """
        text = self.text
        polarity = np.zeros(2)
        emotion = np.zeros(6)
        occurrences = {}

        for emotion_token, data in self.emotion_dict.items():
            count = len(self.regex_patterns[emotion_token].findall(text))
            if count == 0:
                continue
            occurrences[emotion_token] = count
            polarity += count * data['polarity']
            emotion += count * data['emotions']

        if self.num_words != 0:
            polarity /= self.num_words
            emotion /= self.num_words

        self.polarity = polarity
        self.emotion = emotion
        self.occurrences = occurrences
        return self.polarity, self.emotion, self.occurrences
    

def process_texts(input_folder, output_folder, emotion_dict, regex_patterns):
    """Score every ``*.txt`` file of ``input_folder`` line by line.

    Each input line is stripped and scored with ``Feel``. A line with at
    least one lexicon match is written as one JSON object with the keys
    ``polarity`` (``positive``, ``negative``), ``emotions`` (``joy``,
    ``fear``, ``sadness``, ``anger``, ``surprise``, ``disgust``) and
    ``occurrences``; a line without matches is written as an empty line, so
    output line *i* always corresponds to input line *i*. An empty input file
    produces a file holding one empty line. The name of each processed file
    is printed once it is done.

    Files whose output already exists are skipped. Output is written to a
    ``.part`` file and renamed only after the whole input was scored, so an
    interrupted or failed run cannot leave an incomplete file that later runs
    would mistake for a finished one.

    Args:
        input_folder: Directory holding the preprocessed text files.
        output_folder: Destination directory; created if missing.
        emotion_dict: Lexicon passed through to ``Feel``.
        regex_patterns: Compiled patterns passed through to ``Feel``.
    """
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if not filename.endswith(".txt"):
            continue

        input_file_path = os.path.join(input_folder, filename)
        output_file_path = os.path.join(output_folder, filename)
        if os.path.exists(output_file_path):
            # Already scored by a previous run.
            continue

        partial_path = output_file_path + ".part"
        with open(input_file_path, "r", encoding="utf-8") as input_file, \
                open(partial_path, "w", encoding="utf-8") as output_file:
            wrote_line = False
            for line in input_file:
                text = line.strip()
                feel = Feel(text, emotion_dict, regex_patterns)
                # Replace the lower-cased copy made by Feel.__init__ so that
                # matching runs on the line exactly as read.
                feel.text = text
                polarities, emotions, occurrences = feel.get_sentiment()

                if occurrences:
                    output_data = {
                        'polarity': {
                            'positive': polarities[0],
                            'negative': polarities[1]
                        },
                        'emotions': {
                            'joy': emotions[0],
                            'fear': emotions[1],
                            'sadness': emotions[2],
                            'anger': emotions[3],
                            'surprise': emotions[4],
                            'disgust': emotions[5]
                        },
                        'occurrences': occurrences
                    }
                    output_file.write(json.dumps(output_data, ensure_ascii=False) + "\n")
                else:
                    output_file.write("\n")
                wrote_line = True

            if not wrote_line:
                # Empty input: keep the convention of a single empty line.
                output_file.write("\n")
        # Publish under the final name only once the file is complete.
        os.replace(partial_path, output_file_path)

        print(f"{filename}")


start_date = datetime(2020, 6, 1).date()  # Start date (inclusive)
end_date = datetime(2021, 6, 30).date()   # End date (exclusive)

# Load the spaCy-lemmatised emotion lexicon once: it does not depend on the
# source, so reading the CSV and compiling its patterns per source is wasted work.
emotion_dict_path = "./lexique/emotion_dictionary_final.csv"
emotion_dict_final, regex_patterns = load_dictionary(emotion_dict_path)

for source in relevant_sources:
    print("===================")
    print(f"{source}")
    print("===================")

    # Extract titles and texts from the JSON files
    json_dir = f"./data/non_covid/{source}"
    tmp_dir = f"./data/tmp/{source}/spacy/non_covid/texts"
    extract_texts(json_dir, tmp_dir, start_date, end_date)
    print("Extraction OK")

    # Preprocess the texts with spaCy
    clean_dir = f"./data/tmp/{source}/spacy/non_covid/texts_clean"
    preprocess_texts(tmp_dir, clean_dir)
    print("Preprocessing OK")

    # Score the texts to obtain the polarity and emotion vectors
    emotions_dir = f"./data/tmp/{source}/spacy/non_covid/texts_emotions_len"
    process_texts(clean_dir, emotions_dir, emotion_dict_final, regex_patterns)
    print("Sentiment Analysis OK\n")


lesoir.be
Extraction OK
Preprocessing OK
Sentiment Analysis OK

lalibre.be
Extraction OK
Preprocessing OK
Sentiment Analysis OK

rtbf.be
Extraction OK
Preprocessing OK
Sentiment Analysis OK

rtlinfo.be
Extraction OK
Preprocessing OK
Sentiment Analysis OK

dhnet.be
Extraction OK
Preprocessing OK
Sentiment Analysis OK

sudinfo.be
Extraction OK
Preprocessing OK
Sentiment Analysis OK

lavenir.net
Extraction OK
Preprocessing OK
Sentiment Analysis OK

lecho.be
Extraction OK
Preprocessing OK
Sentiment Analysis OK

levif.be
Extraction OK
Preprocessing OK
Sentiment Analysis OK

