In [None]:
import pandas as pd
import pysentiment2 as ps  # <--- Nouvelle librairie
from tqdm import tqdm

# 1. Configuration
FILE_PATH = '../data/headlines.csv'
# Inclusive bounds of the period kept by the date filter below.
START_DATE = '2005-01-01'
END_DATE = '2008-12-31'


def _first_matching_column(columns, keywords):
    """Return the first entry of *columns* whose name contains a keyword.

    The comparison is case-insensitive. Raises ValueError, listing the
    available columns, when nothing matches; a bare ``[...][0]`` lookup
    would only report "list index out of range".
    """
    for name in columns:
        lowered = str(name).lower()
        if any(keyword in lowered for keyword in keywords):
            return name
    raise ValueError(
        f"No column containing any of {keywords} found; "
        f"available columns: {list(columns)}"
    )


# 2. Loading & cleaning
df_large = pd.read_csv(FILE_PATH)

col_text = _first_matching_column(df_large.columns, ('title', 'headline'))
col_date = _first_matching_column(df_large.columns, ('date', 'published'))

# Date conversion: values are parsed as YYYYMMDD; anything that does not parse
# becomes NaT (errors='coerce') and is dropped on the next line.
df_large[col_date] = pd.to_datetime(df_large[col_date].astype(str), format='%Y%m%d', errors='coerce')
df_large = df_large.dropna(subset=[col_date])

# Time filter (both bounds inclusive)
mask = (df_large[col_date] >= START_DATE) & (df_large[col_date] <= END_DATE)
df_filtered = df_large.loc[mask].copy()  # independent copy of the selected rows

print(f"Lignes trouvées : {len(df_filtered)}")

# 3. Loughran-McDonald engine initialisation
lm = ps.LM()  # loads the finance dictionary

def get_lm_score(text):
    """Return the Loughran-McDonald 'Polarity' score of one piece of text.

    *text* is converted with ``str()`` before tokenisation, so any value is
    accepted.
    NOTE(review): a missing value (NaN/None) is therefore tokenised as the
    literal string "nan"/"None" rather than skipped; confirm this is wanted.
    """
    # The LM scorer needs tokens (separate words), not a raw sentence.
    words = lm.tokenize(str(text))
    # get_score returns a dict: {'Positive': x, 'Negative': y, 'Polarity': z, ...}
    # where Polarity = (Pos - Neg) / (Pos + Neg).
    return lm.get_score(words)['Polarity']

# 4. Scoring (skipped when the date filter kept no rows)
if df_filtered.empty:
    print("Aucune donnée trouvée après filtrage.")
else:
    # Registers progress_apply on pandas objects.
    tqdm.pandas()
    print("Calcul du sentiment Loughran-McDonald (Finance Specific)...")

    # Note: a bit slower than VADER because tokenisation is stricter.
    df_filtered['sentiment_score'] = df_filtered[col_text].progress_apply(get_lm_score)

    # 5. Aggregation: mean score and number of non-null scores per calendar day
    df_filtered['date_only'] = df_filtered[col_date].dt.date
    daily_agg = (
        df_filtered
        .groupby('date_only')['sentiment_score']
        .agg(['mean', 'count'])
        .reset_index()
    )

    # 6. Save
    daily_agg.to_csv('../data/daily_sentiment_lm_final.csv', index=False)
    print("Fichier daily_sentiment_lm_final.csv généré !")

Lignes trouvées : 370399
Calcul du sentiment Loughran-McDonald (Finance Specific)...


100%|██████████| 370399/370399 [00:18<00:00, 19546.65it/s]


Fichier daily_sentiment_lm_final.csv généré !
