In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("../data/fake_or_real_news.csv")

# Dataset overview
print(df.head())
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()
print(df['label'].value_counts())

# Distribution of text lengths
# .str.len() gives NaN for missing texts, whereas apply(len) would raise
# TypeError on them (a missing value is a float, which has no len()).
df['text_length'] = df['text'].str.len()
plt.figure(figsize=(10,5))
sns.histplot(df['text_length'], bins=50)
plt.title("Distribution de la longueur des textes")
plt.show()

# Exemple de mots fréquents pour FAKE et REAL
from sklearn.feature_extraction.text import CountVectorizer
from src.preprocessing import clean_text

def plot_top_words(df, label, n=20):
    """Plot a bar chart of the ``n`` most frequent words for one label.

    Rows of ``df`` whose ``'label'`` column equals ``label`` are selected,
    their ``'text'`` values are passed through ``clean_text`` (imported from
    ``src.preprocessing``; its exact behaviour is not visible here) and word
    occurrences are counted with ``CountVectorizer``. The counts go on the
    x axis and the words on the y axis.

    Args:
        df: DataFrame with a ``'label'`` and a ``'text'`` column.
        label: Value of the ``'label'`` column to keep (e.g. ``"FAKE"``).
        n: Number of words to display; must be at least 1.

    Raises:
        ValueError: If ``n`` is smaller than 1, or if no row of ``df``
            carries ``label``.
    """
    # Reject nonsensical sizes up front: n == 0 used to fail later on the
    # zip() unpacking and a negative n silently sliced off the *last* words.
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    raw_texts = df.loc[df['label'] == label, 'text']
    # Fail early with an explicit message instead of letting the vectorizer
    # choke on an empty corpus.
    if raw_texts.empty:
        raise ValueError(f"no rows with label {label!r}")
    texts = raw_texts.apply(clean_text)

    # Keep the original 1000-word vocabulary cap, but widen it when the
    # caller asks for more words than that so the result is not truncated.
    cv = CountVectorizer(max_features=max(n, 1000))
    X = cv.fit_transform(texts)
    # Column-wise totals; indexed below as [0, column].
    sum_words = X.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in cv.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)[:n]

    words, freqs = zip(*words_freq)
    plt.figure(figsize=(10,5))
    sns.barplot(x=freqs, y=words)
    plt.title(f"Top {n} mots pour {label}")
    plt.show()

# Draw the most frequent words for each class, FAKE first and then REAL.
for news_label in ("FAKE", "REAL"):
    plot_top_words(df, news_label)