# TP4

# Clustering du corpus global

In [None]:
#Pour le clustering
import collections
import os
import string
import sys

import pandas as pd
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
from pprint import pprint
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine

In [None]:
data_path = "../tp4/data/data_all/"

# Keep only the directory entries whose name contains an underscore,
# in sorted order so that `files` and `texts` stay aligned and reproducible.
files = [name for name in sorted(os.listdir(data_path)) if "_" in name]

texts = []
for name in files:
    path = data_path + name
    try:
        # Most files are expected to decode as UTF-8.
        with open(path, "r", encoding="utf-8") as handle:
            content = handle.read()
    except UnicodeDecodeError:
        # Fall back to Latin-1 for files that are not valid UTF-8.
        with open(path, "r", encoding="latin-1") as handle:
            content = handle.read()
    texts.append(content)


# Vectoriser les documents à l'aide de TF-IDF

# Création d'une fonction de pré-traitement
# Translation table mapping every ASCII punctuation character to a space.
# A space (rather than deletion) keeps word boundaries intact, e.g.
# "l'homme" -> "l homme" instead of "lhomme".
_PUNCTUATION_TO_SPACE = str.maketrans(
    string.punctuation, " " * len(string.punctuation)
)


def preprocessing(text, stem=True):
    """Strip ASCII punctuation from *text* and split it into tokens.

    Args:
        text: The raw document as a string.
        stem: Accepted for backward compatibility; currently ignored
            (no stemming is performed).

    Returns:
        The list of tokens produced by ``word_tokenize`` on the cleaned text.
    """
    # str.translate needs a table built with str.maketrans. Passing
    # string.punctuation directly made it act as a lookup table indexed by
    # code point: punctuation was left untouched while control characters
    # such as "\n" were turned into punctuation marks.
    text = text.translate(_PUNCTUATION_TO_SPACE)
    tokens = word_tokenize(text)
    return tokens


# TF-IDF model: custom tokenizer, French stop words, lower-casing, and
# document-frequency bounds given as proportions of the corpus
# (min_df=0.1, max_df=0.5).
vectorizer = TfidfVectorizer(
    lowercase=True,
    tokenizer=preprocessing,
    stop_words=stopwords.words('french'),
    min_df=0.1,
    max_df=0.5,
)

# Learn the vocabulary and build the document-term matrix in one step
# with `fit_transform`.
tfidf_vectors = vectorizer.fit_transform(texts)




In [None]:
# Choose the number of clusters by trial and error.
N_CLUSTERS = 2

km_model = KMeans(n_clusters=N_CLUSTERS)

# One cluster label per document, in the same order as `files`.
clusters = km_model.fit_predict(tfidf_vectors)

# Group the file names by the cluster they were assigned to.
clustering = collections.defaultdict(list)
for idx, label in enumerate(clusters):
    clustering[label].append(files[idx])

# Project the TF-IDF vectors onto two principal components for plotting
# (toarray() converts the matrix to a dense array first).
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(tfidf_vectors.toarray())

x_axis = reduced_vectors[:, 0]
y_axis = reduced_vectors[:, 1]

plt.figure(figsize=(10, 10))
scatter = plt.scatter(x_axis, y_axis, s=100, c=clusters)

# Add the centroids, projected with the same PCA as the documents.
centroids = pca.transform(km_model.cluster_centers_)
plt.scatter(centroids[:, 0], centroids[:, 1], marker="x", s=100, linewidths=2, color='black')

# Build the legend from the labels that were actually assigned, so it stays
# correct when N_CLUSTERS is changed. num=None makes legend_elements return
# one handle per unique label (in sorted order), matching sorted(clustering).
legend_handles, _ = scatter.legend_elements(num=None)
legend_labels = [f"Cluster {label + 1}" for label in sorted(clustering)]
plt.legend(handles=legend_handles, labels=legend_labels)
plt.title("Figure 3. Clustering du corpus complet")
plt.show()

# Correspondance clusters - orientation politique

In [None]:
data_path_right = "../tp4/data/data_right/"
data_path_left = "../tp4/data/data_left/"


def _load_corpus(directory):
    """Read every file in *directory* whose name contains an underscore.

    Files are processed in sorted name order. Each file is decoded as UTF-8,
    falling back to Latin-1 when UTF-8 decoding fails.

    Args:
        directory: Path of the folder to read.

    Returns:
        A ``(names, contents)`` tuple of two lists of equal length:
        the selected file names and the text of each file.
    """
    names = [name for name in sorted(os.listdir(directory)) if "_" in name]
    contents = []
    for name in names:
        # os.path.join works whether or not `directory` ends with a separator.
        path = os.path.join(directory, name)
        try:
            with open(path, "r", encoding="utf-8") as handle:
                content = handle.read()
        except UnicodeDecodeError:
            with open(path, "r", encoding="latin-1") as handle:
                content = handle.read()
        contents.append(content)
    return names, contents


# Load the documents of the 'data_right' folder
files_right, texts_right = _load_corpus(data_path_right)

# Load the documents of the 'data_left' folder
files_left, texts_left = _load_corpus(data_path_left)

# Vectoriser les documents à l'aide de TF-IDF

# Création d'une fonction de pré-traitement
# Translation table mapping every ASCII punctuation character to a space.
# A space (rather than deletion) keeps word boundaries intact, e.g.
# "l'homme" -> "l homme" instead of "lhomme".
_PUNCTUATION_TO_SPACE = str.maketrans(
    string.punctuation, " " * len(string.punctuation)
)


def preprocessing(text, stem=True):
    """Strip ASCII punctuation from *text* and split it into tokens.

    Args:
        text: The raw document as a string.
        stem: Accepted for backward compatibility; currently ignored
            (no stemming is performed).

    Returns:
        The list of tokens produced by ``word_tokenize`` on the cleaned text.
    """
    # str.translate needs a table built with str.maketrans. Passing
    # string.punctuation directly made it act as a lookup table indexed by
    # code point: punctuation was left untouched while control characters
    # such as "\n" were turned into punctuation marks.
    text = text.translate(_PUNCTUATION_TO_SPACE)
    tokens = word_tokenize(text)
    return tokens

# Instancier le modèle TF-IDF avec ses arguments

vectorizer = TfidfVectorizer(
    tokenizer=preprocessing,
    stop_words=stopwords.words('french'),
    max_df=0.5,
    min_df=0.1,
    lowercase=True)

# Fit the vocabulary and IDF weights once, on both corpora together, then
# transform each corpus separately. Calling fit_transform on each corpus
# would re-fit the vectorizer every time, giving the two matrices different
# vocabularies (different columns), so they could not be compared.
vectorizer.fit(texts_right + texts_left)

tfidf_vectors_right = vectorizer.transform(texts_right)
tfidf_vectors_left = vectorizer.transform(texts_left)



In [None]:
# Run KMeans separately on the documents of each folder, with a single
# cluster per folder. Note that this rebinds the module-level N_CLUSTERS.
N_CLUSTERS = 1

km_model_right = KMeans(n_clusters=N_CLUSTERS)
clusters_right = km_model_right.fit_predict(tfidf_vectors_right)

km_model_left = KMeans(n_clusters=N_CLUSTERS)
clusters_left = km_model_left.fit_predict(tfidf_vectors_left)

# Représentation graphique avec différenciation des dossiers
# ...
# Votre code pour la représentation graphique en utilisant les clusters_right et clusters_left pour distinguer les dossiers
# ...


In [None]:
# Dimensionality reduction for plotting.
# NOTE(review): each corpus gets its own PCA fit, so the two point clouds are
# projected onto different components even though they share one figure.
pca_right = PCA(n_components=2)
reduced_vectors_right = pca_right.fit_transform(tfidf_vectors_right.toarray())

pca_left = PCA(n_components=2)
reduced_vectors_left = pca_left.fit_transform(tfidf_vectors_left.toarray())

# Point coordinates for each folder (first and second component).
x_axis_right, y_axis_right = reduced_vectors_right[:, 0], reduced_vectors_right[:, 1]
x_axis_left, y_axis_left = reduced_vectors_left[:, 0], reduced_vectors_left[:, 1]

# Create the figure.
plt.figure(figsize=(10, 10))

# One colour per folder: 'data_right' in blue, 'data_left' in red.
scatter_right = plt.scatter(
    x_axis_right,
    y_axis_right,
    s=10,
    c='blue',
    label='Presse de droite',
)
scatter_left = plt.scatter(
    x_axis_left,
    y_axis_left,
    s=10,
    c='red',
    label='Presse de gauche',
)

# Project each folder's centroid with that folder's own PCA, then mark it
# with a black cross.
centroids_right = pca_right.transform(km_model_right.cluster_centers_)
centroids_left = pca_left.transform(km_model_left.cluster_centers_)
for projected_centroids in (centroids_right, centroids_left):
    plt.scatter(
        projected_centroids[:, 0],
        projected_centroids[:, 1],
        marker="x",
        s=100,
        linewidths=2,
        color='black',
    )

# Legend and title.
plt.legend(handles=[scatter_right, scatter_left], title="Orientation politique")
plt.title("Figure 4. Distribution des documents selon l'orientation politique")
plt.show()
