# TP N°03 : Techniques Avancées de Prétraitement de Textes

In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import re

In [2]:
# Téléchargement des ressources NLTK nécessaires
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to C:\Users\user.user-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\user.user-PC\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\user.user-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Texte à pré-traiter
text = """
Discover our current <b>special offers</b>! Over 1000 products on promotion. 
Contact: <a href="mailto:service@sale.com">service@sale.com</a>. 
Follow us on <a href="https://www.facebook.com">Facebook</a>! 
Terms and conditions of sale and legal notices available at the bottom of the page.
"""

In [4]:
# 1. Nettoyage du Texte
# Suppression des balises HTML
text_cleaned = re.sub(r'<.*?>', '', text)
# Suppression des adresses email
text_cleaned = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text_cleaned) 


In [5]:
# 2. Normalisation du Texte
text_normalized = text_cleaned.lower()

In [6]:
# 3. Segmentation en Phrases et Tokenisation
sentences = sent_tokenize(text_normalized)
sentences_tokens = [word_tokenize(sentence) for sentence in sentences]
#print(tokens)

In [7]:
# 4. Suppression de la Ponctuation et Création des nouvelles données
lemmatizer = WordNetLemmatizer()
# Dictionnaire pour la correspondance entre les POS tags de NLTK et ceux de WordNet
POS_TAG_MAP = {
    'N': 'n',  # Noun
    'V': 'v',  # Verb
    'R': 'r',  # Adverb
    'J': 'a'   # Adjective
}


In [8]:
# Convertir les POS tags de NLTK à ceux utilisés par WordNet
def get_wordnet_pos(tag):
    return POS_TAG_MAP.get(tag[0], 'n') # si aucune correspondance, attribuer le POS 'n' par défaut

data = []
for sentence in sentences_tokens:
    for token in sentence:
        if token.isalnum():  # Suppression de la ponctuation
            pos = nltk.pos_tag([token])[0][1]
            lemma = lemmatizer.lemmatize(token, get_wordnet_pos(pos))
    
            data.append([token, lemma, pos])
    data.append([' ', ' ', ' '])  # Ligne pour séparer les phrases


In [9]:
# 5. Construction du DataFrame
df = pd.DataFrame(data, columns=['token', 'lemme', 'pos'])


# Suppression de la dernière ligne de séparation

In [10]:

df = df[:-1]



### Affichage du DataFrame

In [11]:
df

Unnamed: 0,token,lemme,pos
0,discover,discover,NN
1,our,our,PRP$
2,current,current,JJ
3,special,special,JJ
4,offers,offer,NNS
5,,,
6,over,over,IN
7,1000,1000,CD
8,products,product,NNS
9,on,on,IN
