# TP N°03 : Techniques Avancées de Prétraitement de Textes (devoir maison)

### Importation des bibliothèques nécessaires

In [48]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import re

### Téléchargement des ressources NLTK nécessaires

In [49]:

# Fetch the NLTK resources used by the tokenizers, the POS tagger and the lemmatizer.
# Recent NLTK releases (3.9+) look up 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# instead of the older pickled 'punkt' and 'averaged_perceptron_tagger' packages, so both
# generations are requested to keep the notebook runnable whichever version is installed.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)


[nltk_data] Downloading package punkt to C:\Users\user.user-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\user.user-PC\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\user.user-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### importation du corpus

In [50]:
# Read the corpus; header=None makes pandas treat the first row as data, not as column names
chemin_corpus = "data/all-data.csv"
df = pd.read_csv(chemin_corpus, header=None)

### Renommer les colonnes

In [51]:
# Replace the frame's column labels with explicit names (assigned in place on df)
noms_colonnes = ['colonne1', 'colonne2']
df.columns = noms_colonnes

In [52]:
# Preview the first rows of the corpus frame (DataFrame.head shows 5 rows by default)
df.head()

Unnamed: 0,colonne1,colonne2
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


# ------------------------------------------------------------------------------------------------------------

### Texte à Prétraiter

In [53]:
# Concatenate the first ten entries of 'colonne2' into one space-separated string
premieres_phrases = df['colonne2'].iloc[:10]
text = ' '.join(premieres_phrases)

In [54]:
# Display the concatenated text that will be preprocessed
text


"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing . Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said . The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported . With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability . According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales . FINANCING OF ASPOCOMP 'S GROWTH Aspocomp

# ------------------------------------------------------------------------------------------------------------

# Étape 1 : Pré-traitement

### 1). Nettoyage du Texte

In [68]:
# Remove HTML tags (non-greedy, so each <...> pair is removed separately)
text_cleaned = re.sub(r'<.*?>', '', text)
# Remove e-mail addresses. The top-level-domain class is [A-Za-z]: inside a character
# class '|' is a literal pipe, not an alternation, so the former [A-Z|a-z] also accepted '|'.
text_cleaned = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', text_cleaned)

### 2). Normalisation du Texte

In [56]:
# Normalize case: convert the cleaned text to lowercase
text_normalized = text_cleaned.lower()

### 3). Segmentation en Phrases et Tokenisation

In [57]:
# Split the normalized text into sentences, then break each sentence into word tokens
sentences = sent_tokenize(text_normalized)
sentences_tokens = list(map(word_tokenize, sentences))

### 4). Suppression de la Ponctuation et Création des nouvelles données

In [58]:
# WordNet lemmatizer instance
lemmatizer = WordNetLemmatizer()

# Maps the first letter of an NLTK POS tag to the matching WordNet POS code
POS_TAG_MAP = dict(
    N='n',  # noun
    V='v',  # verb
    R='r',  # adverb
    J='a',  # adjective
)

In [59]:
# Convert NLTK POS tags to the ones used by WordNet
def get_wordnet_pos(tag):
    """Return the WordNet POS code for an NLTK POS tag.

    Only the first character of ``tag`` is looked up in ``POS_TAG_MAP``;
    when it has no entry there, the noun code ``'n'`` is returned.
    """
    initial = tag[0]
    if initial in POS_TAG_MAP:
        return POS_TAG_MAP[initial]
    return 'n'

# Build one [token, lemma, pos] row per alphanumeric token, sentence by sentence.
data = []
for sentence in sentences_tokens:
    # Tag the complete sentence in a single call: the tagger uses neighbouring words as
    # context, which is lost (and the tagger re-invoked needlessly) when each token is
    # tagged on its own.
    for token, pos in nltk.pos_tag(sentence):
        if token.isalnum():  # drop punctuation and any other non-alphanumeric token
            lemma = lemmatizer.lemmatize(token, get_wordnet_pos(pos))
            data.append([token, lemma, pos])
    data.append([' ', ' ', ' '])  # blank row separating consecutive sentences


# ------------------------------------------------------------------------------------------------------------

# Étape 2 : Conversion sous forme de DataFrame Pandas

In [60]:
# Turn the collected [token, lemma, pos] rows into a DataFrame.
# NOTE: this rebinds `df`, which earlier in the notebook held the raw corpus.
df = pd.DataFrame(data, columns=['token', 'lemme', 'pos'])

In [61]:
# Drop the trailing blank separator row left after the last sentence. The guard makes the
# cell safe to re-run: a bare df[:-1] would remove one more real row on every execution.
if not df.empty and (df.iloc[-1] == ' ').all():
    df = df.iloc[:-1]

In [63]:
# Preview the first rows of the token / lemma / POS table
df.head()

Unnamed: 0,token,lemme,pos
0,according,accord,VBG
1,to,to,TO
2,gran,gran,NN
3,the,the,DT
4,company,company,NN


In [65]:
# chemin_fichier_csv = 'data/df_du_tp3.csv'
# df.to_csv(chemin_fichier_csv, index=False)