In [1]:
# Installation des bibliothèques nécessaires
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import Counter

In [2]:
# Téléchargement des ressources NLTK nécessaires
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Chargement des données depuis le fichier sample.csv
df = pd.read_csv('sample.csv')

In [4]:
# Affichage de la colonne text_cleaned avant nettoyage
print("Avant nettoyage :")
print(df['text'])

Avant nettoyage :
0     @AppleSupport causing the reply to be disregar...
1     @105835 Your business means a lot to us. Pleas...
2     @76328 I really hope you all change but I'm su...
3     @105836 LiveChat is online at the moment - htt...
4     @VirginTrains see attached error message. I've...
                            ...                        
88    @105860 I wish Amazon had an option of where I...
89    They reschedule my shit for tomorrow https://t...
90    @105861 Hey Sara, sorry to hear of the issues ...
91    @Tesco bit of both - finding the layout cumber...
92    @105861 If that doesn't help please DM your fu...
Name: text, Length: 93, dtype: object


In [5]:
# Mise en minuscules
df['text_cleaned'] = df['text'].apply(lambda x: x.lower())

# Affichage de la colonne text_cleaned après mise en minuscules
print("\nAprès mise en minuscules :")
print(df['text_cleaned'])


Après mise en minuscules :
0     @applesupport causing the reply to be disregar...
1     @105835 your business means a lot to us. pleas...
2     @76328 i really hope you all change but i'm su...
3     @105836 livechat is online at the moment - htt...
4     @virgintrains see attached error message. i've...
                            ...                        
88    @105860 i wish amazon had an option of where i...
89    they reschedule my shit for tomorrow https://t...
90    @105861 hey sara, sorry to hear of the issues ...
91    @tesco bit of both - finding the layout cumber...
92    @105861 if that doesn't help please dm your fu...
Name: text_cleaned, Length: 93, dtype: object


In [6]:
# Suppression des ponctuations
df['text_cleaned'] = df['text_cleaned'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Affichage de la colonne text_cleaned après suppression des ponctuations
print("\nAprès suppression des ponctuations :")
print(df['text_cleaned'])


Après suppression des ponctuations :
0     applesupport causing the reply to be disregard...
1     105835 your business means a lot to us please ...
2     76328 i really hope you all change but im sure...
3     105836 livechat is online at the moment  https...
4     virgintrains see attached error message ive tr...
                            ...                        
88    105860 i wish amazon had an option of where i ...
89    they reschedule my shit for tomorrow httpstcor...
90    105861 hey sara sorry to hear of the issues yo...
91    tesco bit of both  finding the layout cumberso...
92    105861 if that doesnt help please dm your full...
Name: text_cleaned, Length: 93, dtype: object


In [7]:
# Suppression des mots vides (stopwords)
stop_words = set(stopwords.words('english'))
df['text_cleaned'] = df['text_cleaned'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

# Affichage de la colonne text_cleaned après suppression des mots vides
print("\nAprès suppression des mots vides :")
print(df['text_cleaned'])


Après suppression des mots vides :
0     applesupport causing reply disregarded tapped ...
1     105835 business means lot us please dm name zi...
2            76328 really hope change im sure wont dont
3     105836 livechat online moment httpstcosy94vtu8...
4     virgintrains see attached error message ive tr...
                            ...                        
88    105860 wish amazon option get shipped ups stor...
89          reschedule shit tomorrow httpstcorsvzct982t
90    105861 hey sara sorry hear issues ask lay spee...
91    tesco bit finding layout cumbersome removing i...
92    105861 doesnt help please dm full name address...
Name: text_cleaned, Length: 93, dtype: object


In [8]:
# Suppression des émojis, émoticônes, URL, balises HTML (utilisation d'expressions régulières)
df['text_cleaned'] = df['text_cleaned'].apply(lambda x: re.sub(r'http\S+|www\S+|<[^>]+>', '', x))
df['text_cleaned'] = df['text_cleaned'].apply(lambda x: re.sub(r'[^\w\s]','',x))

# Affichage de la colonne text_cleaned après suppression des émojis, émoticônes, URL, balises HTML
print("\nAprès suppression des émojis, émoticônes, URL, balises HTML :")
print(df['text_cleaned'])


Après suppression des émojis, émoticônes, URL, balises HTML :
0     applesupport causing reply disregarded tapped ...
1     105835 business means lot us please dm name zi...
2            76328 really hope change im sure wont dont
3     105836 livechat online moment  contact 03331 0...
4     virgintrains see attached error message ive tr...
                            ...                        
88    105860 wish amazon option get shipped ups stor...
89                            reschedule shit tomorrow 
90    105861 hey sara sorry hear issues ask lay spee...
91    tesco bit finding layout cumbersome removing i...
92    105861 doesnt help please dm full name address...
Name: text_cleaned, Length: 93, dtype: object


In [9]:
# Suppression des mots extrêmement fréquents
# Compter la fréquence de chaque mot dans le corpus
word_freq = Counter()
df['text_cleaned'].str.split().apply(word_freq.update)

# Déterminer les mots extrêmement fréquents (par exemple, les mots qui apparaissent dans plus de 90% des documents)
total_docs = len(df)
common_words = [word for word, freq in word_freq.items() if freq > total_docs * 0.9]

# Supprimer les mots extrêmement fréquents
df['text_cleaned'] = df['text_cleaned'].apply(lambda x: ' '.join(word for word in x.split() if word not in common_words))

# Affichage de la colonne text_cleaned après suppression des mots extrêmement fréquents
print("\nAprès suppression des mots extrêmement fréquents :")
print(df['text_cleaned'])


Après suppression des mots extrêmement fréquents :
0     applesupport causing reply disregarded tapped ...
1     105835 business means lot us please dm name zi...
2            76328 really hope change im sure wont dont
3     105836 livechat online moment contact 03331 03...
4     virgintrains see attached error message ive tr...
                            ...                        
88    105860 wish amazon option get shipped ups stor...
89                             reschedule shit tomorrow
90    105861 hey sara sorry hear issues ask lay spee...
91    tesco bit finding layout cumbersome removing i...
92    105861 doesnt help please dm full name address...
Name: text_cleaned, Length: 93, dtype: object


In [10]:
# Suppression des mots rares
# Déterminer les mots rares (par exemple, les mots qui apparaissent dans moins de 5 documents)
rare_words = [word for word, freq in word_freq.items() if freq < 5]

# Supprimer les mots rares
df['text_cleaned'] = df['text_cleaned'].apply(lambda x: ' '.join(word for word in x.split() if word not in rare_words))

# Affichage de la colonne text_cleaned après suppression des mots rares
print("\nAprès suppression des mots rares :")
print(df['text_cleaned'])


Après suppression des mots rares :
0                          applesupport
1                     us please dm name
2                                      
3                                  back
4                             ive tried
                    ...                
88                                  get
89                                     
90                                sorry
91                                tesco
92    help please dm name device thanks
Name: text_cleaned, Length: 93, dtype: object


In [11]:
# Stemming
stemmer = PorterStemmer()
df['text_cleaned'] = df['text_cleaned'].apply(lambda x: ' '.join(stemmer.stem(word) for word in x.split()))

# Affichage de la colonne text_cleaned après stemming
print("\nAprès stemming :")
print(df['text_cleaned'])


Après stemming :
0                       applesupport
1                   us pleas dm name
2                                   
3                               back
4                            ive tri
                   ...              
88                               get
89                                  
90                             sorri
91                             tesco
92    help pleas dm name devic thank
Name: text_cleaned, Length: 93, dtype: object


In [12]:
# Ajout de la colonne 'tweet_id' avec des valeurs par défaut pour l'exemple
df['tweet_id'] = range(1, len(df) + 1)

# Réorganiser les colonnes dans l'ordre requis
df = df[['tweet_id', 'text', 'text_cleaned']]

# Affichage du dataframe final
print(df)

    tweet_id                                               text  \
0          1  @AppleSupport causing the reply to be disregar...   
1          2  @105835 Your business means a lot to us. Pleas...   
2          3  @76328 I really hope you all change but I'm su...   
3          4  @105836 LiveChat is online at the moment - htt...   
4          5  @VirginTrains see attached error message. I've...   
..       ...                                                ...   
88        89  @105860 I wish Amazon had an option of where I...   
89        90  They reschedule my shit for tomorrow https://t...   
90        91  @105861 Hey Sara, sorry to hear of the issues ...   
91        92  @Tesco bit of both - finding the layout cumber...   
92        93  @105861 If that doesn't help please DM your fu...   

                      text_cleaned  
0                     applesupport  
1                 us pleas dm name  
2                                   
3                             back  
4         