In [1]:
# Data wrangling
import pandas as pd
pd.options.display.max_colwidth = 3000
from unidecode import unidecode
import re

# Text
import nltk  
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
set(stopwords.words('french'))

# NLP
import spacy.cli
nlp = spacy.load("fr_core_news_sm")

## Read and preprocess data

In [2]:
corpus_complet = pd.read_csv('corpus_FE.csv')

In [3]:
print('Total number of records in 2011-2021 dataset: ', corpus_complet.shape)

Total number of records in 2011-2021 dataset:  (47162, 9)


In [4]:
pd.options.display.max_colwidth = 3000

## Word usage 

#### Preparing text for the analysis: lowercase, removing stopwords, special characters and numbers, lemmatisation

In [26]:
# transfroming to string
corpus_complet['text_as_string'] = corpus_complet['content'].astype(str) 

In [29]:
stop_words = open('stopwords-fr.txt','r', encoding="utf-8").read().split('\n')

In [30]:
def clean_text(text):
    # remove accents of text
    text=unidecode(text)
    # lowercase
    text=text.lower()
    text = re.sub(r'\d+', '', text) # remove numbers
    # characters
    text=re.sub(r'[^\sa-zA-Z0-9@\[\]]',' ', text) # remove characters: punctuation and other special characters
    text=re.sub(r"https*\S+", '', text) # remove links
    text = re.sub(r"\sd\s", " ", text)
    text = re.sub(r"\sf\s", " ", text)
    text = re.sub(r"\si\s", " ", text)
    text = re.sub(r"\sm\s", " ", text)
    text = re.sub(r"\ss\s", " ", text)
    text = re.sub(r"\st\s", " ", text)
    text = re.sub(r"\su\s", " ", text)
    text = re.sub(r"\sy\s", " ", text)
    text = re.sub(r"\sl\s", " ", text)
    text = re.sub(r"\sc\s", " ", text)
    text = re.sub(r"\sa\s", " ", text)
    text = re.sub(r"\sn\s", " ", text)
    text = re.sub(r"\sr\s", " ", text)
    text = re.sub(r"\se\s", " ", text)
    text = ' '.join([word for word in text.split(' ') if word not in stop_words])
    text = " ".join([word for word in text.split()])
    # remove stopwords
    return text

In [31]:
corpus_complet['cleaned_text'] = corpus_complet.text_as_string.apply(lambda x: clean_text(x))

In [32]:
corpus_complet['cleaned_text'].head(20)

0     supplement digital nouvelle star xxie siecle elon musk croit capable declencher troisieme guerre mondiale environnement medecine armement partout intelligence artificielle affole partout imperturbable microcosme creation publicitaire putsch demain president betc stephane xiberras exemple question effets annonce joue amalgame veritable intelligence artificielle capable penser automatisation taches analyse donnees instant jamais permis creer constat approuve francois brogi vp creation agence data artefact crois pouvoir emotion surprise audace traits ia suffisamment dotee puisqu base apprentissage existant jean philippe desbiolles vice president ibm watson tranche question considerant saut creatif bel humain actuellement maturite agir systemes supervises crois complementarite createur ia devenant formidable stimulant maniere repondre besoins hyperpersonnalisation consommateur crise confiance outils soutien ia generer acte creatif soutenir aide effective ensemble chaine commencer ins

In [33]:
corpus_complet.to_csv('corpus_clean.csv')

## Visualization of the word usage

In [34]:
mylist = []
for string in corpus_complet['cleaned_text']:
    mylist.append(string)
mylist

new_text = str(mylist)

In [35]:
# Tokenization
txt_tokens = word_tokenize(new_text)
txt_tokens = [word.lower() for word in txt_tokens if word.isalpha()]
txt_tokens = [word for word in txt_tokens if not word in stop_words]

In [36]:
# Creation of the frequency list
fdist = FreqDist(txt_tokens)

In [37]:
# Frequency Distribution Plot
df_fdist = pd.DataFrame(fdist.items(), columns=['word', 'frequency'])

#top 30 words 
df_fdist30 = df_fdist.sort_values(['frequency'], ascending=False).head(50)
df_fdist30

Unnamed: 0,word,frequency
17,intelligence,74925
18,artificielle,69147
60,ia,53336
41,donnees,47891
644,ete,34386
240,faire,33356
139,france,30334
609,entreprises,25435
174,monde,24840
370,ans,24080


## Bigrams

In [125]:
# Creation of the bigrams dataframe
bigrams_series = (pd.Series(nltk.ngrams(txt_tokens, 2)).value_counts())
bigrams = pd.DataFrame(bigrams_series.sort_values(ascending=False))
bigrams = bigrams_top.reset_index().rename(columns={'index': 'bigram', 0:'count'})


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [126]:
bigrams_top.head(20)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



Unnamed: 0,bigram,count
0,"(intelligence, artificielle)",68015
1,"(start, up)",16752
2,"(etats, unis)",10105
3,"(machine, learning)",6933
4,"(millions, euros)",6572
5,"(artificielle, ia)",5014
6,"(temps, reel)",4951
7,"(big, data)",4771
8,"(chiffre, affaires)",4344
9,"(reseaux, sociaux)",4048


In [130]:
bigrams['bigram'] = bigrams['bigram'].astype(str)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [134]:
bigrams['bigram'] = bigrams['bigram'].str.replace("(", '')
bigrams'bigram'] = bigrams['bigram'].str.replace(")", '')
bigrams['bigram'] = bigrams['bigram'].str.replace(",", '_')
bigrams['bigram'] = bigrams['bigram'].str.replace(" ", '')
bigrams['bigram'] = bigrams['bigram'].str.replace("'", '')


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.



In [137]:
bigrams = bigrams.drop([bigrams.index[0] , bigrams.index[5]])


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [141]:
fig = px.bar(bigrams[:40], x='count', y='bigram', title='Counts of top bigrams', template='plotly_white')
fig.show()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



## Additional cleaning - Text lemmatization

In [40]:
corpus_sample = corpus_complet.sample(n = 200)

In [46]:
mylist = []
for string in corpus_complet['cleaned_text']:
    mylist.append(string)
mylist

new_text = str(mylist)

In [42]:
doc = nlp(new_text)

In [None]:
corpus_complet['lemmatized'] = corpus_complet.cleaned_text.apply(lambda x: " ".join([y.lemma_ for y in nlp(x)]))

In [47]:
corpus_sample['lemmatized'] = corpus_sample.cleaned_text.apply(lambda x: " ".join([y.lemma_ for y in nlp(x)]))

In [45]:
pd.options.display.max_colwidth = 100
print(corpus_sample[['content', 'cleaned_text','lemmatized']].head(5))

                                                                                                   content  \
32298  Le Carré Michelet à La Défense, détenu par Gecina    On rembobine les actus de la semaine. Comme...   
5624   L'IA, un nouvel arrivant qui dérange L'adoption de l'IA soulève plusieurs challenges. Tout d'abo...   
17157  Après l'expert rennais de la chimie verte Demeta, lundi, c'est la start-up grenobloise TiHive, s...   
15106  1Gagner du temps grâce à Locat'me Locat'me, qui fonctionne comme un site de rencontres, met en r...   
2331   Cet article est issu du magazine Sciences et Avenir n°880, daté juin 2020  Covid-19 : ce que l'o...   

                                                                                              cleaned_text  \
32298  carre michelet defense detenu gecina rembobine actus semaine vendredis cfnews immo infra passe r...   
5624   ia nouvel arrivant derange adoption ia souleve challenges idee opinion humain peur superiorite r...   
17157  ex