In [1]:
!pip install nltk
!pip install gensim

Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting regex>=2021.8.3
  Downloading regex-2022.3.15-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (763 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m763.2/763.2 KB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: regex, nltk
Successfully installed nltk-3.7 regex-2022.3.15
Collecting gensim
  Downloading gensim-4.1.2-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting smart-open>=1.8.1
  Downloading smart_open-5.2.1-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.6/58.6 KB[0m [31m3.2 MB/s[0m eta [

In [38]:
import re
import string
import nltk
import pandas as pd
import gensim
import numpy as np

from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.snowball import FrenchStemmer

In [6]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("nlp").getOrCreate()

In [24]:
french_stopwords = stopwords.words('french')
french_stopwords.extend(["Replying to", "Replying", "to", 'replying'])
filtre_stopfr =  lambda text: [token for token in text if token.lower() not in french_stopwords]

def enlever_ponctuation(_str):
  _str = re.sub('['+string.punctuation+']', ' ', _str)
  _str = re.sub('[\r]', ' ', _str)
  _str = re.sub('[\n]', ' ', _str)
  _str = re.sub('[«"]', ' ', _str)
  return _str

def enlever_filler(str):
  return filtre_stopfr(word_tokenize(str))

def extraire_hashtags(str):
    hashtag_list = []
    for word in str.split():
        if word[0] == '#':
            hashtag_list.append(word[1:])
    return hashtag_list

def tokenization(_str):
    return enlever_filler(enlever_ponctuation(_str))

In [25]:
vaccins = pd.read_csv(r'vaccinsFull.csv')
print(vaccins)

      Unnamed: 0                            UserScreenName         UserName  \
0              0                         augustin barbeaux    @barbeauroch5   
1              1                             Bayembi_borny    @BayembiBorny   
2              2                           Merry Socialist  @Socialistesque   
3              3                               incesticide  @apolitique2022   
4              4                          Arkhos94 #vaccin        @arkhos94   
...          ...                                       ...              ...   
3487         257  Francki France N'OUBLIONS PAS LE DONBASS  @france_francki   
3488         258                                   Lorenzo  @SaintExpedit78   
3489         259                  JPB #TouchePasAuxEnfants        @jpb_ufic   
3490         260                             Thierry henry       @grissom91   
3491         261                                    Ⓥ 𝔐𝔞𝔫𝔲      @Emmanimals   

                     Timestamp  \
0     2022-01-01T

In [26]:
hashtags=[]
mot_cles=[]
for _, row in vaccins.iterrows():
    hashtags.append(extraire_hashtags(row.Embedded_text))
    mot_cles.append(tokenization(row.Embedded_text))
    

In [27]:
vaccins = vaccins.assign(hashtags = hashtags)
vaccins = vaccins.assign(mot_cles = mot_cles)

In [40]:
# https://embeddings.net/embeddings/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin la source du modele Word2Vec
wv = gensim.models.KeyedVectors.load_word2vec_format('frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin', binary=True)

In [41]:
wv.most_similar("père")

[('fils', 0.8278831243515015),
 ('frère', 0.8005436658859253),
 ('mère', 0.77669757604599),
 ('grand-père', 0.7313958406448364),
 ('aîné', 0.7286756038665771),
 ('oncle', 0.7015424966812134),
 ('adoptif', 0.6922641396522522),
 ('beau-père', 0.6607111096382141),
 ('paternel', 0.6535272598266602),
 ('orphelin', 0.6337324976921082)]

In [54]:
def featureVecMethod(words, model):
    # Pre-initialising empty numpy array for speed
    num_features=model.vector_size
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.index_to_key)
    not_in_model = []
    for word in words:
        if word in index2word_set:
            nwords = nwords + 1
            v = model[word]
            #print(v)
            if np.isnan(v).any():
                print(word, v)
            featureVec = featureVec + model[word]
        else:
            not_in_model.append(word)
    #Here we can see if some of the words are not in the model, if so, we cannot use them for the clustering       
    print(not_in_model)
    # Dividing the result by number of words to get average
    if nwords != 0:
        featureVec = featureVec/nwords
    return featureVec

['DictatureSanitaire', 'DictatureEnMarche', 'Député', 'Parlement', 'Big', 'Pharma', 'Quand', '2']


array([ 0.04826047,  0.07433414, -0.11250746, -0.02080359,  0.05250414,
       -0.07087479, -0.1988162 , -0.0129333 ,  0.08884215,  0.2574999 ,
        0.12002086,  0.09143151,  0.09375057,  0.02924548, -0.03668779,
        0.20636545, -0.26605338,  0.08967759,  0.06466752,  0.02635627,
        0.12066436,  0.00485327, -0.29215553, -0.24607454,  0.46377438,
        0.3324408 , -0.19971551,  0.17380495, -0.0120453 ,  0.10916305,
       -0.07739734,  0.24886501, -0.3419363 , -0.07640602,  0.10175157,
       -0.02358517, -0.20988277,  0.46855783, -0.09642693,  0.43309388,
        0.00420387,  0.13834734,  0.19163741,  0.13250473, -0.23497827,
       -0.36984587, -0.09696947,  0.5264313 ,  0.09964237, -0.03731539,
        0.0195171 ,  0.16780172,  0.10646118,  0.29476306,  0.09516475,
        0.01985354,  0.06143357, -0.22718485,  0.13759986, -0.04698601,
        0.17967995,  0.05498347, -0.1541502 , -0.34252053, -0.10395689,
        0.15924099,  0.03287253, -0.26859742,  0.25759804, -0.26

In [55]:
a = vaccins['mot_cles'][0]+vaccins['hashtags'][0]

['vaccins',
 'DictatureSanitaire',
 'DictatureEnMarche',
 'genocide',
 'Député',
 'roumain',
 'Parlement',
 'européen',
 'expose',
 'contrat',
 'secret',
 'Big',
 'Pharma',
 'Quand',
 'gouvernement',
 'sait',
 'tout',
 'tyrannie',
 'quand',
 'savez',
 'tout',
 'gouvernement',
 'démocratie',
 '2',
 'vaccins',
 'DictatureSanitaire',
 'DictatureEnMarche',
 'genocide']

In [81]:
valeurs_vectorisation_mots=[]
valeurs_vectorisation_hashtags=[]
valeurs_vectorisation=[]
for _, ligne in vaccins.iterrows():
    valeurs_vectorisation.append(featureVecMethod(ligne.mot_cles+ligne.hashtags, wv).tolist())

['DictatureSanitaire', 'DictatureEnMarche', 'Député', 'Parlement', 'Big', 'Pharma', 'Quand', '2', 'DictatureSanitaire', 'DictatureEnMarche']
['Démonstration', 'Dr', 'Helene', 'Rossinot', 'Masque', 'gestesbarrieres', 'distanciationsociale', 'Masque', 'gestesbarrieres', 'distanciationsociale']
['\u2066', 'NetflixFR', '\u2069', 'CobraKai', 'Macron', 'Véran', 'VaccinObligatoire', '’', 'Vaccins', 'NonAuPasseVaccinal', '3', 'CobraKai', 'Véran)', 'Vaccins).', 'NonAuPasseVaccinal']
['voeuxmacron', 'Sans', '’', 'Europe', '’', 'covid19', 'NON', '’', '’', 'Europe', 'Mr', '’', 'voeux2022', 'presidentielle2022', 'cnews', 'OUI', '0', '04', '14', '1', 'voeuxmacron', 'covid19', 'voeux2022', 'presidentielle2022', 'cnews']
['Pfizer', 'Source', 'CETATE', '2', '2', '2', 'Pfizer,', 'moderna...']
['Annee2022', 'gouvernementFR', 'Macron', 'Castex', 'Blanquer', 'TV', 'État', 'CrimesAgainstHumanity', 'GIF', 'ALT', '1', '2', 'Annee2022', 'gouvernementFR', 'Macron', 'Castex', 'Blanquer', 'CrimesAgainstHumanity']

In [83]:
vaccins_traite = vaccins
vaccins_traite = vaccins_traite.assign(valeurs_vectorisation=valeurs_vectorisation)

vaccins_traite.to_csv("vaccinsTraitesFull.csv")

In [86]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType, TimestampType, FloatType

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("UserScreenName", StringType(), True),
    StructField("UserName", StringType(), True),
    StructField("Timestamp", TimestampType(), True),
    StructField("Text", StringType(), True),
    StructField("Embedded_text", StringType(), True),
    StructField("Emojis", StringType(), True),
    StructField("Comments", StringType(), True),
    StructField("Image link", ArrayType(StringType()), True),
    StructField("Tweet URL", StringType(), True),
    StructField("hashtags", ArrayType(StringType()), True),
    StructField("mot_cles", ArrayType(StringType()), True),
    StructField("valeurs_vectorisation", ArrayType(FloatType()), True)])

vaccins_spark = spark.read.csv('vaccinsTraitesFull.csv', inferSchema=True)
vaccins_spark.printSchema()
vaccins_spark.show()

# On ne peut pas utiliser PySpark a cause de problemes de compatibilité avec les array 

AttributeError: 'DataFrame' object has no attribute 'read'

In [94]:
vaccins_traite[['mot_cles','valeurs_vectorisation']][0:10]

Unnamed: 0,mot_cles,valeurs_vectorisation
0,"[vaccins, DictatureSanitaire, DictatureEnMarch...","[-0.017874334007501602, -0.011175374500453472,..."
1,"[ère, bonne, résolution, cette, nouvelle, anné...","[0.08690714836120605, 0.22452007234096527, 0.3..."
2,"[pense, ⁦, NetflixFR, ⁩, CobraKai, raison, Mac...","[-0.20400500297546387, 0.18714751303195953, -0..."
3,"[voeuxmacron, fake, fake, fake, news, Sans, ’,...","[-0.08908065408468246, -0.23232945799827576, 0..."
4,"[petit, cadeau, tout, ceux, croient, vaccins, ...","[-0.033415451645851135, -0.2506740391254425, 0..."
5,"[Annee2022, année, procès, gouvernementFR, Mac...","[0.006658188067376614, 0.4478951096534729, 0.0..."
6,"[Audition, Sénat, RatignierCarbonneil, directr...","[0.23884639143943787, 0.046073608100414276, 0...."
7,"[wargonm, quelle, après, cette, supposée, atti...","[-0.14385326206684113, 0.10333363711833954, 0...."
8,"[ceux, détestent, NonVaccinés, disent, PasseVa...","[0.14957231283187866, 0.07924788445234299, -0...."
9,"[COVID19, corruption, arguments, favorables, v...","[-0.036533914506435394, -0.015814892947673798,..."


ERROR! Session/line number was not unique in database. History logging moved to new session 12


In [None]:
# On utilise une autre librairie pour clusteriser avec KMEANS
from sklearn.cluster import KMeans

km = pd.DataFrame()
#les clusters
NUM_CLUSTERS=5
kmeans = KMeans(NUM_CLUSTERS, random_state=0)
km['cluster']=kmeans.fit_predict(vaccins_traite[['Unnamed: 0','valeurs_vectorisation']])

#les centroides
centroids = kmeans.cluster_centers_
cen_x = [i[0] for i in centroids] 
cen_y = [i[1] for i in centroids]
km['cen_x'] = df.cluster.map({0:cen_x[0], 1:cen_x[1], 2:cen_x[2], 3:cen_x[3], 4:cen_x[4]})
km['cen_y'] = df.cluster.map({0:cen_y[0], 1:cen_y[1], 2:cen_y[2], 3:cen_y[3], 4:cen_y[4]})
# define and map colors
colors = ['#DF2020', '#81DF20', '#2095DF', '#B0DB43', '#FFE4B5']
km['c'] = df.cluster.map({0:colors[0], 1:colors[1], 2:colors[2], 3:colors[3], 4:colors[4]})

In [None]:
import matplotlib.pyplot as plt
plt.scatter(vaccins_traite.index, vaccins_traite.valeurs_vectorisation, c=km.c, alpha = 0.6, s=10)