In [138]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, col, concat_ws, udf, lower, regexp_replace
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml import Pipeline
from nltk.corpus import stopwords
from pyspark.sql.types import StringType,StructType, ArrayType
from nltk.stem.snowball import SnowballStemmer

In [44]:
spark = SparkSession.builder \
                    .appName("Model_studio") \
                    .getOrCreate()

In [175]:
df = spark.read.csv("./courses.csv", header=True)

In [176]:
df.show(5)

+------------+-------------------+----------------------+--------------------+--------------------+
|ID_FORMATION| DOMAINE_CATGEGORIE|SOUS_DOMAINE_CATEGORIE|     FORMATION_COURS|            KEYWORDS|
+------------+-------------------+----------------------+--------------------+--------------------+
|         230|Commercial - Ventes|  Vente et négociation|L’expérience clie...|Fidélisation clie...|
|         231|Commercial - Ventes|  Vente et négociation|Réussir sa relati...|Relation client, ...|
|         232|Commercial - Ventes|  Vente et négociation|L'essentiel de la...|Négociation comme...|
|         234|Commercial - Ventes|  Vente et négociation|Négociation comme...|Négociation, vent...|
|         238|Commercial - Ventes|  Vente et négociation|Répondre à un app...|Négociation comme...|
+------------+-------------------+----------------------+--------------------+--------------------+
only showing top 5 rows



In [177]:
# concatenation !
columns = df.columns
df = df.withColumn("Full_Description", concat_ws(" " ,*[col(c) for c in df.columns if c != "ID_FORMATION"]))

In [178]:
df.select(df.Full_Description).show(2,truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Full_Description                                                                                                                                                                                                                                                                                                                      |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Commercial -

In [179]:
df = df.withColumn("Full_Description", lower(df["Full_Description"]))

In [180]:
df.select(df.ID_FORMATION ,df.Full_Description).show(2,truncate=False)

+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ID_FORMATION|Full_Description                                                                                                                                                                                                                                                                                                                      |
+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [181]:
df = df.withColumn(
    "Full_Description",
    regexp_replace(
        regexp_replace(df["Full_Description"], "[.,:-]", ""),  # Supprimer les virgules, points, deux-points
        "\\s+", " "  # Remplacer les espaces multiples par un seul espace
    )
)

In [182]:
df.select(df.ID_FORMATION ,df.Full_Description).show(2,truncate=False)

+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ID_FORMATION|Full_Description                                                                                                                                                                                                                                                                                              |
+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|230         |commercial ventes vente et négoc

In [183]:

french_stopwords_nltk = stopwords.words('french')

In [184]:
tokenizer = Tokenizer(inputCol="Full_Description", outputCol="Tokenized_Full_Description")
stop_words_remover = StopWordsRemover(inputCol="Tokenized_Full_Description", outputCol="filtered_words", stopWords=french_stopwords_nltk)

In [185]:
pipeline = Pipeline(stages=[tokenizer, stop_words_remover])
model = pipeline.fit(df)
result = model.transform(df)

In [186]:
# racine de mots : stemming ! 
def stem_word(words):
    stemmer = SnowballStemmer("french")
    if isinstance(words, list):  # Si c'est une liste
        return [stemmer.stem(word) for word in words]  # Appliquer le stemming sur chaque mot
    return stemmer.stem(words) if words else None  # Si ce n'est pas une liste, appliquer sur un seul mot
stem_udf = udf(stem_word, ArrayType(elementType=StringType()))

In [187]:
result = result.withColumn("stemmed_words", stem_udf(result["filtered_words"]))

##NBR OF WORDS CHOICE

In [188]:
from pyspark.sql.functions import explode, split

# Diviser les mots et compter les termes uniques
word_count = result.select(explode(result["stemmed_words"]).alias("word")) \
               .groupBy("word") \
               .count() \
               .orderBy("count", ascending=False)

# Nombre de termes uniques
num_unique_terms = word_count.count()
print("Nombre de termes uniques :", num_unique_terms)

[Stage 81:>                                                         (0 + 1) / 1]

Nombre de termes uniques : 3527


                                                                                

In [189]:
word_count.show(20)

[Stage 87:>                                                         (0 + 1) / 1]

+-------------+-----+
|         word|count|
+-------------+-----+
|      gestion| 2162|
|       commun| 1463|
|    marketing| 1324|
|      digital| 1261|
|        manag| 1208|
|       projet|  987|
|      maîtris|  969|
|      travail|  935|
|professionnel|  848|
|     développ|  835|
|     collabor|  771|
|       public|  761|
|       format|  728|
|        outil|  728|
|       social|  699|
|        googl|  681|
|    personnel|  622|
|      publiqu|  612|
|      perform|  575|
|      efficac|  573|
+-------------+-----+
only showing top 20 rows



                                                                                

In [190]:
#applying the tfidf
hashing_tf = HashingTF(inputCol="stemmed_words", outputCol="raw_features", numFeatures=1600)
idf = IDF(inputCol="raw_features", outputCol="features")

In [191]:
pipeline = Pipeline(stages=[hashing_tf, idf])
model = pipeline.fit(result)
result = model.transform(result)

                                                                                

In [206]:
result.select(result.ID_FORMATION, result.features).sort("ID_FORMATION").show(10, truncate=False)

[Stage 108:>                                                        (0 + 1) / 1]

+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ID_FORMATION|features                                                                                                                                                                                                                                                                                                                                                                      

                                                                                

['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions', 'fussiez', 'fussent', 'ayant', 'ayante', 'ayantes', 'ayants', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aur