# Notebook pour réaliser le scoring des commentaires

* __Description__: Notebook pour le scoring des commentaires avec l'algorithme jigsaw
* __Source__: Table comment (BigQuery) 
* __Output__: Table score_jigsaw (BigQuery)
* __Auteur__: Thomas GOBIN et Camille MATTHIEU
* __Date de création__: 15/09/2022
* __Date de mise à jour__: 15/09/2022

## Import des outils

In [1]:
%run /notebook/Libs/scoring_lib.ipynb
%run /notebook/Libs/requete_lib.ipynb

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Déclaration des variables

In [2]:
# Locations of the persisted artifacts read by scoring() through
# model_exist() / load_model(). There is one model and one "vec" artifact
# per score: a global score plus six categories (toxic, severe_toxic,
# obscene, threat, insult, identity_hate).

# Persisted models, one per score.
path_model_global = "/model_ml/model_global"
path_model_toxic = "/model_ml/model_toxic"
path_model_severe_toxic = "/model_ml/model_severe_toxic"
path_model_obscene = "/model_ml/model_obscene"
path_model_threat = "/model_ml/model_threat"
path_model_insult = "/model_ml/model_insult"
path_model_identity_hate = "/model_ml/model_identity_hate"
# "vec" artifacts passed to apply_model() together with the matching model
# (presumably the fitted text vectorizers -- TODO confirm in scoring_lib).
path_vec_global = "/model_ml/vec_global"
path_vec_toxic = "/model_ml/vec_toxic"
path_vec_severe_toxic = "/model_ml/vec_severe_toxic"
path_vec_obscene = "/model_ml/vec_obscene"
path_vec_threat = "/model_ml/vec_threat"
path_vec_insult = "/model_ml/vec_insult"
path_vec_identity_hate = "/model_ml/vec_identity_hate"

## Scoring

Cette fonction calcule les scores (global et par catégorie) des contenus Reddit.

In [3]:
def _score_with_model(df, label, path_model, path_vec, table):
    """Apply one persisted model to ``df`` and return the scored frame.

    If no model is found at ``path_model``, ``create_model()`` is called
    first. The vectorizer is loaded only *after* that step, so it is never
    read before the training run had a chance to (re)write it.

    Args:
        df: Frame holding the contents to score.
        label: Human-readable model name, used only for the progress message.
        path_model: Location of the persisted model.
        path_vec: Location of the persisted vectorizer paired with the model.
        table: Target table name forwarded unchanged to ``apply_model``.

    Returns:
        Whatever ``apply_model`` returns (used by the caller as a DataFrame
        carrying a ``score_jigsaw`` column).
    """
    if model_exist(path_model):
        print(f"Model {label}")
    else:
        # No persisted model yet: train before loading anything.
        # NOTE(review): presumably create_model() trains every model and
        # vectorizer at once -- confirm in scoring_lib.
        create_model()

    vectorizer = load_model(path_vec)
    model = load_model(path_model)
    return apply_model(df, model, vectorizer, table)


def scoring():
    """Compute the global and per-category scores and store them in BigQuery.

    Steps:
        1. Read the contents whose ``extraction_utc`` equals the global
           ``date_extraction`` from the ``comment`` table.
        2. Apply each of the seven models (global, toxic, severe toxic,
           obscene, threat, insult, identity hate) to that frame.
        3. Rename each ``score_jigsaw`` column to ``score_<category>`` and
           join all results on the frame index.
        4. Print the resulting column names and insert the merged frame
           into ``dwh.score_jigsaw``.

    Returns:
        None. The result is written to BigQuery.
    """
    # Fetch the contents to score.
    # NOTE(review): the value is interpolated directly into the SQL text;
    # date_extraction is expected to be a trusted global defined outside this
    # cell. The table id starts with a dot, which looks like a missing project
    # id -- TODO confirm.
    sql = f"SELECT id_comment, id_post, content, type_content  FROM `.dwh.comment` WHERE extraction_utc = '{date_extraction}';"
    df = client.query(sql).to_dataframe()

    table = "dwh.score_jigsaw"

    # (progress label, output score column, model path, vectorizer path).
    # Built at call time so the current values of the path globals are used.
    models = (
        ("global", "score_global", path_model_global, path_vec_global),
        ("toxic", "score_toxic", path_model_toxic, path_vec_toxic),
        ("severe toxic", "score_severe_toxic", path_model_severe_toxic, path_vec_severe_toxic),
        ("obscene", "score_obscene", path_model_obscene, path_vec_obscene),
        ("threat", "score_threat", path_model_threat, path_vec_threat),
        ("insult", "score_insult", path_model_insult, path_vec_insult),
        ("identity hate", "score_identity_hate", path_model_identity_hate, path_vec_identity_hate),
    )

    # Columns already carried by the first (global) frame; they are dropped
    # from every other frame so the merge does not duplicate them.
    shared_columns = ["id_comment", "id_post", "type_content"]

    df_scores = None
    for label, score_column, path_model, path_vec in models:
        df_model = _score_with_model(df, label, path_model, path_vec, table)
        # Non-mutating rename/drop: the frame returned by apply_model is
        # left untouched.
        df_model = df_model.rename(columns={"score_jigsaw": score_column})

        if df_scores is None:
            # The first model (global) keeps the identifying columns.
            df_scores = df_model
        else:
            df_model = df_model.drop(columns=shared_columns)
            # Index-on-index inner join, same as the original merges.
            df_scores = df_scores.merge(df_model, left_index=True, right_index=True)

    print(df_scores.columns.values)

    # Insert the merged scores into BigQuery.
    insertion_bigquery("dwh", "score_jigsaw", df_scores)
    