# Imports

In [1]:
# OS & env
import yaml
import os
import dill as pickle
import logging

# ML
from gensim.models import Word2Vec
import nltk

# home made functions
from src.scrap_and_clean import preprocess_doc
from src.models import w2v_vect_data
from src.models import lr_predict_tags


# Logging configuration: the root logger threshold is set to INFO, so INFO,
# WARNING and higher records are shown (DEBUG records are still filtered out).
logger = logging.getLogger()  # no name given -> root logger
logger.setLevel("INFO")

# Load data and models

In [2]:
# 🚧 TODO: move all of this to the application start-up.
# NLTK downloads. nltk.download() reports a failure through its boolean return
# value rather than by raising, so the results are checked instead of discarded.
nltk_download_ok = {
    resource: nltk.download(resource, quiet=True)
    for resource in ("punkt", "wordnet")
}
for resource, succeeded in nltk_download_ok.items():
    if not succeeded:
        logging.warning("⚠️ NLTK resource '%s' could not be downloaded ⚠️", resource)

# Cell output: True only when every download call reported success.
all(nltk_download_ok.values())

True

In [3]:
VECTORIZER_URI = "models/w2v_cbow_vectorizer"
CLASSIFIER_URI = "models/w2v_cbow_lrovr_classifier.pkl"
KEEP_SET_URI = "data/keep_set.pkl"
EXCLUDE_SET_URI = "data/exclude_set.pkl"


def load_artifact(uri, label, loader=None):
    """Load one saved artifact from disk, logging each step.

    Args:
        uri: Path of the file to load.
        label: Lower-case human-readable name used in the log messages
            (e.g. "keep set").
        loader: Optional callable taking ``uri`` and returning the loaded
            object. When omitted, the file is opened in binary mode and
            unpickled.

    Returns:
        The loaded object, or ``None`` when ``uri`` does not exist. Returning
        ``None`` (after a warning) keeps the name bound, so a re-run on a live
        kernel cannot silently keep a stale object from a previous execution.
    """
    logging.info(f"⚙️ Loading {label}...")
    if not os.path.exists(uri):
        logging.warning(f"⚠️ No {label} found ⚠️")
        return None

    if loader is None:
        # SECURITY: unpickling can execute arbitrary code; only load files
        # produced by this project, never files from an untrusted source.
        with open(uri, "rb") as f:
            artifact = pickle.load(f)
    else:
        artifact = loader(uri)

    logging.info(f"✅ {label.capitalize()} loaded")
    return artifact


# Vectorizer (gensim Word2Vec model, loaded through gensim's own loader).
vectorizer = load_artifact(
    VECTORIZER_URI, "vectorizer", loader=lambda uri: Word2Vec.load(uri)
)

# Classifier (pickled).
classifier = load_artifact(CLASSIFIER_URI, "classifier")

# Keep set and exclude set (both passed to preprocess_doc).
keep_set = load_artifact(KEEP_SET_URI, "keep set")
exclude_set = load_artifact(EXCLUDE_SET_URI, "exclude set")

INFO:root:⚙️ Loading vectorizer...
INFO:gensim.utils:loading Word2Vec object from models/w2v_cbow_vectorizer
INFO:gensim.utils:loading wv recursively from models/w2v_cbow_vectorizer.wv.* with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:Word2Vec lifecycle event {'fname': 'models/w2v_cbow_vectorizer', 'datetime': '2024-06-08T00:50:19.488271', 'gensim': '4.3.2', 'python': '3.11.6 (main, Mar 19 2024, 19:27:13) [GCC 11.4.0]', 'platform': 'Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'loaded'}
INFO:root:✅ Vectorizer loaded
INFO:root:⚙️ Loading classifier...
INFO:root:✅ Classifier loaded
INFO:root:⚙️ Loading keep set...
INFO:root:✅ Keep set loaded
INFO:root:⚙️ Loading exclude set...
INFO:root:✅ Exclude set loaded


# User input

In [4]:
# Sample question used to exercise the pipeline: a title plus a multi-line body.
usr_input_title = "python numpy issue: how to append arrays"

# The body starts and ends with a newline, hence the empty first and last items.
usr_input_body = "\n".join(
    [
        "",
        "hello everybody,",
        "i'm new here and i have a beginner problem, more precisely a python numpy problem:",
        "how can i append 2 <bold> <python> np.arrays?",
        "",
    ]
)

In [5]:
# Title and body are merged into one document, separated by a single space.
usr_input = " ".join((usr_input_title, usr_input_body))

# 🚧 TODO: MAKE FUNCTION TO CHECK INPUT FIRST (data must be at least 2 words long, not punctuation:
# specify "too many frequent words" or "HTML tags removed" or "model trained on English"...

# 🚧 Sketch kept from the notebook author (not executed here):
# def preprocess_doc(document, keep_set, exclude_set) -> str:
#     🚧 packages used -> re, nltk
#     🚧 merge the functions into a single one
#     🚧 include keep_set and exclude_set in function
#     doc_clean = clean_string(document)
#     doc_tokens = tokenize_str(doc_clean, keep_set, exclude_set)
#     doc_lemmed = lemmatize_tokens(doc_tokens, keep_set, exclude_set)
#     doc_tk_clean = clean_tokens(doc_lemmed, keep_set, exclude_set)
#     doc_preprocessed = " ".join(doc_tk_clean)

#     return doc_preprocessed

# Run the project preprocessing on the merged document.
input_clean = preprocess_doc(usr_input, keep_set, exclude_set)

# 🚧 TODO: remove the outputs:
# import nltk
# nltk.download('punkt', quiet=True)
# nltk.download('wordnet', quiet=True)
# - run these downloads as soon as the app starts (not inside the functions)
# - remove them from the functions

# Show the raw document next to its preprocessed version.
print("user input:", usr_input)
print("\nclean input:", input_clean)

[nltk_data] Downloading package punkt to /home/jl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jl/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


user input: python numpy issue: how to append arrays 
hello everybody,
i'm new here and i have a beginner problem, more precisely a python numpy problem:
how can i append 2 <bold> <python> np.arrays?


clean input: python numpy append arrays hello everybody beginner precisely python numpy append np.arrays


In [6]:
# Vectorize the cleaned text: it is split on single spaces and passed to the
# project helper as a one-document batch.
X_vect = w2v_vect_data(
    vectorizer,
    [input_clean.split(" ")],
)

# Show the shape of the result, then its first row, one per line.
print(X_vect.shape, X_vect[0], sep="\n")

(1, 50)
[ 0.59756595  0.03122842 -0.38710928 -1.0459322  -1.8094747  -3.3661306
 -1.1318852   1.7912159  -2.521063   -1.5359498   1.3933053  -1.0525324
  0.6660066   0.7565771   0.5406193   0.7922543  -0.01575549 -0.7342564
  1.2939134   0.05723165  2.2466464   0.11878499  0.9928756  -1.2111318
  1.11469     0.35285315  0.8928529   0.09151103 -0.41830072  1.1391151
 -1.6673391  -1.8090811  -0.5607196   0.42129993 -0.5243376  -1.0007704
 -0.58788896 -0.4786713   0.28627938 -0.7510565  -0.12019976  1.2245889
 -0.4200853   0.67028254 -1.1711799  -1.0783579   0.752281   -0.79137856
 -0.30207184  1.83658   ]


In [7]:
# Class probabilities from the classifier (not used further in this cell).
predicted_probas = classifier.predict_proba(X_vect)

# Tags returned by the project helper, shown as one space-separated string.
lr_preds = lr_predict_tags(classifier, X_vect)
predictions = " ".join(lr_preds)
print("Predictions:", predictions)

Predictions: pandas dataframe arrays python numpy
