# Imports

In [1]:
# OS & env
import os
import dill as pickle
import logging

# ML
from gensim.models import Word2Vec
import nltk

# home made functions
from src.scrap_and_clean import preprocess_doc
from src.models import w2v_vect_data
from src.models import lr_predict_tags


# Logging setup: put the root logger's threshold at INFO so that INFO-level
# messages (and anything more severe) are displayed; DEBUG messages stay hidden.
logger = logging.getLogger()
logger.setLevel("INFO")

# Load data and models

In [2]:
# NLTK resources to fetch: 'punkt' and 'wordnet' (presumably needed by the
# tokenizing / lemmatizing done in preprocess_doc — confirm against src.scrap_and_clean).
NLTK_RESOURCES = ("punkt", "wordnet")

# nltk.download returns False when a resource could not be fetched. Keep the
# status of every download (not only the last one) so that a failure is reported
# instead of being silently ignored.
nltk_download_status = {name: nltk.download(name, quiet=True) for name in NLTK_RESOURCES}
for name, ok in nltk_download_status.items():
    if not ok:
        logging.warning(f"⚠️ NLTK resource '{name}' could not be downloaded ⚠️")

# Last expression of the cell: True only when every resource is available.
all(nltk_download_status.values())

True

In [3]:
# Locations of the persisted artifacts, relative to the notebook's working directory.
VECTORIZER_URI = "models/w2v_cbow_vectorizer"
CLASSIFIER_URI = "models/w2v_cbow_lrovr_classifier.pkl"
KEEP_SET_URI = "data/keep_set.pkl"
EXCLUDE_SET_URI = "data/exclude_set.pkl"


def _read_word2vec(uri):
    """Load the gensim Word2Vec model saved at `uri`."""
    return Word2Vec.load(uri)


def _read_pickle(uri):
    """Deserialize the object stored in the pickle file at `uri`.

    `pickle` is whatever module the notebook imported under that name
    (dill in the import cell).

    NOTE(security): unpickling can execute arbitrary code; only load files
    that come from a trusted source.
    """
    with open(uri, "rb") as f:
        return pickle.load(f)


def _load_artifact(label, uri, reader):
    """Load one artifact from disk, logging progress along the way.

    Args:
        label: lower-case human-readable name used in the log messages
            (e.g. "keep set").
        uri: path of the file to load.
        reader: callable taking the path and returning the loaded object.

    Returns:
        The loaded object, or None when `uri` does not exist. Returning None
        (instead of leaving the variable unassigned) prevents a stale object
        from a previous run of the cell from being reused silently, and gives
        later cells a well-defined value to check.
    """
    logging.info(f"⚙️ Loading {label}...")
    if not os.path.exists(uri):
        logging.warning(f"⚠️ No {label} found ⚠️")
        return None
    artifact = reader(uri)
    logging.info(f"✅ {label.capitalize()} loaded")
    return artifact


# load vectorizer
vectorizer = _load_artifact("vectorizer", VECTORIZER_URI, _read_word2vec)

# load classifier
classifier = _load_artifact("classifier", CLASSIFIER_URI, _read_pickle)

# load keep set (for preprocessing)
keep_set = _load_artifact("keep set", KEEP_SET_URI, _read_pickle)

# load exclude set (for preprocessing)
exclude_set = _load_artifact("exclude set", EXCLUDE_SET_URI, _read_pickle)

INFO:root:⚙️ Loading vectorizer...
INFO:gensim.utils:loading Word2Vec object from models/w2v_cbow_vectorizer
INFO:gensim.utils:loading wv recursively from models/w2v_cbow_vectorizer.wv.* with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:Word2Vec lifecycle event {'fname': 'models/w2v_cbow_vectorizer', 'datetime': '2024-06-10T15:01:04.611649', 'gensim': '4.3.2', 'python': '3.11.6 (main, Mar 19 2024, 19:27:13) [GCC 11.4.0]', 'platform': 'Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'loaded'}
INFO:root:✅ Vectorizer loaded
INFO:root:⚙️ Loading classifier...
INFO:root:✅ Classifier loaded
INFO:root:⚙️ Loading keep set...
INFO:root:✅ Keep set loaded
INFO:root:⚙️ Loading exclude set...
INFO:root:✅ Exclude set loaded


# User input

In [4]:
# Sample question used to exercise the tagging pipeline: a title plus a
# four-line body (each body line ends with a newline).
usr_input_title = "pandas merge with Python >3.5"
usr_input_body = (
    "How can I perform a (INNER| (LEFT|RIGHT|FULL) OUTER) JOIN with pandas?\n"
    "How do I add NaNs for missing rows after a merge? How do I get rid of NaNs after merging?\n"
    "Can I merge on the index? How do I merge multiple DataFrames?\n"
    "I've seen these recurring questions asking about various facets of the pandas merge functionality, "
    "the aim here is to collate some of the more important points for posterity.\n"
)

In [5]:
# Merge title and body into one document, separated by a newline.
usr_input = f"{usr_input_title}\n{usr_input_body}"

# 🚧 TODO: write a function that checks the input first (the data must be at
# least 2 words long and not just punctuation), and tell the user why an input
# is rejected: "too many frequent words", "HTML tags removed", "model trained
# on English"...

# Sketch of the planned preprocess_doc refactoring, kept for reference:
# def preprocess_doc(document, keep_set, exclude_set) -> str:
#     🚧 packages used -> re, nltk
#     🚧 group the functions into a single one
#     🚧 include keep_set and exclude_set in function
#     doc_clean = clean_string(document)
#     doc_tokens = tokenize_str(doc_clean, keep_set, exclude_set)
#     doc_lemmed = lemmatize_tokens(doc_tokens, keep_set, exclude_set)
#     doc_tk_clean = clean_tokens(doc_lemmed, keep_set, exclude_set)
#     doc_preprocessed = " ".join(doc_tk_clean)

#     return doc_preprocessed

# Run the project's preprocessing on the raw text, using the keep / exclude
# sets loaded earlier in the notebook.
input_clean = preprocess_doc(usr_input, keep_set, exclude_set)

# 🚧 TODO: remove the outputs: strip them from the functions
# Show the raw and the preprocessed text one after the other for comparison.
print("user input:", usr_input)
print("\nclean input:", input_clean)

[nltk_data] Downloading package punkt to /home/jl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jl/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


user input: pandas merge with Python >3.5
How can I perform a (INNER| (LEFT|RIGHT|FULL) OUTER) JOIN with pandas?
How do I add NaNs for missing rows after a merge? How do I get rid of NaNs after merging?
Can I merge on the index? How do I merge multiple DataFrames?
I've seen these recurring questions asking about various facets of the pandas merge functionality, the aim here is to collate some of the more important points for posterity.


clean input: pandas merge python perform inner| left|right|full outer join pandas add nan missing row merge get rid nan merging merge index merge multiple dataframes seen recurring asking various facet pandas merge functionality aim collate important point posterity


In [6]:
# Split the preprocessed text on single spaces and vectorize it as a batch
# holding one document (the recorded output below shows a (1, 50) result).
input_tokens = input_clean.split(" ")
X_vect = w2v_vect_data(vectorizer, [input_tokens])

# Inspect the result: overall shape, then the vector of the single document.
print(X_vect.shape)
print(X_vect[0])

(1, 50)
[ 0.24701661  0.07568936  0.00817247  0.17902239 -0.3862364  -0.12290046
  0.77592105  0.67494017 -0.6486079  -0.18690905  0.6684604  -0.29240564
 -0.5170923  -0.07515918  0.29629079  0.3047732   0.06215124 -0.14672922
 -0.22475429  0.44185334 -0.10042778 -0.04612633  0.6718335  -0.8425775
  0.62273175  0.4572078   0.6682157   0.0265869  -0.27166218  0.36964867
  0.4941888  -0.20693408 -0.4337857  -0.53141826 -0.24924181 -0.311
 -0.30004004  0.15352508  0.3684115  -0.4648135   0.26534975  0.7629548
 -0.5578143   0.15624113 -0.13686384  0.02493998 -0.06367211 -0.18456039
 -0.56151146  0.88944703]


In [7]:
# Alternative kept for reference (not executed):
# predicted_probas = classifier.predict_proba(X_vect)

# Predict the tags for the vectorized input and display them space-separated.
lr_preds = lr_predict_tags(classifier, X_vect)
predictions = " ".join(lr_preds)
print(f"Predictions: {predictions}")

Predictions: pandas python dataframe github git
