# Articulo's Family Classifier


In [34]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Load Data

In [35]:
data_version = '2024-05-13'
data_path = f'../DATA/processed/{data_version}/text_to_analyse.csv'
text_to_analyse = pd.read_csv(data_path, sep='¬', encoding='utf-8-sig')

  text_to_analyse = pd.read_csv(data_path, sep='¬', encoding='utf-8-sig')


### Data Preprocessing and Model Training

In [5]:
import spacy
import pandas as pd
import multiprocessing
nlp = spacy.load('es_core_news_sm')
# nlp = spacy.load('es_core_news_md')
# nlp = spacy.load('es_core_news_sm')
import gensim.models.doc2vec
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
def preprocess_text(docs):
    texts = [doc for doc in nlp.pipe(docs, disable=["ner", "parser"])] 
    processed_texts = []
    for doc in texts:
        tokens = [token.text.lower() for token in doc if not token.is_punct and not token.is_stop and not token.is_space]
        processed_texts.append(' '.join(tokens))
    return processed_texts

In [28]:
corpus = pd.read_csv('../DATA/processed/2024-05-13/corpus.csv')
corpus['processed_text'] = preprocess_text(corpus['text_to_analyse'])
corpus.head()

Unnamed: 0,text_to_analyse,processed_text
0,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...,indican exclusivamente estropeado iban instala...
1,NO FUNCIONA NO FUNCIONA AZC3TACTOCSB,funciona funciona azc3tactocsb
2,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...,azatactorsb quedado bloqueado permite encender...
3,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...,pulsadores subir persianas bajar comprobado en...
4,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR Rafa...",rejilla motorizada defectuosa sustituir rafael...


In [30]:
common_kwargs = dict(
    vector_size=200, 
    epochs=20, 
    min_count=2,
    sample=0, 
    workers=multiprocessing.cpu_count(), 
    negative=5, 
    hs=0,
    seed=0
)

class CommentedDoc2Vec(Doc2Vec):
    def __init__(self, comment="", **kwargs):
        super().__init__(**kwargs)
        self.comment = comment
        
# PV-DBOW plain
model = CommentedDoc2Vec(
    dm=0, 
    comment=f"PV-DBOW-"
            f"v_size {common_kwargs["vector_size"]}-"
            f"epochs {common_kwargs['epochs']}-"
            f"hs {common_kwargs['hs']}-"
            f"sample {common_kwargs['sample']}-"
            f"negative {common_kwargs['negative']}-"
            f"min_count {common_kwargs['min_count']}",
    **common_kwargs
)

# Create TaggedDocument objects
tagged_data = [TaggedDocument(words=doc.split(), tags=[i]) for i, doc in enumerate(corpus['processed_text'])]

model.build_vocab(tagged_data)
print("Model: %s : vocabulary scanned & state initialized" % model.comment)

Model: PV-DBOW-v_size 200-epochs 20-hs 0-sample 0-negative 5-min_count 2 : vocabulary scanned & state initialized


In [31]:
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
print("%s training completed" % model.comment)

PV-DBOW-v_size 200-epochs 20-hs 0-sample 0-negative 5-min_count 2 training completed


In [32]:
from datetime import date
import os

# Save the model
today_date = date.today().isoformat()
base_path = f"../MODELS/{today_date}"
os.makedirs(base_path, exist_ok=True)
model_name = f"{base_path}/{type(model).__name__}_{model.comment.replace(' ', '_')}.model"
model.save(model_name)
print(f"Model saved at {model_name}")

Model saved at ../MODELS/2024-05-13/CommentedDoc2Vec_PV-DBOW-v_size_200-epochs_20-hs_0-sample_0-negative_5-min_count_2.model


In [33]:
# Load the model
model_name = "../MODELS/2024-05-13/CommentedDoc2Vec_PV-DBOW-v_size_200-epochs_20-hs_0-sample_0-negative_5-min_count_2.model"
model = CommentedDoc2Vec.load(model_name)
print(f'Model {model} loaded')

Model CommentedDoc2Vec<"PV-DBOW-v_size 200-epochs 20-hs 0-sample 0-negative 5-min_count 2",dbow,d200,n5,mc2,t8> loaded


### Infer Vectors for all texts

In [36]:
text_to_analyse['processed_text'] = preprocess_text(text_to_analyse['text_to_analyse'])

In [46]:
text_to_analyse['vector'] = text_to_analyse['processed_text'].apply(lambda x: model.infer_vector(x.split()))
text_to_analyse.head()

Unnamed: 0,codigo,id_pieza,desc_problema_translated,descripcion_translated,problema_translated,cod_articulo,text_to_analyse,CODART_A3,CODART,DESCART,CAR1,CAR2,CAR3,CAR4,DESCCAR1,DESCCAR2,DESCCAR3,DESCCAR4,processed_text,vector
0,YZ2YZZUU16,70.0,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...,MODULO ALARMAS TECNICAS,INDICAN SE HA ESTROPEADO EN LA INSTALACION Y ...,MATS,INDICAN EXCLUSIVAMENTE QUE SE HA ESTROPEADO LA...,MATS,MATS,Central de alarmas técnicas multifunción,3.0,265.0,94.0,,AT HOME,AT HOME,MODULOS DE CONTROL,,indican exclusivamente estropeado iban instala...,"[0.12356957, 0.2817981, -0.21627194, 0.3047181..."
1,ZP2CZZYVBD,71.0,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...,,TACTO BLOQUEADO. NO ENCIENDE-APAGA NI DEJA CAM...,AZATACTORSB,AZATACTORSB SE HA QUEDADO BLOQUEADO. NO PERMIT...,AZATACTORSB,AZATACTORSB,Termostato Tacto superficie radio (AZA) - Blanco,1.0,251.0,91.0,4.0,SISTEMAS DE ZONAS,ACUAZONE (DI6),TERMOSTATOS,TACTO,azatactorsb quedado bloqueado permite encender...,"[-0.13428089, 0.25504455, -0.405886, 0.1774784..."
2,YPUEA5WZ10,67.0,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...,MODELO 2 PERSIANAS,UNA DE LAS 2 PERSIANAS NO FUNCIONA CON PULSADO...,PER2,DESDE PULSADORES NO SE PUEDE SUBIR LA PERSIANA...,PER2,PER2,Módulo de control de 2 persianas con pulsador,3.0,265.0,94.0,,AT HOME,AT HOME,MODULOS DE CONTROL,,pulsadores subir persianas bajar comprobado en...,"[0.027900798, 0.06624766, -0.036185164, 0.0084..."
3,ZPWBA5ETF7,72.0,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR",,Rafael el viernes llego mi pedido nº 23349 pa...,RINT040015BKMTE,"REJILLA MOTORIZADA DEFECTUOSA, SUSTITUIR Rafa...",RINT040015BKMTE,RINT040015BKMTE,Rejilla Inteligente doble Airzone motorizada 4...,1.0,264.0,31.0,92.0,SISTEMAS DE ZONAS,DIFUSION MOTORIZADA,REJILLAS,RINT,rejilla motorizada defectuosa sustituir rafael...,"[-0.19870485, -0.015116236, -0.035871435, 0.01..."
4,ZP2CAPUAA9,74.0,MODULO DE FANCOIL DA PROBLEMAS LAS COMUNICACI...,AZAMFANCOILC,MODULO DE FANCOIL DE ZONA 32Z,AZAMFANCOILC,MODULO DE FANCOIL DA PROBLEMAS LAS COMUNICACI...,AZAMFANCOILC,AZAMFANCOILC,Módulo de zona fancoil cableado Airzone (AZA),1.0,251.0,95.0,,SISTEMAS DE ZONAS,ACUAZONE (DI6),MODULOS DE ZONA,,modulo fancoil problemas comunicaciones instal...,"[-0.11851523, 0.17887713, -0.23689334, 0.15147..."


In [47]:
text_to_analyse.sample(20)[['text_to_analyse', 'processed_text', 'vector']]

Unnamed: 0,text_to_analyse,processed_text,vector
29882,MOTOR COMPUERTA CIRCULAR REG. CAUDAL IONIZACI...,motor compuerta circular reg caudal ionizacion...,"[-0.047765613, -0.051677246, 0.091167524, 0.00..."
17785,3 TERMOSTATOS BLUEFACE AZDI6BLUEFACECB QUEDAN ...,3 termostatos blueface azdi6bluefacecb quedan ...,"[-0.27117088, 0.038511176, -0.090029016, 0.000..."
15688,Se queda la pantalla en negro. Termostato inte...,queda pantalla negro termostato inteligente bl...,"[-0.18633457, 0.025378177, -0.15584484, 0.0938..."
2750,línea directa de expedientes 14246 carta línea...,línea directa expedientes 14246 carta línea di...,"[-0.10982038, 0.057893682, -0.11215552, 0.2319..."
1993,Rejilla motorizada no abre rejilla triple moto...,rejilla motorizada abre rejilla triple motoriz...,"[-0.090225615, -0.018484138, -0.042217854, 0.0..."
23461,STech254930\r\nBOLETO N° 57829 Antena de radio...,stech254930 boleto n ° 57829 antena radio repu...,"[-0.1481104, 0.014739713, -0.16474682, -0.0703..."
5682,Ha dejado de funcionar MODULO DE 2 PERSIANAS C...,dejado funcionar modulo 2 persianas pulsador d...,"[0.015113751, 0.21781668, -0.39325976, -0.0775..."
30201,GARANTIA REJILLA RINT030015BKMRE S/N: 000D2KTX...,garantia rejilla rint030015bkmre s n 000d2ktx ...,"[-0.015580032, -0.23742937, -0.048506454, 0.09..."
13990,"DOS BLUEFACE FLEXA3.0, UNO ESTÁ CON LA PANTALL...",blueface flexa3.0 pantalla blanco responde tac...,"[-0.45100603, 0.43577302, 0.011986234, 0.12945..."
19908,LA PASARELA DE COMUNICACIÓN DE MITSUBITSHI ELE...,pasarela comunicación mitsubitshi electric com...,"[0.029071387, 0.04472198, -0.13551548, 0.17923..."
