In [1]:
import pandas as pd
from sqlalchemy import create_engine
from datetime import date
import os
import spacy
nlp = spacy.load('es_core_news_sm')
from bs4 import BeautifulSoup
import PyPDF2
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

# Folder holding the training inputs read later on (FAQ.csv, catalogo.pdf).
# Built with os.path.join so the separators match the running OS; on Windows
# this yields exactly the same string as the previous backslash literal.
TRAINING_DATA_PATH = os.path.join('..', '..', 'Datos - Myzone', 'TrainningData')

# Incidencias MyZone

In [4]:
def query_data(query):
    """
    Run a SQL query against the MyZone MySQL database using sqlalchemy.

    Connection parameters:
    user = readmyzone
    password = (read from the environment variable MYSQL_PASSWORD)
    host = 192.168.2.7
    port = 3306
    db = myzone

    :param query: SQL statement handed to ``pd.read_sql``
    :return: pd.DataFrame with the query result, or None when the password is
             not configured or the query fails (the error is printed)
    """
    # Function-scope import so the cell does not depend on another cell's imports.
    from urllib.parse import quote_plus

    user = 'readmyzone'
    password = os.environ.get('MYSQL_PASSWORD')
    host = '192.168.2.7'
    port = '3306'
    db = 'myzone'

    # Without this guard the literal text "None" would be sent as the password.
    if password is None:
        print('MYSQL_PASSWORD environment variable is not set')
        return None

    # Escape the password: characters such as '@', ':' or '/' would otherwise
    # be parsed as URL delimiters and corrupt the connection string.
    connection_string = f'mysql+pymysql://{user}:{quote_plus(password)}@{host}:{port}/{db}'

    # Create the engine
    engine = create_engine(connection_string)

    try:
        # Query the data
        data = pd.read_sql(query, engine)
    except Exception as e:
        # Best effort: report the problem and let the caller handle the None.
        print(e)
        data = None
    finally:
        # Release pooled connections; a new engine is created on every call.
        engine.dispose()

    return data

In [5]:
# Load the four MyZone tables used to build the incident dataset.
sav_incidencias = query_data('SELECT * FROM sav_incidencias')
sav_piezas = query_data('SELECT * FROM sav_piezas')
sav_estados = query_data('SELECT * FROM sav_estados')
sav_incidencias_tipo = query_data('SELECT * FROM sav_incidencias_tipo')

# query_data() prints the error and returns None on failure. Stop here with a
# clear message instead of failing later with an AttributeError on None.
_failed_tables = [
    table_name
    for table_name, frame in (
        ('sav_incidencias', sav_incidencias),
        ('sav_piezas', sav_piezas),
        ('sav_estados', sav_estados),
        ('sav_incidencias_tipo', sav_incidencias_tipo),
    )
    if frame is None
]
if _failed_tables:
    raise RuntimeError(f"Could not load table(s): {', '.join(_failed_tables)}")

In [6]:
# Left-join the incidents with their parts, their state and their type.
# Columns that clash with an earlier table get the suffix of the joined table.
dataset = (
    sav_incidencias
    .merge(
        sav_piezas,
        how='left',
        left_on='codigo',
        right_on='codigo_incidencia',
        suffixes=(None, '_pieza'),
    )
    .merge(
        sav_estados,
        how='left',
        left_on='estado',
        right_on='id',
        suffixes=(None, '_estado'),
    )
    .merge(
        sav_incidencias_tipo,
        how='left',
        left_on='tipo',
        right_on='id',
        suffixes=(None, '_tipo'),
    )
)

In [7]:
# Parse the modification timestamps; values that cannot be parsed become NaT.
dataset['modification_date'] = pd.to_datetime(dataset['modification_date'], errors='coerce')

# Keep rows with tipo == 1, estado 2 or 6, modified before 2024-05-09.
clean_dataset = dataset.loc[
    (dataset["tipo"] == 1)
    & (dataset["estado"].isin([2,6]))
    & (dataset['modification_date'] < '2024-05-09')
]

In [8]:
# Load the source texts from disk: one CSV per field, stored in a dict keyed by field name
fields_to_translate = ["desc_problema", "problema", "descripcion"]
text_to_translate = {}
for text in fields_to_translate:
    # engine='python' matches how the translated CSVs are read: the '¬'
    # separator is multi-byte in UTF-8, so without it pandas warns and falls
    # back to the python parser anyway.
    text_to_translate[text] = pd.read_csv(f"../DATA/{text}.csv", sep='¬', encoding='utf-8-sig', engine='python')

In [9]:
# Read the translated version of each field ('¬'-separated, UTF-8 with BOM).
desc_problema_translated = pd.read_csv(
    "../DATA/desc_problema_translated.csv",
    sep='¬',
    encoding='utf-8-sig',
    engine='python',
)
descripcion_translated = pd.read_csv(
    "../DATA/descripcion_translated.csv",
    sep='¬',
    encoding='utf-8-sig',
    engine='python',
)
problema_translated = pd.read_csv(
    "../DATA/problema_translated.csv",
    sep='¬',
    encoding='utf-8-sig',
    engine='python',
)
# Data preprocessing (merging the translated text)

In [10]:
# Drop the rows whose translated column contains the column name itself
# (presumably header lines repeated inside the CSV - confirm against the files).
desc_problema_translated = desc_problema_translated.loc[
    lambda frame: ~frame["desc_problema_translated"].isin(["desc_problema_translated"])
]
descripcion_translated = descripcion_translated.loc[
    lambda frame: ~frame["descripcion_translated"].isin(["descripcion_translated"])
]
problema_translated = problema_translated.loc[
    lambda frame: ~frame["problema_translated"].isin(["problema_translated"])
]

In [11]:
# Attach each translation to its source text. The left join keeps every source
# text, including those that have no translation yet.
desc_problema_translated = text_to_translate["desc_problema"].merge(
    desc_problema_translated, how="left", on="desc_problema"
)
descripcion_translated = text_to_translate["descripcion"].merge(
    descripcion_translated, how="left", on="descripcion"
)
problema_translated = text_to_translate["problema"].merge(
    problema_translated, how="left", on="problema"
)

In [12]:
# Where a translation is missing, fall back to the original text.
desc_problema_translated = desc_problema_translated.fillna(
    {"desc_problema_translated": desc_problema_translated["desc_problema"]}
)
descripcion_translated = descripcion_translated.fillna(
    {"descripcion_translated": descripcion_translated["descripcion"]}
)
problema_translated = problema_translated.fillna(
    {"problema_translated": problema_translated["problema"]}
)

In [13]:
# Bring the translated columns into the filtered incident dataset, one field at a time.
clean_dataset = (
    clean_dataset
    .merge(desc_problema_translated, how="left", on="desc_problema")
    .merge(descripcion_translated, how="left", on="descripcion")
    .merge(problema_translated, how="left", on="problema")
)

In [14]:
# Keep only the columns of interest and replace missing values with empty
# strings. fillna() returns a new frame, so nothing is modified in place and
# the step does not rely on the chained-assignment warning being silenced.
incidencias = clean_dataset[
    [
        'codigo',
        'id_pieza',
        'desc_problema_translated',
        'descripcion_translated',
        'problema_translated',
        'cod_articulo',
    ]
].fillna("")

In [15]:
# Build the text to analyse: the three translated fields and the article code,
# separated by single spaces.
incidencias = incidencias.assign(
    text_to_analyse=(
        incidencias['desc_problema_translated']
        + ' ' + incidencias['descripcion_translated']
        + ' ' + incidencias['problema_translated']
        + ' ' + incidencias['cod_articulo']
    )
)

In [16]:
# Only the combined text column is carried into the corpus.
incidencias = incidencias.loc[:, ['text_to_analyse']]

# FAQ

In [17]:
# Read the header-less, ';'-separated FAQ file and name its single column
# like the other corpus sources.
faq_path = os.path.join(TRAINING_DATA_PATH, 'FAQ.csv')
faq = pd.read_csv(faq_path, sep=";", header=None).set_axis(['text_to_analyse'], axis='columns')

In [18]:
# Remove html tags
def remove_html_tags(text):
    """
    Strip HTML markup from a piece of text.

    :param text: HTML or plain text; a missing value (None or NaN) is accepted
    :return: the text content without tags, or '' for a missing value
    """
    # A missing cell may arrive as None or float NaN, which BeautifulSoup
    # cannot parse; treat it as empty text instead of raising.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

# Replace every FAQ entry with its tag-free text.
faq['text_to_analyse'] = faq['text_to_analyse'].map(remove_html_tags)

# Product documentation

In [27]:
# Root folder that is scanned recursively for product PDFs (UNC path to a network share).
product_documentation_path = r"\\central4\Publica\Product_technical_documentation-Documentación_técnica_producto"

def get_pdf_files(path):
    """
    Recursively collect the PDF files found under a folder.

    :param path: root folder to walk
    :return: list with the full path of every file whose name ends in '.pdf',
             compared case-insensitively so '.PDF' files are included; empty
             when the folder does not exist or holds no PDF
    """
    return [
        os.path.join(root, file)
        for root, dirs, files in os.walk(path)
        for file in files
        if file.lower().endswith('.pdf')
    ]

def extract_text_from_pdf(pdf_path):
    """
    Read a PDF and split its text into sentences.

    The text is extracted page by page with PyPDF2, then every page is
    segmented into sentences with the module-level spaCy pipeline ``nlp``.

    :param pdf_path: path of the PDF file to read
    :return: pd.DataFrame with one sentence per row in the column 'text_to_analyse'
    :raises Exception: any error raised while reading or segmenting is printed
                       and then re-raised
    """
    try:
        # Extract all page texts while the file is open; segmentation happens afterwards.
        with open(pdf_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            page_texts = [pdf_page.extract_text() for pdf_page in reader.pages]

        sentences = [
            span.text
            for page_text in page_texts
            for span in nlp(page_text).sents
        ]
    except Exception as e:
        print(f'Error processing {pdf_path}: {e}')
        raise

    return pd.DataFrame(sentences, columns=['text_to_analyse'])

In [28]:
import concurrent.futures
from tqdm import tqdm # Progress bar

# Every PDF found under the product documentation folder.
pdfs = get_pdf_files(product_documentation_path)
# Start from an empty frame; the sentences extracted from the PDFs are concatenated onto it below.
product_documentation = pd.DataFrame()

def process_pdf(pdf):
    """
    Extract the sentences of one PDF without letting a failure propagate.

    :param pdf: path of the PDF file
    :return: the frame produced by ``extract_text_from_pdf``, or an empty
             pd.DataFrame when that call raises
    """
    try:
        return extract_text_from_pdf(pdf)
    except Exception:
        # A single unreadable file must not abort the whole batch.
        return pd.DataFrame()

# Process the PDF files in parallel; executor.map returns the results in the
# same order as `pdfs`, and tqdm shows the progress.
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = list(tqdm(executor.map(process_pdf, pdfs), total=len(pdfs)))

# Concatenate everything in a single call. Growing the frame with one
# pd.concat per PDF copies all previous rows each time (quadratic cost).
# Keeping the initial frame in the list also covers the case of no PDFs.
product_documentation = pd.concat([product_documentation, *results])

In [29]:
# Show the extracted product documentation sentences as the cell output.
product_documentation

# Product catalogue (Catálogo de productos)

In [30]:
# The product catalogue is a single PDF stored with the training data;
# it is split into sentences with the same helper used for the other PDFs.
catalogo_path = os.path.join(TRAINING_DATA_PATH, 'catalogo.pdf')
catalogo = extract_text_from_pdf(catalogo_path)

# Join all the data

In [31]:
# Stack every source into one corpus. ignore_index=True gives the result a
# fresh 0..n-1 index; otherwise each source keeps its own labels and the
# combined frame contains duplicated index values.
corpus = pd.concat([incidencias, faq, catalogo, product_documentation], ignore_index=True)
print(f'Corpus shape: {corpus.shape}')

In [32]:
# Preview up to 10 random rows. The size is capped so a corpus with fewer than
# 10 rows does not raise, and the seed makes the preview reproducible.
corpus.sample(n=min(10, len(corpus)), random_state=42)

In [33]:
# Write the corpus to a folder named after today's date (ISO format),
# creating the folder when it does not exist yet.
today_date = date.today().isoformat()
data_base_path = f"../DATA/processed/{today_date}"
os.makedirs(data_base_path, exist_ok=True)

corpus_file = f"{data_base_path}/corpus.csv"
corpus.to_csv(
    corpus_file,
    sep='¬',
    index=False,
)