<a href="https://colab.research.google.com/github/trnq-eu/text-classifier/blob/main/text_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U spacy==3.*
!pip install PyPDF2
!python -m spacy download it_core_news_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
2023-05-09 16:31:43.701789: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting it-core-news-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_new

In [None]:
import os
import spacy
import PyPDF2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


## Collegamento a Google Drive
In questo modo è possibile esplorare i documenti contenuti nelle proprie cartelle di Drive.

In [None]:
# Mount Google Drive at /content/drive so that the cells below can read
# files through paths starting with /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load spaCy's large Italian pipeline.
nlp = spacy.load('it_core_news_lg')

# Materialise the pipeline's default Italian stop words as a list.
italian_stop_words = [*nlp.Defaults.stop_words]

Carica i dati di training e crea una lista di testi e una lista di etichette corrispondenti. In questo caso le diverse cartelle di documenti sono state inserite all'interno di Google Drive in una cartella denominata "DATI/comunicati_classificati/".

In [None]:
# Class names; a document's label is the index of its class in this list.
labels = ['Prodotto', 'Heritage', 'HR',  'Sustainability']
# One training folder per class, derived from `labels` so the two can never
# drift apart (folders[i] holds the documents of labels[i]).
folders = [os.path.join('/content/drive/MyDrive/DATI/comunicati_classificati', label)
           for label in labels]

# Parallel lists: texts[k] is the extracted text of the k-th document and
# labels_list[k] is the index of its class.
texts = []
labels_list = []

# Walk the training folders and load every PDF together with its class index.
for i, folder in enumerate(folders):
    # Sorted so that the document order does not depend on the file system.
    for filename in sorted(os.listdir(folder)):
        # Only PDFs can be parsed by PdfReader; skip any other file in the folder.
        if not filename.endswith('.pdf'):
            continue
        with open(os.path.join(folder, filename), 'rb') as f:
            pdf = PyPDF2.PdfReader(f)
            # Join the page texts in one pass; `or ''` keeps the join safe
            # should a page yield no string.
            text = ''.join(page.extract_text() or '' for page in pdf.pages)
        texts.append(text)
        labels_list.append(i)


Funzione di preprocessamento che rimuove dai testi dei documenti le parole prive di contenuto informativo (*stop words*).

In [None]:
# Load the spaCy model for Italian.
nlp = spacy.load('it_core_news_lg')

# Custom preprocessing function
def preprocess_text(text):
    """Return `text` lower-cased and with spaCy stop words removed.

    This function is meant to be passed as ``TfidfVectorizer(preprocessor=...)``.
    A custom preprocessor replaces the vectorizer's own preprocessing stage,
    which is where lower-casing normally happens, so the case folding is done
    here; otherwise "Auto" and "auto" would become two distinct features.

    Args:
        text: Raw document text.

    Returns:
        The lower-cased text of every token that spaCy does not flag as a
        stop word, joined by single spaces.
    """
    # Tokenise with the module-level spaCy pipeline `nlp`.
    doc = nlp(text)
    tokens = [token.text.lower() for token in doc if not token.is_stop]
    # Join the tokens back into a single string for the vectorizer's tokenizer.
    return ' '.join(tokens)

In [None]:
# Print the class labels.
print(labels_list)


[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3]


Di seguito otteniamo la vettorizzazione dei testi con [TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) e scegliamo come classificatore [MultinomialNB](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html), un classificatore bayesiano particolarmente efficace per la classificazione multiclasse.


In [None]:
# TF-IDF features computed on text that is first passed through preprocess_text.
# (Variant that also uses bigrams: add ngram_range=(1, 2) to the call.)
vectorizer = TfidfVectorizer(preprocessor=preprocess_text)

# Multinomial naive Bayes classifier.
clf = MultinomialNB()

# Chain the two steps: vectorise first, then classify.
pipeline = Pipeline(steps=[('vectorizer', vectorizer), ('clf', clf)])



In [None]:
import os
import PyPDF2
import spacy
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

# Load spaCy's Italian language model.
nlp = spacy.load('it_core_news_lg')

# Extract the text of a PDF file
def extract_text_from_pdf(file_path):
    """Return the text of the PDF at `file_path`.

    The file is opened in binary mode, read with PyPDF2.PdfReader, and the
    text extracted from each page is concatenated in page order with no
    separator.
    """
    with open(file_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Extraction happens while the file is still open.
        page_texts = [page.extract_text() for page in reader.pages]
    return ''.join(page_texts)

# Normalise text with spaCy
def preprocess_text(text):
    """Return the lower-cased lemmas of `text`, joined by single spaces.

    Tokens that spaCy flags as stop words or as punctuation are dropped.
    Uses the module-level spaCy pipeline `nlp`.
    """
    kept_lemmas = []
    for token in nlp(text):
        # Skip stop words and punctuation marks.
        if token.is_stop or token.is_punct:
            continue
        kept_lemmas.append(token.lemma_.lower())
    return ' '.join(kept_lemmas)

# Processing and classification pipeline: TF-IDF features computed on text
# normalised by preprocess_text, fed to a linear support vector classifier.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=preprocess_text)),
    # Fixed seed so that repeated fits on the same data give the same model.
    ('clf', LinearSVC(random_state=42)),
])

# Collect one {'text', 'category'} record per training PDF and build the
# DataFrame once at the end, instead of growing it with pd.concat on every
# file (which copies the whole frame each time).
records = []

# Walk the training folders, one per category.
for category in ['Prodotto', 'Heritage', 'HR', 'Sustainability']:
    category_path = os.path.join('/content/drive/MyDrive/DATI/comunicati_classificati', category)
    for file_name in os.listdir(category_path):
        if file_name.endswith('.pdf'):
            file_path = os.path.join(category_path, file_name)
            text = extract_text_from_pdf(file_path)
            records.append({'text': text, 'category': category})

# Training data: one row per document; the explicit column list keeps both
# columns present even when no PDF was found.
df = pd.DataFrame(records, columns=['text', 'category'])


# Train the model.
# The split is stratified on the category so that every class appears in both
# the training and the test set; an unstratified 10% split of a small corpus
# can leave a class out of the test set entirely.
# NOTE: stratification requires the test set to contain at least as many
# documents as there are categories.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['category'],
    test_size=0.1,
    random_state=42,
    stratify=df['category'],
)
pipeline.fit(X_train, y_train)

# Evaluate the model on the held-out test data.
y_pred = pipeline.predict(X_test)
# zero_division=0 reports an undefined precision/recall as 0 without
# emitting a warning for each affected metric.
print(classification_report(y_test, y_pred, zero_division=0))

# Use the trained model to classify the PDFs in the 'Unknown' folder,
# printing "<file name>: <predicted category>" for each of them.
unknown_path = os.path.join('/content/drive/MyDrive/DATI/comunicati_classificati', 'Unknown')
for file_name in os.listdir(unknown_path):
    # Ignore anything that is not a PDF.
    if not file_name.endswith('.pdf'):
        continue
    file_path = os.path.join(unknown_path, file_name)
    text = extract_text_from_pdf(file_path)
    # predict() takes a batch, so wrap the single text and unwrap the single result.
    category = pipeline.predict([text])[0]
    print(f"{file_name}: {category}")


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                precision    recall  f1-score   support

            HR       1.00      1.00      1.00         1
      Heritage       1.00      0.50      0.67         2
      Prodotto       0.00      0.00      0.00         0
Sustainability       0.00      0.00      0.00         1

      accuracy                           0.50         4
     macro avg       0.50      0.38      0.42         4
  weighted avg       0.75      0.50      0.58         4

Nuova Abarth 500e elettrizza l'Europa _ Abarth _ Stellantis.pdf: Prodotto
IT-20230428-Stellantis-2022-CSR-Report.pdf: Sustainability
20221116_Stellantis_Names_New_Head_of_IR_IT.pdf: HR
20230118-Stellantis-Board-Composition-Change-IT.pdf: HR
Opel Vivaro-e e Opel Vivaro-e HYDROGEN.pdf: Prodotto
Thierry Koskas è nominato Chief Executive Officer del marchio Citroën Stellantis.pdf: Sustainability
Nuova PEUGEOT 408.pdf: Prodotto
salone_auto_epoca.pdf: Heritage
centenario_autodromo_monza.pdf: Heritage
economia_circolare_mirafiori.pdf: Sustainabilit