In [1]:
import pattern_extractor as pe

In [10]:
import pandas as pd
from collections import defaultdict, Counter

def pattern_analysis(filepath, tex_col_name, senti_col_name):
    """Tally extracted patterns by sentiment label and rank them.

    Reads the CSV at ``filepath``, keeps only the text and sentiment columns,
    runs ``pe.extract_filtered_patterns`` on every text and counts, for each
    pattern, how many times it was produced by rows labelled ``"1"``
    (positive) and by every other row (negative).

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.
    tex_col_name : str
        Name of the column holding the text to analyse.
    senti_col_name : str
        Name of the column holding the sentiment label.

    Returns
    -------
    tuple
        ``(df, patterns_df, top_by_freq, top_positive, top_negative)`` where

        * ``df`` is the two-column input frame with the label cast to ``str``;
        * ``patterns_df`` has one row per pattern and the columns ``pattern``,
          ``pos``, ``neg``, ``total`` and ``polarity_strength``
          (``(pos - neg) / total``: 1 = only positive, -1 = only negative);
        * ``top_by_freq`` is ``patterns_df`` sorted by ``total`` descending;
        * ``top_positive`` is sorted by ``polarity_strength`` descending;
        * ``top_negative`` is sorted by ``polarity_strength`` ascending.

        When no pattern is extracted, the four pattern frames are empty but
        still carry the columns listed above.
    """
    result_columns = ["pattern", "pos", "neg", "total", "polarity_strength"]

    # Keep only the text and sentiment columns. The explicit .copy() detaches
    # the selection from the frame returned by read_csv, so the assignment
    # below does not raise pandas' SettingWithCopyWarning.
    df = pd.read_csv(filepath)[[tex_col_name, senti_col_name]].copy()

    # Labels are compared as strings: "1" is positive, anything else negative.
    # NOTE(review): a label column parsed as float (e.g. because it contains
    # missing values) stringifies as "1.0" and is therefore counted as
    # negative - confirm the CSV stores integer labels.
    df[senti_col_name] = df[senti_col_name].astype(str)

    # Accumulate per-pattern counts for each polarity.
    pattern_stats = defaultdict(lambda: {"pos": 0, "neg": 0})
    for text, label in zip(df[tex_col_name], df[senti_col_name]):
        polarity = "pos" if label == "1" else "neg"
        # presumably returns an iterable of hashable patterns - verify in pe
        for pattern in pe.extract_filtered_patterns(text):
            pattern_stats[pattern][polarity] += 1

    # Flatten the counters into records. A pattern only enters the dict when
    # it is counted, so total is always >= 1 and the division is safe.
    rows = []
    for pattern, stats in pattern_stats.items():
        pos = stats["pos"]
        neg = stats["neg"]
        total = pos + neg
        rows.append({
            "pattern": pattern,
            "pos": pos,
            "neg": neg,
            "total": total,
            "polarity_strength": (pos - neg) / total,
        })

    # Passing the columns explicitly keeps the schema when `rows` is empty;
    # otherwise the sort_values calls below would fail with KeyError.
    patterns_df = pd.DataFrame(rows, columns=result_columns)

    top_by_freq = patterns_df.sort_values("total", ascending=False)
    top_positive = patterns_df.sort_values("polarity_strength", ascending=False)
    top_negative = patterns_df.sort_values("polarity_strength")

    return df, patterns_df, top_by_freq, top_positive, top_negative

# Run the pattern analysis on the misclassified examples.
df, patterns_df, top_by_freq, top_positive, top_negative = pattern_analysis(
    filepath="datasets/errores_clasificacion.csv",
    tex_col_name="texto",
    senti_col_name="etiqueta_real",
)

In [4]:
# Show the first 20 rows of the patterns sorted by total count (descending).
top_by_freq.head(20)

Unnamed: 0,pattern,pos,neg,total,polarity_strength
130,"(CHILD_REL, tener, advmod, no)",0,4,4,-1.0
129,"(HEAD_REL, tener, advmod, no)",0,4,4,-1.0
9,"(HEAD_REL, bajo, nsubj, calidad)",0,4,4,-1.0
11,"(CHILD_REL, bajo, nsubj, calidad)",0,4,4,-1.0
74,"(CHILD_REL, decepcionante, advmod, bastante)",0,3,3,-1.0
72,"(HEAD_REL, decepcionante, advmod, bastante)",0,3,3,-1.0
71,"(HEAD_REL, festival, amod, cultural)",0,3,3,-1.0
70,"(SUBTREE, festival, (cultural, festival))",0,3,3,-1.0
69,"(CHILD_REL, festival, amod, cultural)",0,3,3,-1.0
159,"(HEAD_REL, rendimiento, amod, académico)",3,0,3,1.0


In [5]:
# Show the first 20 rows of the patterns sorted by polarity_strength (descending).
top_positive.head(20)

Unnamed: 0,pattern,pos,neg,total,polarity_strength
538,"(SUBTREE, fomentar, (contribuir, enseñando él,...",1,0,1,1.0
539,"(HEAD_REL, responsabilidad, amod, social)",1,0,1,1.0
499,"(HEAD_REL, cortometraj, amod, independiente)",1,0,1,1.0
500,"(SUBTREE, refrescante, (creativo, independient...",1,0,1,1.0
501,"(HEAD_REL, historia, amod, original)",1,0,1,1.0
502,"(SUBTREE, original, (creativo, original))",1,0,1,1.0
503,"(SUBTREE, colorir, (alegre, colorir, cotidiano...",1,0,1,1.0
504,"(SUBTREE, mostrar, (cotidiano, mostrar))",1,0,1,1.0
505,"(HEAD_REL, vida, amod, cotidiano)",1,0,1,1.0
487,"(HEAD_REL, charla, amod, motivacional)",1,0,1,1.0


In [6]:
# Show the first 20 rows of the patterns sorted by polarity_strength (ascending).
top_negative.head(20)

Unnamed: 0,pattern,pos,neg,total,polarity_strength
321,"(CHILD_REL, nada, amod, nuevo)",0,1,1,-1.0
347,"(HEAD_REL, profesor, amod, calificado)",0,1,1,-1.0
346,"(SUBTREE, necesitar, (enfrentar, necesitar))",0,1,1,-1.0
345,"(SUBTREE, dejar, (apoyo, dejar, emocional, enf...",0,1,1,-1.0
344,"(SUBTREE, emocional, (emocional, psicológico))",0,1,1,-1.0
343,"(HEAD_REL, apoyo, amod, emocional)",0,1,1,-1.0
342,"(SUBTREE, apoyo, (apoyo, emocional, psicológico))",0,1,1,-1.0
341,"(CHILD_REL, apoyo, amod, emocional)",0,1,1,-1.0
326,"(HEAD_REL, información, amod, suficiente)",0,1,1,-1.0
325,"(HEAD_REL, haber, advmod, no)",0,1,1,-1.0


In [7]:
# Keep only the patterns observed at least `min_support` times.
min_support = 4
filtered = patterns_df.loc[patterns_df["total"].ge(min_support)]
filtered


Unnamed: 0,pattern,pos,neg,total,polarity_strength
9,"(HEAD_REL, bajo, nsubj, calidad)",0,4,4,-1.0
11,"(CHILD_REL, bajo, nsubj, calidad)",0,4,4,-1.0
129,"(HEAD_REL, tener, advmod, no)",0,4,4,-1.0
130,"(CHILD_REL, tener, advmod, no)",0,4,4,-1.0


In [8]:
def useful_patterns(df, min_support=4, min_strength=0.4):
    """
    Select the patterns that are useful for building rules.

    A row is kept when both conditions hold:
    - its ``total`` count is at least ``min_support``;
    - the absolute value of its ``polarity_strength`` is at least
      ``min_strength``.

    The surviving rows are returned sorted by ``polarity_strength`` in
    ascending order (most negative first).
    """
    has_support = df["total"] >= min_support
    is_polarised = df["polarity_strength"].abs() >= min_strength
    selected = df[has_support & is_polarised]
    return selected.sort_values("polarity_strength")

# Select patterns with at least 4 occurrences and |polarity_strength| >= 0.5
# (stricter than the function's default min_strength of 0.4), then display them.
useful = useful_patterns(patterns_df, min_support=4, min_strength=0.5)
useful

Unnamed: 0,pattern,pos,neg,total,polarity_strength
9,"(HEAD_REL, bajo, nsubj, calidad)",0,4,4,-1.0
11,"(CHILD_REL, bajo, nsubj, calidad)",0,4,4,-1.0
129,"(HEAD_REL, tener, advmod, no)",0,4,4,-1.0
130,"(CHILD_REL, tener, advmod, no)",0,4,4,-1.0


In [7]:
# Render with displacy without relying on IPython.display.display
# (reported by the author as missing in this environment).
from spacy import displacy
import spacy
nlp = spacy.load('es_core_news_md')
doc = nlp("No puedo decir nada bueno de ti")
# Generate the HTML markup with displacy (jupyter=False: render() returns the
# markup instead of displaying it itself).
html = displacy.render(doc, style='dep', jupyter=False)

# Try display_html from IPython.core.display (available in the author's venv).
try:
    from IPython.core import display as ipd
    ipd.display_html(html, raw=True)
except Exception as e:
    # Fallback: write the markup to a file that can be opened in a browser.
    # Any exception (including a failed import) takes this path.
    with open('displacy_output.html', 'w', encoding='utf8') as f:
        f.write(html)
        print('No se pudo mostrar en el notebook:', e)
        print("Se ha escrito 'displacy_output.html' en el directorio de trabajo.")

In [3]:
# Render with displacy without relying on IPython.display.display
# (reported by the author as missing in this environment).
from spacy import displacy
import spacy
nlp = spacy.load('es_core_news_md')


def print_tree(text):
    """Show the dependency parse of ``text`` as a displaCy diagram.

    The text is parsed with the module-level ``nlp`` pipeline and rendered to
    HTML markup with ``displacy.render``. The markup is shown through
    ``IPython.core.display.display_html``; if that raises for any reason
    (including a failed import), the markup is written to
    ``displacy_output.html`` in the working directory and a message is printed.
    """
    # jupyter=False makes render() return the markup instead of displaying it.
    markup = displacy.render(nlp(text), style='dep', jupyter=False)

    try:
        # Imported lazily so that a missing IPython falls through to the fallback.
        from IPython.core import display as ipython_display
        ipython_display.display_html(markup, raw=True)
    except Exception as err:
        # Fallback: persist the markup so it can be opened in a browser.
        with open('displacy_output.html', 'w', encoding='utf8') as out_file:
            out_file.write(markup)
            print('No se pudo mostrar en el notebook:', err)
            print("Se ha escrito 'displacy_output.html' en el directorio de trabajo.")

In [7]:
# Visualise the dependency tree of a sample review sentence.
print_tree("El evento prometía una experiencia cultural completa, pero resultó ser una presentación monótona sin ningún valor añadido para el público.")

In [1]:
# Inspect how the Spanish pipeline lemmatises the reflexive verb "reírse":
# the cell displays the first token's lemma as text (lemma_) and as an integer id (lemma).
import spacy
nlp = spacy.load('es_core_news_md')
doc = nlp("reírse")
doc[0].lemma_, doc[0].lemma

('reir él', 3039775346437375000)

In [None]:
# Evaluate a previously saved SentimentMarkovChain model on datasets/dataset.csv.
# NOTE(review): this cell uses `pd`, which is imported in another cell; it
# fails on a fresh kernel unless that cell has been run first.
from sentiment_markov_chain_binary import SentimentMarkovChain
from sklearn.metrics import accuracy_score, classification_report

# NOTE(review): the recorded output below reports a different model file
# (..._reviews.pkl) than the path loaded here (..._datasets.pkl) - the output
# looks stale; re-run to confirm the metrics.
markov_chain = SentimentMarkovChain(vector_size=100, window=5, min_count=2)
markov_chain.load_model("models/sentiment_markov_chain_datasets.pkl")

# NOTE(review): rebinding `df` replaces the frame returned by pattern_analysis
# in an earlier cell.
df = pd.read_csv('datasets/dataset.csv')
test_texts, test_sentiments =  df['texto'].tolist() , df['sentimiento'].tolist()


# Predict one text at a time.
predictions = []
for text in test_texts:
    pred = markov_chain.predict_sentiment(text)
    predictions.append(pred)

# Compute metrics (the printed label says "Precisión", but the value is accuracy).
accuracy = accuracy_score(test_sentiments, predictions)
print(f"Precisión del modelo: {accuracy:.4f}")

# Classification report.
# assumes the labels sort as 0 (negative) then 1 (positive), matching
# target_names - TODO confirm against the dataset and predict_sentiment output
print("\nReporte de clasificación:")
print(classification_report(test_sentiments, predictions,
                            target_names=['Negativo (0)', 'Positivo (1)']))


Modelo cargado desde: models/sentiment_markov_chain_reviews.pkl
Precisión del modelo: 0.5800

Reporte de clasificación:
              precision    recall  f1-score   support

Negativo (0)       0.58      0.59      0.59       150
Positivo (1)       0.58      0.57      0.57       150

    accuracy                           0.58       300
   macro avg       0.58      0.58      0.58       300
weighted avg       0.58      0.58      0.58       300

