In [1]:
import numpy as np
import pandas as pd
import sklearn
import nltk
import re

# spaCy: Spanish pipeline used for POS tagging and lemmatization
import spacy
nlp = spacy.load('es_core_news_sm')

# Snowball stemmer for Spanish
from nltk.stem import SnowballStemmer
spanish_stemmer = SnowballStemmer('spanish')

# Load the stop-word list (one entry per line). The context manager closes the
# file even if reading or decoding fails.
with open('stopwords_sin_w5.txt', 'r', encoding='utf8') as f:
    stopwords = f.read().split('\n')
len(stopwords)

265

In [2]:
def PreProcesar(Corpus, POS=False, Lema=True, Stem=True):
    """Turn a list of sentences into strings of normalized tokens.

    Every sentence is lower-cased and analysed with the module-level spaCy
    pipeline ``nlp``. Tokens tagged as punctuation are discarded, then tokens
    whose lemma is in the module-level ``stopwords`` list are discarded (stop
    words are filtered by lemma, after lemmatization). Each surviving token
    is rendered as:

    * the Snowball stem of its lemma when ``Stem`` and ``Lema`` are True;
    * the Snowball stem of its surface text when only ``Stem`` is True;
    * its lemma when only ``Lema`` is True;
    * its surface text otherwise.

    When ``POS`` is True, ``'_'`` plus the lower-cased POS tag is appended to
    each rendered token.

    Parameters
    ----------
    Corpus : iterable of str
        Sentences to process.
    POS, Lema, Stem : bool
        Rendering switches described above.

    Returns
    -------
    list of str
        One string per input sentence; every token is followed by a single
        space, so non-empty results end with a trailing space.
    """
    con_pos = POS == True
    con_lema = Lema == True
    con_stem = Stem == True

    # Analyse all sentences up front; lemmatization works better on lower case.
    analizados = [nlp(texto.lower()) for texto in Corpus]

    Corpus_Procesado = []
    for analizado in analizados:
        piezas = []
        for token in analizado:
            texto, etiqueta, lema = token.text, token.pos_, token.lemma_

            # Skip punctuation, then stop words (matched on the lemma).
            if etiqueta == 'PUNCT':
                continue
            if lema in stopwords:
                continue

            # Choose the surface form requested by the flags.
            if con_stem:
                forma = spanish_stemmer.stem(lema if con_lema else texto)
            elif con_lema:
                forma = lema
            else:
                forma = texto

            if con_pos:
                forma = forma + '_' + etiqueta.lower()

            piezas.append(forma + ' ')

        Corpus_Procesado.append(''.join(piezas))

    return Corpus_Procesado

def Corregir_Documentos(df_textos, columnas, POS=False, Lema=True, Stem=True):
    """Pre-process the given text columns and drop duplicated rows.

    Parameters
    ----------
    df_textos : pandas.DataFrame
        Input frame. It is not modified; the work is done on a copy.
    columnas : iterable of str
        Names of the text columns to run through ``PreProcesar``.
    POS, Lema, Stem : bool
        Forwarded unchanged to ``PreProcesar``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the processed columns, exact duplicate rows removed
        and a fresh 0..n-1 index.
    """
    # Copy first so the caller's DataFrame is not overwritten as a side effect.
    df_corregido = df_textos.copy()

    for col in columnas:
        df_corregido[col] = PreProcesar(list(df_corregido[col]), POS, Lema, Stem)

    # Clean up: processing can make different sentences collapse into the same
    # text, so drop duplicates and re-index.
    return df_corregido.drop_duplicates().reset_index(drop=True)

def Generar_Matriz_BOW(df_textos, columna, binario=False, ngram=(1,2)):
    """Vectorize a text column as a bag-of-words matrix.

    Parameters
    ----------
    df_textos : pandas.DataFrame
        Frame holding the documents.
    columna : str
        Name of the column with the text to vectorize.
    binario : bool, default False
        Passed to ``CountVectorizer(binary=...)``.
    ngram : tuple, default (1, 2)
        Passed to ``CountVectorizer(ngram_range=...)``.

    Returns
    -------
    (CountVectorizer, pandas.DataFrame)
        The fitted vectorizer and ``df_textos`` joined with one column per
        vocabulary term.
    """
    # Vectorize with scikit-learn's CountVectorizer
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizador = CountVectorizer(binary=binario, ngram_range=ngram)
    X = vectorizador.fit_transform(df_textos[columna])

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older releases.
    if hasattr(vectorizador, 'get_feature_names_out'):
        nombres = vectorizador.get_feature_names_out()
    else:
        nombres = vectorizador.get_feature_names()

    # Reuse the input index so the join lines rows up even when df_textos is
    # not indexed 0..n-1.
    df_X = pd.DataFrame(X.toarray(), columns=nombres, index=df_textos.index)
    df = df_textos.join(df_X)

    return vectorizador, df

def Generar_Matriz_Tfidf(df_textos, columna, ngram=(1,2)):
    """Vectorize a text column as a TF-IDF matrix.

    Parameters
    ----------
    df_textos : pandas.DataFrame
        Frame holding the documents.
    columna : str
        Name of the column with the text to vectorize.
    ngram : tuple, default (1, 2)
        Passed to ``TfidfVectorizer(ngram_range=...)``.

    Returns
    -------
    (TfidfVectorizer, pandas.DataFrame)
        The fitted vectorizer and ``df_textos`` joined with one column per
        vocabulary term.
    """
    # TfidfVectorizer yields the TF-IDF weighted matrix directly
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizador = TfidfVectorizer(ngram_range=ngram)
    X = vectorizador.fit_transform(df_textos[columna])

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older releases.
    if hasattr(vectorizador, 'get_feature_names_out'):
        nombres = vectorizador.get_feature_names_out()
    else:
        nombres = vectorizador.get_feature_names()

    # Reuse the input index so the join lines rows up even when df_textos is
    # not indexed 0..n-1.
    df_X = pd.DataFrame(X.toarray(), columns=nombres, index=df_textos.index)
    df = df_textos.join(df_X)

    return vectorizador, df

def Distancia_Coseno(u, v):
    """Return the cosine distance ``1 - cos(u, v)`` between two 1-D vectors.

    Parameters
    ----------
    u, v : array-like of numbers
        One-dimensional vectors of equal length.

    Returns
    -------
    float
        ``1 - (u . v) / (|u| * |v|)``. When either vector has zero norm the
        cosine is undefined; 1.0 ("no similarity") is returned instead of
        the NaN that the 0/0 division would produce.
    """
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)

    producto_normas = np.sqrt(np.dot(u, u)) * np.sqrt(np.dot(v, v))
    if producto_normas == 0:
        # e.g. a query with no known vocabulary vectorizes to all zeros
        return 1.0

    distancia = 1.0 - (np.dot(u, v) / producto_normas)
    return distancia

In [3]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

# 1. Load the corpus and normalize the 'oracion' column
#    (no POS suffix; lemmatize, then stem).
df_textos = pd.read_csv('data_w5_.csv', sep=';', encoding='utf_8')
df_textos = Corregir_Documentos(df_textos, ['oracion'], POS=False, Lema=True, Stem=True)

# 2. Vectorize the documents with TF-IDF over unigrams and bigrams.
#    Count-based alternative: Generar_Matriz_BOW(df_textos, 'oracion')
vectorizador, df_textos = Generar_Matriz_Tfidf(df_textos, 'oracion', ngram=(1, 2))

# 3. Split into train/test. X keeps 'oracion' plus the feature columns; y is
#    kept as a one-column DataFrame holding the label 'w'. Each part gets a
#    fresh 0..n-1 index.
X = df_textos.drop(columns=['w'])
y = df_textos[['w']]
X_train, X_test, y_train, y_test = [
    parte.reset_index(drop=True)
    for parte in train_test_split(X, y, test_size=0.20, random_state=124)
]

In [4]:
# Preview the first 10 rows of the vectorized corpus (text, label and feature columns).
df_textos.head(10)

Unnamed: 0,oracion,w,carg,clas,cobr,com,com hac,com que,com ten,cost,...,tard,ten,ten que,ten tiemp,termin,termin plaz,tiemp,ubic,val,ver
0,com,como,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,com ten que hac,como,0.0,0.0,0.0,0.378915,0.0,0.0,0.460139,0.0,...,0.0,0.331402,0.360852,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,com hac,como,0.0,0.0,0.0,0.505665,0.677465,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,com ten que,como,0.0,0.0,0.0,0.467144,0.0,0.0,0.56728,0.0,...,0.0,0.408568,0.444874,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,com que,como,0.0,0.0,0.0,0.556341,0.0,0.745358,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,que ten que hac,como,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.317401,0.345606,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,cuand,cuando,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,cuand empez,cuando,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,cuand termin,cuando,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.634899,0.0,0.0,0.0,0.0,0.0
9,que fech,cuando,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Results table: one row per evaluation run. The df_* columns hold the
# per-bucket test rows (hits, errors, indeterminate) of that run.
df_comparativa = pd.DataFrame(
    columns=[
        'Modelo', 'Umbral',
        'Aciertos', 'Errores', 'Indeterm',
        'Precision', 'Recall', 'Accuracy', 'F1',
        'df_Aciertos', 'df_Errores', 'df_Indeterm',
    ]
)

#Distancia_Coseno del profe

def Evaluar_Modelo_Distancia_Coseno(X_train, X_test, y_test, umbral=0.8):
    """Evaluate a nearest-neighbour classifier based on cosine distance.

    Each test sentence is vectorized with the module-level ``vectorizador``
    and compared against every row of ``X_train``. The label (read from the
    module-level ``y_train``) and text of the closest training row become the
    prediction. A prediction counts as a hit or an error only when its
    distance is ``<= umbral``; every other row, including rows whose distance
    is NaN, is counted as indeterminate.

    Side effects: prints a summary report and adds one row to the
    module-level ``df_comparativa``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training rows with an 'oracion' column; every column after the first
        is used as a feature.
    X_test : pandas.DataFrame
        Test rows; only the 'oracion' column is read.
    y_test : pandas.DataFrame
        Test labels in a 'w' column, joined to ``X_test`` by index.
    umbral : float, default 0.8
        Largest distance at which a prediction is accepted.

    Returns
    -------
    None
    """
    global df_comparativa

    def _razon(numerador, denominador):
        # An undefined ratio is reported as NaN instead of dividing by zero.
        return numerador / denominador if denominador else float('nan')

    df_test = X_test[['oracion']].join(y_test)
    array_X = X_train[X_train.columns[1:]].values

    lista_distancia = []  # distance to the closest training row
    lista_predic    = []  # label of the closest training row
    lista_similar   = []  # text of the closest training row
    for _, d in df_test.iterrows():
        # .toarray() replaces the sparse `.A` shortcut, which recent SciPy
        # releases no longer provide.
        q = vectorizador.transform([d['oracion']]).toarray()[0]
        distancia = [Distancia_Coseno(q, fila) for fila in array_X]
        mas_cercano = (
            pd.DataFrame(distancia, columns=['Distancia'])
            .join(X_train[['oracion']])
            .join(y_train[['w']])
            .sort_values(by='Distancia')
            .iloc[0]
        )

        lista_distancia.append(mas_cercano['Distancia'])
        lista_predic.append(mas_cercano['w'])
        lista_similar.append(mas_cercano['oracion'])

    # Attach the predictions to the test frame
    df_test['Distancia'] = lista_distancia
    df_test['Predic']    = lista_predic
    df_test['Similar']   = lista_similar

    # Split the test rows into the three buckets once and reuse them. Using
    # ~dentro for the indeterminate bucket keeps NaN distances from
    # disappearing from every count.
    acierta = df_test['w'] == df_test['Predic']
    dentro = df_test['Distancia'] <= umbral
    df_aciertos = df_test[acierta & dentro]
    df_errores = df_test[~acierta & dentro]
    df_indeterm = df_test[~dentro]

    aciertos = len(df_aciertos)
    errores = len(df_errores)
    indeterm = len(df_indeterm)

    print('Cantidad de registros evaluados:', len(df_test))
    print('--------------------')
    print('Aciertos:', aciertos)
    print('Errores :', errores)
    print('Indeterm:', indeterm)
    print('--------------------')

    # Metric definitions are the ones spelled out in the printed report.
    precision = _razon(aciertos, aciertos + errores)
    recall    = _razon(aciertos, aciertos + indeterm)
    accuracy  = _razon(aciertos + indeterm, aciertos + errores + indeterm)
    F1        = 2 * _razon(precision * recall, precision + recall)
    print('Precision: {0:.3f} <- aciertos/(aciertos+errores)'.format(precision))
    print('Recall   : {0:.3f} <- aciertos/(aciertos+indeterm)'.format(recall))
    print('Accuracy : {0:.3f} <- (aciertos+indeterm)/(aciertos+errores+indeterm)'.format(accuracy))
    print('F1       : {0:.3f} <- 2*((precision*recall)/(precision+recall))'.format(F1))

    # Record the run. DataFrame.append was removed in pandas 2.0, so the new
    # row is added with pd.concat.
    registro = {'Modelo': 'Distancia coseno',
                'Umbral': umbral,
                'Aciertos': aciertos,
                'Errores': errores,
                'Indeterm': indeterm,
                'Precision': precision,
                'Recall': recall,
                'Accuracy': accuracy,
                'F1': F1,
                'df_Aciertos': df_aciertos,
                'df_Errores': df_errores,
                'df_Indeterm': df_indeterm}
    df_comparativa = pd.concat([df_comparativa, pd.DataFrame([registro])], ignore_index=True)

    return

In [6]:
# Evaluate the cosine-distance nearest-neighbour baseline on the test split, accepting matches at distance <= 0.8.
Evaluar_Modelo_Distancia_Coseno(X_train, X_test, y_test, umbral=0.8)

Cantidad de registros evaluados: 11
--------------------
Aciertos: 7
Errores : 2
Indeterm: 2
--------------------
Precision: 0.778 <- aciertos/(aciertos+errores)
Recall   : 0.778 <- aciertos/(aciertos+indeterm)
Accuracy : 0.818 <- (aciertos+indeterm)/(aciertos+errores+indeterm)
F1       : 0.778 <- 2*((precision*recall)/(precision+recall))


In [10]:
from sklearn.linear_model import LogisticRegression

# Build the model with its fixed settings. With penalty='none' no
# regularization is applied, so the value of C has no effect on the fit.
RLog = LogisticRegression(penalty='none', max_iter=10000, tol=0.00001, multi_class='ovr')

# Hyper-parameter grid (a single candidate)
parametros_RLog = {'C': [1]}

# 5-fold cross-validated grid search scored by accuracy
grid_RLog = GridSearchCV(estimator=RLog, scoring='accuracy', param_grid=parametros_RLog, cv=5,
                         n_jobs=-1)

# Fit on the training split. The first column of X_train is the raw text, so
# only the remaining feature columns are used. The labels are passed as the
# 1-D 'w' column: handing over the (n, 1) DataFrame makes scikit-learn emit a
# DataConversionWarning.
grid_RLog.fit(X_train[X_train.columns[1:]].values, y_train['w'].values);

# Best cross-validated accuracy
AC_RLog_best = grid_RLog.best_score_
print('Mejor accuracy: ' + str(round(AC_RLog_best,4)))

C_RLog_best = grid_RLog.best_params_
print('Mejor C: ' + str(C_RLog_best))
###########################################################################################################
def Evaluar_Modelo(modelo, nombre_modelo, X_test, y_test, umbral=0.7):
    """Evaluate a probabilistic classifier on the test set with a confidence threshold.

    The test sentences are vectorized with the module-level ``vectorizador``
    and scored with ``modelo.predict`` / ``modelo.predict_proba``. A
    prediction counts as a hit or an error only when its highest class
    probability is ``>= umbral``; every other row is counted as
    indeterminate.

    Side effects: prints a summary report and adds one row to the
    module-level ``df_comparativa``.

    Parameters
    ----------
    modelo : fitted estimator
        Must provide ``predict`` and ``predict_proba``.
    nombre_modelo : str
        Label stored in the 'Modelo' column of ``df_comparativa``.
    X_test : pandas.DataFrame
        Test rows; only the 'oracion' column is read.
    y_test : pandas.DataFrame
        Test labels in a 'w' column, joined to ``X_test`` by index.
    umbral : float, default 0.7
        Smallest probability at which a prediction is accepted.

    Returns
    -------
    None
    """
    global df_comparativa

    def _razon(numerador, denominador):
        # An undefined ratio is reported as NaN instead of dividing by zero.
        return numerador / denominador if denominador else float('nan')

    df_test = X_test[['oracion']].join(y_test)

    # Vectorize and score the whole test set in one call instead of row by
    # row. .toarray() replaces the sparse `.A` shortcut, which recent SciPy
    # releases no longer provide.
    if len(df_test) > 0:
        Q = vectorizador.transform(df_test['oracion']).toarray()
        lista_predic = list(modelo.predict(Q))
        lista_probabilidad = list(modelo.predict_proba(Q).max(axis=1))
    else:
        lista_predic = []
        lista_probabilidad = []

    # Attach the predictions to the test frame
    df_test['Probabilidad'] = lista_probabilidad
    df_test['Predic'] = lista_predic

    # Split the test rows into the three buckets once and reuse them.
    acierta = df_test['w'] == df_test['Predic']
    confiable = df_test['Probabilidad'] >= umbral
    df_aciertos = df_test[acierta & confiable]
    df_errores = df_test[~acierta & confiable]
    df_indeterm = df_test[~confiable]

    aciertos = len(df_aciertos)
    errores = len(df_errores)
    indeterm = len(df_indeterm)

    print('Cantidad de registros evaluados:', len(df_test))
    print('--------------------')
    print('Aciertos:', aciertos)
    print('Errores :', errores)
    print('Indeterm:', indeterm)
    print('--------------------')

    # Metric definitions are the ones spelled out in the printed report.
    precision = _razon(aciertos, aciertos + errores)
    recall    = _razon(aciertos, aciertos + indeterm)
    accuracy  = _razon(aciertos + indeterm, aciertos + errores + indeterm)
    F1        = 2 * _razon(precision * recall, precision + recall)
    print('Precision: {0:.3f} <- aciertos/(aciertos+errores)'.format(precision))
    print('Recall   : {0:.3f} <- aciertos/(aciertos+indeterm)'.format(recall))
    print('Accuracy : {0:.3f} <- (aciertos+indeterm)/(aciertos+errores+indeterm)'.format(accuracy))
    print('F1       : {0:.3f} <- 2*((precision*recall)/(precision+recall))'.format(F1))

    # Record the run. DataFrame.append was removed in pandas 2.0, so the new
    # row is added with pd.concat.
    registro = {'Modelo': nombre_modelo,
                'Umbral': umbral,
                'Aciertos': aciertos,
                'Errores': errores,
                'Indeterm': indeterm,
                'Precision': precision,
                'Recall': recall,
                'Accuracy': accuracy,
                'F1': F1,
                'df_Aciertos': df_aciertos,
                'df_Errores': df_errores,
                'df_Indeterm': df_indeterm}
    df_comparativa = pd.concat([df_comparativa, pd.DataFrame([registro])], ignore_index=True)
    return



Mejor accuracy: 0.5472
Mejor C: {'C': 1}


  return f(*args, **kwargs)


In [11]:
# Evaluate the logistic regression fitted on the training split, accepting predictions with probability >= 0.8.
Evaluar_Modelo(grid_RLog, 'Regresión Logística Sin Regularización', X_test, y_test, umbral=0.8)

Cantidad de registros evaluados: 11
--------------------
Aciertos: 6
Errores : 3
Indeterm: 2
--------------------
Precision: 0.667 <- aciertos/(aciertos+errores)
Recall   : 0.750 <- aciertos/(aciertos+indeterm)
Accuracy : 0.727 <- (aciertos+indeterm)/(aciertos+errores+indeterm)
F1       : 0.706 <- 2*((precision*recall)/(precision+recall))


In [9]:
# Show the results table accumulated by the evaluation calls so far.
df_comparativa

Unnamed: 0,Modelo,Umbral,Aciertos,Errores,Indeterm,Precision,Recall,Accuracy,F1,df_Aciertos,df_Errores,df_Indeterm
0,Distancia coseno,0.8,7,2,2,0.777778,0.777778,0.818182,0.777778,oracion w Distancia Pre...,oracion w Distancia Predic S...,oracion w Distancia Predic ...
1,Regresión Logística Sin Regularización,0.8,6,3,2,0.666667,0.75,0.727273,0.705882,oracion w Probabilidad ...,oracion w Probabilidad Pre...,oracion w Probabilidad Pr...


In [37]:
# Inspect the misclassified test rows stored for the first recorded run (row 0 of df_comparativa).
df_comparativa.iloc[0].df_Errores

Unnamed: 0,oracion,w,Distancia,Predic,Similar
3,que oficin,donde,0.670934,que,que
10,que laboratori,donde,0.670934,que,que


In [38]:
# Inspect the misclassified test rows stored for the second recorded run (row 1 of df_comparativa).
df_comparativa.iloc[1].df_Errores

Unnamed: 0,oracion,w,Probabilidad,Predic
1,cuant sal,cuanto,0.995611,cuando


In [12]:
# Same model and grid as before, this time fitted on the complete dataset
# (X, y) rather than on the training split only.
RLog = LogisticRegression(penalty='none', max_iter=10000, tol=0.00001, multi_class='ovr')

parametros_RLog = {'C': [1]}

grid_RLog1 = GridSearchCV(estimator=RLog, scoring='accuracy', param_grid=parametros_RLog, cv=5, n_jobs=-1)

# The first column of X is the raw text, so only the remaining feature columns
# are used. The labels are passed as the 1-D 'w' column: handing over the
# (n, 1) DataFrame makes scikit-learn emit a DataConversionWarning.
grid_RLog1.fit(X[X.columns[1:]].values, y['w'].values)


AC_RLog_best = grid_RLog1.best_score_
print('Mejor accuracy: ' + str(round(AC_RLog_best,4)))

C_RLog_best = grid_RLog1.best_params_
print('Mejor C: ' + str(C_RLog_best))

Mejor accuracy: 0.5636
Mejor C: {'C': 1}


  return f(*args, **kwargs)


In [13]:
# NOTE(review): grid_RLog1 was fitted on the full X, which contains the X_test rows, so the
# metrics reported here are not an out-of-sample estimate. The run is also recorded under the
# same model name as the train-only run.
Evaluar_Modelo(grid_RLog1, 'Regresión Logística Sin Regularización', X_test, y_test, umbral=0.8)

Cantidad de registros evaluados: 11
--------------------
Aciertos: 11
Errores : 0
Indeterm: 0
--------------------
Precision: 1.000 <- aciertos/(aciertos+errores)
Recall   : 1.000 <- aciertos/(aciertos+indeterm)
Accuracy : 1.000 <- (aciertos+indeterm)/(aciertos+errores+indeterm)
F1       : 1.000 <- 2*((precision*recall)/(precision+recall))


In [15]:
"""import pickle

# Grabar el modelo elegido
nombre_archivo='modelo_w5.sav'
pickle.dump(grid_RLog1, open(nombre_archivo, 'wb'))

# Grabar el vectorizador
nombre_archivo='vectorizador_w5.sav'
pickle.dump(vectorizador, open(nombre_archivo, 'wb'))"""

In [14]:
import pickle

# Persist the chosen model. Context managers make sure every file is flushed
# and closed, even if pickling fails.
with open("modelo_w5.sav", "wb") as archivo:
    pickle.dump(grid_RLog1, archivo)

# Persist the fitted vectorizer
with open("vectorizador_w5.pkl", "wb") as archivo:
    pickle.dump(vectorizador, archivo)

# Persist the vectorized corpus DataFrame
with open(r'df_matriz_carreras.pkl', 'wb') as archivo:
    pickle.dump(df_textos, archivo)