In [15]:
import numpy as np
import pandas as pd
import sklearn
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import pickle

In [1]:
import numpy as np
import pandas as pd
import sklearn
import nltk
import re

# Spacy
import spacy
nlp=spacy.load('es_core_news_sm')

# Stemmer
from nltk.stem import SnowballStemmer
spanish_stemmer = SnowballStemmer('spanish')

# Levantamos la lista de StopWords
f = open('stopwords.txt', 'r', encoding='utf8')
stopwords = f.read().split('\n')
f.close()

In [4]:
def Correccion3D(cadena):
    s1=re.compile(r'(3|tres|tercera|tersera)(\s*)(d|dimension|dimensión|dimensiones|dimencion|dimención|dimenciones)$', re.IGNORECASE)
    s2=re.compile(r'(3|tres|tercera|tersera)(\s*)(d|dimension|dimensión|dimensiones|dimencion|dimención|dimenciones)([^A-ZÁÉÍÓÚa-záéíóú]+)', re.IGNORECASE)
    s=s1.sub(r'3D',cadena)
    s=s2.sub(r'3D\4',s)
    return s

def CorreccionVideoJuegos(cadena):
    s1=re.compile(r'(video|videos)(\s*)(juego|juegos)$', re.IGNORECASE)
    s=s1.sub(r'videojuego',cadena)
    return s

def PreProcesar(Corpus, POS=False, Lema=True, Stem=True):
    
    # Depurar términos posiblemente confusos en las Oraciones
    Corpus = [Correccion3D(oracion) for oracion in Corpus]
    Corpus = [CorreccionVideoJuegos(oracion) for oracion in Corpus]
    
    # Generar una lista de documentos de spacy para tratar el POS Tagging y la Lematización
    docs=[]
    for oracion in Corpus:
        docs.append(nlp(oracion.lower())) #La lematización funciona mejor en minúsculas
    
    # Crear una lista de oraciones, donde cada elemento es una lista de palabras.
    # Cada palabra está definida por una tupla (Texto, POSTag, Lema)
    # Se omiten los tokens que son identificados como signos de puntuación
    oraciones=[]
    for doc in docs:
        oracion=[]
        for token in doc:
            if token.pos_ != 'PUNCT':
                oracion.append((token.text, token.pos_, token.lemma_))
        oraciones.append(oracion)
    
    # Removemos StopWords (finándonos en el lema de cada palabra en vez de su texto!)
    # No conviene quitar las StopWords antes de lematizar pues son útiles para ese proceso...
    oraciones = [[palabra for palabra in oracion if palabra[2] not in stopwords] for oracion in oraciones]
    
    # Stemming
    if Stem==True:
        oraciones_aux=[]
        for oracion in oraciones:
            oracion_aux=[]
            for palabra in oracion:
                p_texto, p_pos, p_lema = palabra
                # Si Lema es True, se Stemmatiza el lema; si no, se Stemmatiza la palabra original
                if Lema==True:
                    oracion_aux.append((p_texto, p_pos, p_lema, spanish_stemmer.stem(p_lema)))
                else:
                    oracion_aux.append((p_texto, p_pos, p_lema, spanish_stemmer.stem(p_texto)))
            oraciones_aux.append(oracion_aux)
        
        oraciones = oraciones_aux
    
    # Finalmente: devolver nuevamente una lista de cadenas como la recibida, pero con el contenido
    # de cada cadena conformado según los parámetros:
    
    Corpus_Procesado = [] #Variable de salida
    
    for doc in oraciones:
        oracion = ''
        for palabra in doc:
            if Stem == True:
                # Devolver cadena de Stemming
                oracion = oracion + palabra[3]
            else:
                if Lema == True:
                    # Devolver cadena de Lemas
                    oracion = oracion + palabra[2]
                else:
                    # Devolver cadena de palabras originales
                    oracion = oracion + palabra[0]
            
            if POS == True:
                #Concatenar POS a cada palabra
                oracion = oracion + '_' + palabra[1].lower()
            
            oracion = oracion + ' '
        
        Corpus_Procesado.append(oracion)
        
    return Corpus_Procesado

def Corregir_Documentos(df_textos, columnas, POS=False, Lema=True, Stem=True):

    for col in columnas:
        df_textos[col] = PreProcesar(list(df_textos[col]), POS, Lema, Stem)
    
    # Sanear el DataFrame eliminando los duplicados y reindexándolo
    df_textos = df_textos.drop_duplicates().reset_index(drop=True)
    
    return df_textos

"""def Generar_Matriz_BOW(df_textos, columna, binario=False, ngram=(1,2)):
    
    # Vectorizar, usando CountVectorizer de sklearn.feature_extraction.text
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizador = CountVectorizer(binary=binario, ngram_range=ngram)
    X = vectorizador.fit_transform(df_textos[columna])
    
    # Generar el DataFrame a devolver
    df_X = pd.DataFrame(X.toarray(), columns=vectorizador.get_feature_names())
    df = df_textos.join(df_X)
    
    return vectorizador, df

def Generar_Matriz_Tfidf(df_textos, columna, ngram=(1,2)):
    
    # Vectorizar... Directamente usar aquí el TfidfVectorizer de sklearn en vez del CountVectorizer
    # (Lleva los mismos parámetros y directamente nos devuelve la matriz con los vectores Tf*Idf)
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizador = TfidfVectorizer(ngram_range=ngram)
    X = vectorizador.fit_transform(df_textos[columna])
    
    # Generar el DataFrame a devolver
    df_X = pd.DataFrame(X.toarray(), columns=vectorizador.get_feature_names())
    df = df_textos.join(df_X)
    
    return vectorizador, df"""

def Distancia_Coseno(u, v):

    distancia = 1.0 - (np.dot(u, v) / (np.sqrt(sum(np.square(u))) * np.sqrt(sum(np.square(v)))))
    return distancia

In [5]:
#1. Cargar y corregir el corpus
df_textos = pd.read_csv('data_carreras.csv', sep=';', encoding = "utf-8")
df_textos = Corregir_Documentos(df_textos,['oracion'],False,True,True)
df_textos

Unnamed: 0,oracion,carrera
0,diseñ multimedi,MM
1,multimedi,MM
2,multi medi,MM
3,mutimedi,MM
4,diseñ grafic,DG
...,...,...
72,simul,Svirt
73,diseñ videojueg,Svirt
74,creacion jueg,Svirt
75,creacion videojueg,Svirt


In [30]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizador1 = CountVectorizer(binary=False,ngram_range=(1,2))#
pp = vectorizador1.fit_transform(df_textos["oracion"])
# Grabar el vectorizador
import pickle
pickle.dump(vectorizador1.vocabulary_, open("vectorizador_carreras_x.pkl", "wb"))
vocabulario = vectorizador1.get_feature_names()
listo = pd.DataFrame(pp.toarray(), columns=vectorizador1.get_feature_names())
listo 


Unnamed: 0,3d,administr,administr contabl,administr empres,administr turist,agenci,agenci public,agricol,agro,agronegoci,...,sistem,sobr,sobr empres,turism,turist,vent,vent exterior,videojueg,virtual,virtual videojueg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
73,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
74,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
75,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [31]:
transformer = TfidfTransformer()  
listo = transformer.fit_transform(listo)

In [32]:
listo = pd.DataFrame(listo.toarray(), columns=vectorizador1.get_feature_names())
df_textos = df_textos.join(listo)

In [33]:
df_textos

Unnamed: 0,oracion,carrera,3d,administr,administr contabl,administr empres,administr turist,agenci,agenci public,agricol,...,sistem,sobr,sobr empres,turism,turist,vent,vent exterior,videojueg,virtual,virtual videojueg
0,diseñ multimedi,MM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
1,multimedi,MM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
2,multi medi,MM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
3,mutimedi,MM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
4,diseñ grafic,DG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,simul,Svirt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
73,diseñ videojueg,Svirt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.544144,0.000000,0.0
74,creacion jueg,Svirt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
75,creacion videojueg,Svirt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.510307,0.000000,0.0


In [38]:
#3. Separar el corpus en Train/Test
X = df_textos.drop(['carrera',],axis=1)#"oracion"
y = df_textos[['carrera']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=124)
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [39]:
X

Unnamed: 0,oracion,3d,administr,administr contabl,administr empres,administr turist,agenci,agenci public,agricol,agro,...,sistem,sobr,sobr empres,turism,turist,vent,vent exterior,videojueg,virtual,virtual videojueg
0,diseñ multimedi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
1,multimedi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
2,multi medi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
3,mutimedi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
4,diseñ grafic,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,simul,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
73,diseñ videojueg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.544144,0.000000,0.0
74,creacion jueg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
75,creacion videojueg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.510307,0.000000,0.0


In [42]:
#Distancia_Coseno del profe

df_comparativa = pd.DataFrame(columns=['Modelo','Umbral','Aciertos',
'Errores','Indeterm','Precision','Recall','Accuracy','F1','df_Aciertos','df_Errores','df_Indeterm'])
def Evaluar_Modelo_Distancia_Coseno(X_train, X_test, y_test, umbral=0.7):
    # Recorrer todo el Test Set prediciendo para cada oración de X_test y comparando con y_test
    global df_comparativa
    df_test = X_test[['oracion']].join(y_test)
    array_X = X_train[X_train.columns[1:]].values
    lista_distancia = [] #Distancia con el elemento más cercano (predicción)
    lista_predic    = [] #Predicción del elemento más cercano
    lista_similar   = [] #Texto del elemento más cercano
    for test_doc in df_test.iterrows():
        i,d = test_doc
        df_query = pd.DataFrame([d['oracion']],columns=['oracion'])
        Q = vectorizador1.transform(df_query['oracion'])
        distancia = [Distancia_Coseno(Q.A[0],fila) for fila in array_X]
        df_Resultado = pd.DataFrame(distancia, columns=['Distancia']).join(X_train[['oracion']]).join(y_train[['carrera']]).sort_values(by='Distancia').head(1)

        lista_distancia.append(df_Resultado.iloc[0]['Distancia'])
        lista_predic.append(df_Resultado.iloc[0]['carrera']) 
        lista_similar.append(df_Resultado.iloc[0]['oracion'])

    # Agregar columnas con resultados predichos al df_test
    df_test['Distancia'] = lista_distancia
    df_test['Predic']    = lista_predic
    df_test['Similar']   = lista_similar

    # Evaluar el resultado en df_test
    print('Cantidad de registros evaluados:', len(df_test))
    print('--------------------')
    aciertos = df_test[ ( df_test['carrera'] == df_test['Predic'] ) & 
                        ( df_test['Distancia'] <= umbral ) ]['carrera'].count()
    errores  = df_test[ ( df_test['carrera'] != df_test['Predic'] ) & 
                        ( df_test['Distancia'] <= umbral) ]['carrera'].count()
    indeterm = df_test[ ( df_test['Distancia'] > umbral) ]['Distancia'].count()
    print('Aciertos:', aciertos)
    print('Errores :', errores)
    print('Indeterm:', indeterm)
    print('--------------------')
    precision = aciertos/(aciertos+errores)
    recall    = aciertos/(aciertos+indeterm)
    accuracy  = (aciertos+indeterm)/(aciertos+errores+indeterm)
    F1        = 2*((precision*recall)/(precision+recall))
    print('Precision: {0:.3f} <- aciertos/(aciertos+errores)'.format(precision))
    print('Recall   : {0:.3f} <- aciertos/(aciertos+indeterm)'.format(recall))
    print('Accuracy : {0:.3f} <- (aciertos+indeterm)/(aciertos+errores+indeterm)'.format(accuracy))
    print('F1       : {0:.3f} <- 2*((precision*recall)/(precision+recall))'.format(F1))

    df_comparativa = df_comparativa.append({'Modelo': 'Distancia coseno',
                                            'Umbral': umbral,
                                            'Aciertos': aciertos,
                                            'Errores': errores,
                                            'Indeterm': indeterm,
                                            'Precision': precision,
                                            'Recall': recall,
                                            'Accuracy': accuracy,
                                            'F1': F1,
                                            'df_Aciertos': df_test[ ( df_test['carrera'] == df_test['Predic'] ) & ( df_test['Distancia'] <= umbral ) ],
                                            'df_Errores' : df_test[ ( df_test['carrera'] != df_test['Predic'] ) & ( df_test['Distancia'] <= umbral ) ],
                                            'df_Indeterm': df_test[ ( df_test['Distancia'] > umbral ) ]
                                            }, ignore_index=True)
    
    return

In [43]:
Evaluar_Modelo_Distancia_Coseno(X_train, X_test, y_test, umbral=0.8)

Cantidad de registros evaluados: 16
--------------------
Aciertos: 12
Errores : 0
Indeterm: 4
--------------------
Precision: 1.000 <- aciertos/(aciertos+errores)
Recall   : 0.750 <- aciertos/(aciertos+indeterm)
Accuracy : 1.000 <- (aciertos+indeterm)/(aciertos+errores+indeterm)
F1       : 0.857 <- 2*((precision*recall)/(precision+recall))


In [44]:
from sklearn.linear_model import LogisticRegression

# Crear el modelo con los parámetros que no cambiarán: observar que no le pasamos el valor de C de regulrización
RLog=LogisticRegression(penalty='none', max_iter=10000, tol=0.00001, multi_class='ovr')

# Armar el diccionario con el nombre y valores para los Hiperparámetros
parametros_RLog = {'C':[1]}

# Armar el GridSearchCV
grid_RLog = GridSearchCV(estimator = RLog,scoring = 'accuracy',param_grid = parametros_RLog, cv = 5,
                        n_jobs = -1)

# Entrenar con el Train Set
grid_RLog.fit(X_train[X_train.columns[1:]].values, y_train);

# Obtener el mejor AC 
AC_RLog_best=grid_RLog.best_score_
print('Mejor accuracy: ' + str(round(AC_RLog_best,4)))

C_RLog_best=grid_RLog.best_params_ 
print('Mejor C: ' + str(C_RLog_best))
###########################################################################################################

###########################################################################################################
def Evaluar_Modelo(modelo, nombre_modelo, X_test, y_test, umbral=0.7):
    # Recorrer todo el Test Set prediciendo para cada oración de X_test y comparando con y_test
    global df_comparativa
    df_test = X_test[['oracion']].join(y_test)
    lista_predic       = [] #Predicción
    lista_probabilidad = [] #Probabilidad de la predicción
    for test_doc in df_test.iterrows():
        i,d = test_doc
        df_query = pd.DataFrame([d['oracion']],columns=['oracion'])
        Q = vectorizador1.transform(df_query['oracion'])
        pronostico = modelo.predict([Q.A[0]])
        probabilidad = modelo.predict_proba([Q.A[0]])
        lista_predic.append(pronostico[0])
        lista_probabilidad.append(probabilidad[0].max())
    
    # Agregar columnas con resultados predichos al df_test
    df_test['Probabilidad'] = lista_probabilidad
    df_test['Predic'] = lista_predic
    
    # Evaluar el resultado en df_test
    print('Cantidad de registros evaluados:', len(df_test))
    print('--------------------')
    aciertos = df_test[ ( df_test['carrera'] == df_test['Predic'] ) & 
                        ( df_test['Probabilidad'] >= umbral ) ]['carrera'].count()
    errores  = df_test[ ( df_test['carrera'] != df_test['Predic'] ) & 
                        ( df_test['Probabilidad'] >= umbral) ]['carrera'].count()
    indeterm = df_test[ ( df_test['Probabilidad'] < umbral) ]['Probabilidad'].count()
    
    print('Aciertos:', aciertos)
    print('Errores :', errores)
    print('Indeterm:', indeterm)
    print('--------------------')
    precision = aciertos/(aciertos+errores)
    recall    = aciertos/(aciertos+indeterm)
    accuracy  = (aciertos+indeterm)/(aciertos+errores+indeterm)
    F1        = 2*((precision*recall)/(precision+recall))
    print('Precision: {0:.3f} <- aciertos/(aciertos+errores)'.format(precision))
    print('Recall   : {0:.3f} <- aciertos/(aciertos+indeterm)'.format(recall))
    print('Accuracy : {0:.3f} <- (aciertos+indeterm)/(aciertos+errores+indeterm)'.format(accuracy))
    print('F1       : {0:.3f} <- 2*((precision*recall)/(precision+recall))'.format(F1))

    # Registrar Resultados
    df_comparativa = df_comparativa.append({'Modelo': nombre_modelo,
                                            'Umbral': umbral,
                                            'Aciertos': aciertos,
                                            'Errores': errores,
                                            'Indeterm': indeterm,
                                            'Precision': precision,
                                            'Recall': recall,
                                            'Accuracy': accuracy,
                                            'F1': F1,
                                            'df_Aciertos': df_test[ ( df_test['carrera'] == df_test['Predic'] ) & ( df_test['Probabilidad'] >= umbral ) ],
                                            'df_Errores' : df_test[ ( df_test['carrera'] != df_test['Predic'] ) & ( df_test['Probabilidad'] >= umbral ) ],
                                            'df_Indeterm': df_test[ ( df_test['Probabilidad'] < umbral ) ]
                                            }, ignore_index=True)
    return

  return f(*args, **kwargs)


Mejor accuracy: 0.7705
Mejor C: {'C': 1}


In [45]:
Evaluar_Modelo(grid_RLog, 'Regresión Logística Sin Regularización', X_test, y_test, umbral=0.8)

Cantidad de registros evaluados: 16
--------------------
Aciertos: 11
Errores : 0
Indeterm: 5
--------------------
Precision: 1.000 <- aciertos/(aciertos+errores)
Recall   : 0.688 <- aciertos/(aciertos+indeterm)
Accuracy : 1.000 <- (aciertos+indeterm)/(aciertos+errores+indeterm)
F1       : 0.815 <- 2*((precision*recall)/(precision+recall))


In [46]:
df_comparativa
#medio sospechoso que de lo mismo 

Unnamed: 0,Modelo,Umbral,Aciertos,Errores,Indeterm,Precision,Recall,Accuracy,F1,df_Aciertos,df_Errores,df_Indeterm
0,Distancia coseno,0.8,12,0,4,1.0,0.75,1.0,0.857143,oracion carrera Distan...,"Empty DataFrame Columns: [oracion, carrera, Di...",oracion carrera Distancia Predic Si...
1,Regresión Logística Sin Regularización,0.8,11,0,5,1.0,0.6875,1.0,0.814815,oracion carrera Probab...,"Empty DataFrame Columns: [oracion, carrera, Pr...",oracion carrera Probabilidad P...


In [47]:
#elejiste Regresión Logística Sin Regularización porque no entendiste muy bien como implementar y exportear la Distancia coseno.

RLog=LogisticRegression(penalty='none', max_iter=10000, tol=0.00001, multi_class='ovr',)

parametros_RLog = {'C':[1]}

grid_RLog1 = GridSearchCV(estimator = RLog,scoring = 'accuracy',param_grid = parametros_RLog, cv = 5, n_jobs = -1)

grid_RLog1.fit(X[X.columns[1:]].values, y)


AC_RLog_best=grid_RLog1.best_score_
print('Mejor accuracy: ' + str(round(AC_RLog_best,4)))

C_RLog_best=grid_RLog1.best_params_ 
print('Mejor C: ' + str(C_RLog_best))



Mejor accuracy: 0.7783
Mejor C: {'C': 1}


  return f(*args, **kwargs)


In [48]:
Evaluar_Modelo(grid_RLog1, 'Regresión Logística Sin Regularización', X_test, y_test, umbral=0.8)

Cantidad de registros evaluados: 16
--------------------
Aciertos: 16
Errores : 0
Indeterm: 0
--------------------
Precision: 1.000 <- aciertos/(aciertos+errores)
Recall   : 1.000 <- aciertos/(aciertos+indeterm)
Accuracy : 1.000 <- (aciertos+indeterm)/(aciertos+errores+indeterm)
F1       : 1.000 <- 2*((precision*recall)/(precision+recall))


In [49]:

nombre_archivo='modelo_carreras_pepe.sav'
pickle.dump(grid_RLog1, open(nombre_archivo, 'wb'))