# Legibility

## Libraries

In [23]:
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
#Generate random integer values
from random import seed
from random import randint
import math
#Regular Expressions
import re 

In [24]:
# Quick sanity check of the NLTK setup: tokenize a sample sentence and count
# how many times each token occurs.
sent = 'This is an example sentence, and it is actually a beautiful one.'
fdist = FreqDist(word_tokenize(sent))
# Bare last expression so the notebook displays the frequency distribution.
fdist

FreqDist({'is': 2, 'This': 1, 'an': 1, 'example': 1, 'sentence': 1, ',': 1, 'and': 1, 'it': 1, 'actually': 1, 'a': 1, ...})

## Functions

In [25]:
#Functions:
#The following function receives a string indicating the path of a file
#and returns the data in the file.
def openFile(s):
    """Read a whole text file and return its content.

    Args:
        s: Path of the file to read.

    Returns:
        The complete content of the file as a single string.
    """
    # The context manager closes the handle even if read() raises.
    with open(s) as file:
        return file.read()
#The following function receives the data from the stop word's file, splits them
#and extends them.
def cleanStop(sw):
    """Build the stop-word list used to filter the documents.

    Args:
        sw: Raw content of the stop-word file (whitespace separated words).

    Returns:
        A list with every word of ``sw`` followed by a fixed set of
        punctuation marks and extra tokens that must also be discarded.
    """
    extra_tokens = ['.', ',',';',':', '/','"', '?', '!', '¡', '<', '>', 'El', '>El']
    return sw.split() + extra_tokens
#   The following function utilizes Beautiful Soup to extract every 'Resultados'
#   element from the XML file and returns them joined into a single string.
def bsMeth(s):
    """Extract every ``Resultados`` element of an XML file as text.

    Args:
        s: Path of the XML file to parse.

    Returns:
        A string with the markup of each ``Resultados`` element found in the
        document, each one followed by a newline.
    """
    with open(s) as fp:
        soup = BeautifulSoup(fp, 'xml')

    sections = []
    for met in soup.find_all('Resultados'):
        # Detach nested 'Resultados' elements so the parent's text does not
        # repeat them; they are still visited later by the outer loop because
        # the outer result list was built before any extraction.
        for nested in met.find_all('Resultados'):
            nested.extract()
        sections.append(str(met) + "\n")
    return "".join(sections)
#   The following function receives the methodologies and separates them. It
#   creates a dictionary-matrix with the following structure:
#   Methodologies-Number-Word
def sepMeth(justificacion):
    """Split each text into numbered words.

    Args:
        justificacion: Iterable of strings, one per document section.

    Returns:
        A dictionary keyed ``"Justificacion<k>"`` (k starts at 1) whose values
        map the 1-based word position to the word. The last word of every
        non-empty section is dropped.
    """
    separated = {}
    for index, text in enumerate(justificacion, start=1):
        numbered = dict(enumerate(text.split(), start=1))
        if numbered:
            # Discard the most recently inserted entry, i.e. the last word.
            numbered.popitem()
        separated["Justificacion" + str(index)] = numbered
    return separated

#   The following function receives the Stop Words and the Methodologies and
#   removes the stop words from the methodologies (It basically returns the
#   methodologies clean.).
def cleanMeth(sw, meth):
    """Normalize the words of every section and drop the stop words.

    Args:
        sw: Collection of stop words.
        meth: Dictionary section -> {position: word}.

    Returns:
        A dictionary with the same section keys where every word has been
        lower-cased and stripped of any character outside ``a-zñáéíóú``.
        Words that become empty or that are stop words are removed; the
        surviving words keep their original position key.
    """
    non_letters = re.compile('[^a-zñáéíóú]')
    cleaned = {}
    for section, numbered_words in meth.items():
        kept = {}
        for position, raw in numbered_words.items():
            token = non_letters.sub('', raw.lower())
            if token and token not in sw:
                kept[position] = token
        cleaned[section] = kept
    return cleaned
#   The following function receives the clean methodologies and returns a
#   dictionary with the structure Methodologies-WordVector
def listMeth(meth):
    """Turn each section's {position: word} dictionary into a word list.

    Args:
        meth: Dictionary section -> {position: word}.

    Returns:
        Dictionary section -> list of words, in the dictionary's own order.
    """
    return {section: list(words.values()) for section, words in meth.items()}
#   The following function gets the frequency of each word and divides them by
#   the amount of words in the document.
def relFreq(meth):
    """Compute the relative frequency of every word inside its section.

    Args:
        meth: Dictionary section -> list of words.

    Returns:
        Dictionary section -> {word: occurrences / number of words in the
        section}, with words ordered from most to least common.
    """
    relative = {}
    for section, words in meth.items():
        total = len(words)
        counts = nltk.FreqDist(words)
        relative[section] = {
            token: occurrences/total
            for token, occurrences in counts.most_common()
        }
    return relative
#   The following function opens the frequency for the most common words in
#   spanish and returns a dictionary with the following structure: Word-Freq
def mostCommon(path="frecuencia.txt"):
    """Load the table of the most common Spanish words.

    Every non-blank line is split on whitespace; the second field is taken
    as the word and the third field as its frequency.

    Args:
        path: Location of the frequency file. Defaults to ``"frecuencia.txt"``
            so existing calls without arguments keep working.

    Returns:
        Dictionary word -> frequency, where the frequency is kept as a string
        with its thousands separators (commas) removed.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    mostCommonVector = {}
    # The context manager guarantees the file is closed (it used to leak).
    with open(path, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines carry no data; skip them instead of crashing.
                continue
            mostCommonVector[fields[1]] = fields[2].replace(',', '')
    return mostCommonVector
#   The following function receives the dictionary with the sections and
#   returns a Dictionary-Matrix with the following form:
#   Section#-Word:LogFrequency
def logFreq(meth, common):
    """Attach the logarithm of the global frequency to every word.

    Args:
        meth: Dictionary section -> iterable of words.
        common: Dictionary word -> frequency (anything ``float`` accepts).

    Returns:
        Dictionary section -> {word: natural log of its frequency in
        ``common``}. Words that are missing from ``common`` get ``0.0``.
    """
    logged = {}
    for section, words in meth.items():
        logged[section] = {
            token: np.log(float(common[token])) if token in common else 0.0
            for token in words
        }
    return logged
#   The following function receives a single dictionary dictionary of specific
#   sections. It returns a vocabulary based on all the words that appear in the
#   the complete set of documents.
def obtainVocSingle(D):
    """Build the vocabulary of a set of sections.

    Args:
        D: Dictionary section -> iterable of (hashable) words.

    Returns:
        List with every distinct word, in order of first appearance.
    """
    # A dict keeps insertion order and gives constant-time membership, which
    # avoids the quadratic cost of searching a growing list for every word.
    seen = {}
    for svec in D.values():
        for word in svec:
            seen.setdefault(word, None)
    return list(seen)
#   The following function receives two dictionaries of specific sections. It
#   returns a vocabulary based on all the words that appear in the complete set of
#   documents.
def obtainVocDouble(d1, d2):
    """Build the joint vocabulary of two sets of sections.

    Args:
        d1: Dictionary section -> iterable of (hashable) words.
        d2: Dictionary section -> iterable of (hashable) words.

    Returns:
        List with every distinct word of ``d1`` followed by the new words of
        ``d2``, in order of first appearance.
    """
    # Insertion-ordered dict used as an ordered set: constant-time membership
    # instead of a linear scan of a list for every word.
    seen = {}
    for sections in (d1, d2):
        for svec in sections.values():
            for word in svec:
                seen.setdefault(word, None)
    return list(seen)

#   The following function creates a vocabulary for all the words that appear in
#   justifications.
def obtainVoc(TSU, Lic, Maestria, Doctorado):
    """Build the joint vocabulary of the four academic levels.

    Args:
        TSU, Lic, Maestria, Doctorado: Dictionaries section -> iterable of
            (hashable) words, one dictionary per level.

    Returns:
        List with every distinct word, in order of first appearance when the
        levels are visited as TSU, Lic, Maestria, Doctorado.
    """
    # One loop over the levels replaces four copies of the same code, and the
    # insertion-ordered dict replaces the quadratic ``word not in list`` test.
    seen = {}
    for level in (TSU, Lic, Maestria, Doctorado):
        for mvec in level.values():
            for word in mvec:
                seen.setdefault(word, None)
    return list(seen)
#   The following function creates a vocabulary for all the words that appear in
#   justifications.
def obtainVoc3(TSU, Lic, Maestria):
    """Build the joint vocabulary of three academic levels.

    Args:
        TSU, Lic, Maestria: Dictionaries section -> iterable of (hashable)
            words, one dictionary per level.

    Returns:
        List with every distinct word, in order of first appearance when the
        levels are visited as TSU, Lic, Maestria.
    """
    # Insertion-ordered dict used as an ordered set to avoid the quadratic
    # list-membership test of the copy-pasted loops.
    seen = {}
    for level in (TSU, Lic, Maestria):
        for mvec in level.values():
            for word in mvec:
                seen.setdefault(word, None)
    return list(seen)
#   The following function utilizes the vocabulary to dimensionate the relative
#   frequency.
def dimRelFreq(matrix, voc):
    """Project every section onto the full vocabulary.

    Args:
        matrix: Dictionary section -> {word: value}.
        voc: Iterable with the vocabulary words, in the desired order.

    Returns:
        Dictionary section -> {word: value} containing one entry per
        vocabulary word; words absent from the section get ``0.0``.
    """
    return {
        section: {term: values.get(term, 0.0) for term in voc}
        for section, values in matrix.items()
    }

#   The following function subtracts two dictionary matrices row by row: each
#   vector of B is subtracted from the corresponding vector of A. The result has
#   the following form: Justification-ResultantVector
def substract(A, B):
    """Subtract two dictionary matrices row by row.

    Rows and entries are paired by position, not by key.

    Args:
        A: Dictionary row -> {word: value} (minuend).
        B: Dictionary row -> {word: value} (subtrahend).

    Returns:
        Dictionary keyed by the row names of ``A`` whose values are lists
        with the entry-wise differences ``A - B``.
    """
    differences = {}
    for (row_name, left), right in zip(A.items(), B.values()):
        differences[row_name] = [x - y for x, y in zip(left.values(), right.values())]
    return differences
#   The following function substracts entrance by entrance two dictionary
#   vectors. It returns the result of the subtraction but in a vector, no longer
#   a dictionary.
def substractVec(A, B):
    """Subtract two dictionary vectors entry by entry (paired by position).

    Args:
        A: Dictionary word -> value (minuend).
        B: Dictionary word -> value (subtrahend).

    Returns:
        A plain list with the differences ``A - B``.
    """
    return [left - right for left, right in zip(A.values(), B.values())]
#   The following function receives two dictionary matrixes, and concatenates
#   their respective vectors row by row
def concatenate(A, B):
    """Concatenate the rows of two dictionary matrices, pair by pair.

    Args:
        A: Dictionary row -> list (left part of each resulting row).
        B: Dictionary row -> list (right part of each resulting row).

    Returns:
        A list of rows, each one being ``A``'s vector followed by the vector
        of ``B`` found at the same position.
    """
    return [left + right for left, right in zip(A.values(), B.values())]
#   The following function receives two list-vectors and concatenates them in
#   the order A+B. It returns the concatenated vector.
def concatenateVec(A, B):
    """Return the list-vector ``A`` followed by the list-vector ``B``."""
    return A + B
#   The following function receives two dictionary matrixes and concatenates
#   the values of their vectors, row by row. It returns that concatenation.
def concatenateDictionaries(A, B):
    """Concatenate the values of two dictionary matrices, row by row.

    Args:
        A: Dictionary row -> {word: value}.
        B: Dictionary row -> {word: value}.

    Returns:
        A list of rows; each row holds the values of ``A``'s vector followed
        by the values of the vector of ``B`` found at the same position.
    """
    return [
        list(left.values()) + list(right.values())
        for left, right in zip(A.values(), B.values())
    ]
#   The following function receives a dictionary-dictionary and returns a
#   a dictionary-list.
def enlist(A):
    """Convert a dictionary of dictionaries into a dictionary of lists.

    Args:
        A: Dictionary row -> {word: value}.

    Returns:
        Dictionary row -> list with the values of that row.
    """
    return {row_name: list(row.values()) for row_name, row in A.items()}
#   The following function receives a dictionary-vector and returns a
#   list.
def enlistVec(A):
    """Return the values of the dictionary-vector ``A`` as a plain list."""
    return list(A.values())

#   The following function takes two matrixes A, B and returns a training matrix
#   and a classification matrix. The training matrix contains all the related
#   vectors in a single matrix.
def training(A, B):
    """Stack two matrices into one labelled training set.

    Args:
        A: Iterable of vectors that belong to class ``-1.0``.
        B: Iterable of vectors that belong to class ``1.0``.

    Returns:
        Tuple ``(trainingMatrix, classMatrix)``: the vectors of ``A``
        followed by those of ``B``, and the list with their labels.
    """
    negatives = [vector for vector in A]
    positives = [vector for vector in B]
    labels = [-1.0] * len(negatives) + [1.0] * len(positives)
    return (negatives + positives, labels)
#   The following function stacks every vector of the six matrixes A-F into a
#   single training matrix. Vectors from A, C and E are labelled -1.0 and vectors
#   from B, D and F are labelled 1.0 in the accompanying classification vector.
def trainingR(A, B, C, D, E, F):
    """Stack six matrices into one labelled training set.

    Args:
        A, C, E: Iterables of vectors that belong to class ``-1.0``.
        B, D, F: Iterables of vectors that belong to class ``1.0``.

    Returns:
        Tuple ``(trainingMatrix, classMatrix)`` where the vectors appear in
        the order A, C, E, B, D, F and ``classMatrix`` holds their labels.
    """
    samples = []
    labels = []
    for label, groups in ((-1.0, (A, C, E)), (1.0, (B, D, F))):
        for group in groups:
            for vector in group:
                samples.append(vector)
                labels.append(label)
    return (samples, labels)
#   The following function receives two dictionaries and returns the vectors
#   in the dictionary concatenated with the other matrix and a classification
#   matrix.
def trainingL(A, B):
    """Stack the vectors of two dictionaries into a labelled training set.

    Args:
        A: Dictionary name -> vector, class ``-1.0``.
        B: Dictionary name -> vector, class ``1.0``.

    Returns:
        Tuple ``(tMatrix, cMatrix)``: the vectors of ``A`` followed by those
        of ``B``, and the list with their labels.
    """
    negatives = list(A.values())
    positives = list(B.values())
    labels = [-1.0] * len(negatives) + [1.0] * len(positives)
    return (negatives + positives, labels)
def trainingLR(A, B, C, D, E, F):
    """Stack the vectors of six dictionaries into a labelled training set.

    Args:
        A, C, E: Dictionaries name -> vector, class ``-1.0``.
        B, D, F: Dictionaries name -> vector, class ``1.0``.

    Returns:
        Tuple ``(trainingMatrix, classMatrix)`` where the vectors appear in
        the order A, C, E, B, D, F and ``classMatrix`` holds their labels.
    """
    samples = []
    labels = []
    for label, groups in ((-1.0, (A, C, E)), (1.0, (B, D, F))):
        for group in groups:
            for vector in group.values():
                samples.append(vector)
                labels.append(label)
    return (samples, labels)
###############################################
#           Science Contribution              #
###############################################

#   The following function receives a dimensionalized vector from one of the
#   possible classes of justifications and returns a test vector.
def obtainTest(A, B):
    """Pick one pair of vectors as the test sample and remove it from A and B.

    ``A`` and ``B`` are paired by position. The random generator is re-seeded
    with ``1`` on every call, so the selection is reproducible.

    Args:
        A: Dictionary name -> vector (relative-frequency side). Mutated: the
            selected entry is removed.
        B: Dictionary name -> vector (log-frequency side). Mutated: the
            selected entry is removed.

    Returns:
        Tuple ``(testRF, testLF)`` with the selected vectors of ``A`` and ``B``.

    Raises:
        ValueError: If there is no pair to choose from.
    """
    pairs = list(zip(A.items(), B.items()))
    if not pairs:
        raise ValueError("obtainTest needs at least one vector in A and B")
    # seed random number generator
    seed(1)
    # randint includes both ends, so the upper bound must be the last valid
    # position; len(A) could select a position that does not exist.
    r = randint(0, len(pairs) - 1)
    (keyA, testRF), (keyB, testLF) = pairs[r]
    # Each dictionary drops the entry that was actually returned from it.
    A.pop(keyA)
    B.pop(keyB)
    return (testRF, testLF)

#   The following function receives a Matrix and returns a random vector in it.
#   This vector will be use as a representative vector of the class in order
#   to classify a text file as easier or more difficult than it.
def random(A, B):
    """Pick one pair of vectors as class representative and remove it.

    ``A`` and ``B`` are paired by position. The random generator is re-seeded
    with ``1`` on every call, so the selection is reproducible.

    Args:
        A: Dictionary name -> vector (relative-frequency side). Mutated: the
            selected entry is removed.
        B: Dictionary name -> vector (log-frequency side). Mutated: the
            selected entry is removed.

    Returns:
        Tuple ``(randRF, randLF)`` with the selected vectors of ``A`` and ``B``.

    Raises:
        ValueError: If there is no pair to choose from.
    """
    pairs = list(zip(A.items(), B.items()))
    if not pairs:
        raise ValueError("random needs at least one vector in A and B")
    seed(1)
    # randint includes both ends, so the upper bound must be the last valid
    # position; len(A) could select a position that does not exist.
    r = randint(0, len(pairs) - 1)
    (keyA, randRF), (keyB, randLF) = pairs[r]
    # Each dictionary drops the entry that was actually returned from it.
    A.pop(keyA)
    B.pop(keyB)
    return(randRF, randLF)
#   The following function receives a set of vectors and returns a random vector
#   from the collection.
def randomR(A):
    """Choose a random vector from an indexable collection.

    The generator is re-seeded without an argument on every call, so the
    choice is not reproducible between runs.

    Args:
        A: Indexable collection of vectors (must not be empty).

    Returns:
        Tuple ``(vector, position)`` with the chosen vector and its index.
    """
    seed()
    print("Length of vector for random choice:", len(A))
    position = randint(0, len(A)-1)
    return A[position], position
#   The following function receives a set of matrixes and returns a list of random
#   vectors, one for each matrix.
def obtainRandomVectorsR(A, B, C, D):
    """Pick one random vector from each of the first three matrices.

    Args:
        A, B, C: Indexable collections of vectors to sample from.
        D: Accepted but never read by this function.

    Returns:
        List with the random vectors chosen from ``A``, ``B`` and ``C``, in
        that order. The chosen positions are not returned.
    """
    chosen = []
    for matrix in (A, B, C):
        vector, _position = randomR(matrix)
        chosen.append(vector)
    return chosen
#   The following function receives a set of matrixes and returns a list of random
#   vectors, one for each matrix.
def obtainRandomVectors3(A, B, C):
    """Pick one random vector from each of three matrices.

    Args:
        A, B, C: Indexable collections of vectors to sample from.

    Returns:
        Tuple ``(vecList, vecPosition)``: the chosen vectors and the index of
        each one inside its matrix, both in the order A, B, C.
    """
    chosen = []
    positions = []
    for matrix in (A, B, C):
        vector, position = randomR(matrix)
        chosen.append(vector)
        positions.append(position)
    return chosen, positions
#   The following function receives eight matrixes and returns two lists: the
#   first list contains all the random RF vectors and the second list contains
#   LF vectors.
def obtainRandomVectors(A, B, C, D, E, F, G, H):
    """Pick one random RF/LF pair from each of the four levels.

    The matrices come in (RF, LF) pairs: (A, B), (C, D), (E, F) and (G, H).
    Every pair is sampled with ``random``, which removes the chosen entry
    from both dictionaries.

    Returns:
        Tuple ``(vecListRF, vecListLF)`` with two lists of four vectors each.
    """
    picks = [random(A, B), random(C, D), random(E, F), random(G, H)]
    rf_vectors = [rf for rf, _lf in picks]
    lf_vectors = [lf for _rf, lf in picks]
    return (rf_vectors, lf_vectors)
#   The following function receives the lists of reference RF and LF vectors,
#   the two test vectors related to RF and LF, the general training matrix and
#   its classifying vector. The function prints the level reached by the text.
def obtainGrade(lRF, lLF, testRF, testLF, tMatrix, CV):
    """Train an SVM and print the level reached by a test text.

    The test vectors are compared with each reference pair in order. The
    walk stops at the first reference for which the classifier answers
    ``-1.0`` (or unconditionally at the fourth reference) and the matching
    message is printed. Nothing is printed if the references run out first.

    Args:
        lRF: List of reference RF dictionary-vectors, one per level.
        lLF: List of reference LF dictionary-vectors, one per level.
        testRF: RF dictionary-vector of the text under test.
        testLF: LF dictionary-vector of the text under test.
        tMatrix: Training matrix for the SVM.
        CV: Class labels of ``tMatrix``.

    Returns:
        None.
    """
    #Train the SVM
    clf = svm.SVC()
    clf.fit(tMatrix, CV)

    messages = (
        "Your text is as good as TSU.",
        "Your text is as good as Lic.",
        "Your text is as good as Maestria.",
        "Your text is as good as Doctorado.",
    )
    for level, (vecRF, vecLF) in enumerate(zip(lRF, lLF)):
        diffRF = substractVec(testRF, vecRF)
        diffLF = substractVec(testLF, vecLF)
        sample = concatenateVec(diffRF, diffLF)
        prediction = clf.predict([sample])
        if prediction[0] == -1.0 or level >= 3:
            # Levels past the third reference all report the top message.
            print(messages[min(level, 3)])
            return
#   The following function receives a list with the respective centroids, a
#   vector test in order to try and the training matrix with its respective
#   classes.
def obtainGradeR(centroidList, testRF, testLF, tMatrix, CV):
    """Train an SVM and print the level reached by a test text, using centroids.

    The concatenated test vector is compared with each centroid in order.
    The walk stops at the first centroid for which the classifier answers
    ``-1.0`` (or unconditionally at the fourth centroid) and the matching
    message is printed. Nothing is printed if the centroids run out first.

    Args:
        centroidList: List of centroid vectors, one per level.
        testRF: RF dictionary-vector of the text under test.
        testLF: LF dictionary-vector of the text under test.
        tMatrix: Training matrix for the SVM.
        CV: Class labels of ``tMatrix``.

    Returns:
        None.
    """
    clf = svm.SVC()
    clf.fit(tMatrix, CV)

    sample = np.array(concatenateVec(enlistVec(testRF), enlistVec(testLF)))
    # The original code predicts on the raw test vector once before the loop
    # and ignores the answer; the call is kept so behavior is unchanged.
    clf.predict([sample])

    messages = (
        "Your text is as good as TSU.",
        "Your text is as good as Lic.",
        "Your text is as good as Maestria.",
        "Your text is as good as Doctorado.",
    )
    for level, reference in enumerate(centroidList):
        difference = sample - np.array(reference)
        prediction = clf.predict([difference])
        if prediction[0] == -1.0 or level >= 3:
            # Levels past the third centroid all report the top message.
            print(messages[min(level, 3)])
            return
#   The following function receives a test vector and a list of random vectors.
#   It returns the grade of the vector.
def obtainGradeRandom(vector, vector_list, clf):
    """Grade a test vector against a list of reference vectors.

    The test vector is compared with each reference in order: the classifier
    receives ``vector - reference`` and the grade is the position of the
    first reference for which it answers ``-1.0``. Grades are capped at 2.

    Args:
        vector: Test vector (must support ``-`` with the references).
        vector_list: Reference vectors, one per level, lowest level first.
        clf: Trained classifier exposing ``predict``.

    Returns:
        ``0``, ``1`` or ``2``. When the classifier never answers ``-1.0``
        the number of references surpassed is returned, capped at 2.
    """
    grade = 0
    for vec in vector_list:
        # Always subtract from the ORIGINAL test vector. Re-assigning
        # ``vector`` here would accumulate the subtractions of the previous
        # references and feed a wrong sample to the classifier.
        difference = vector - vec
        result = clf.predict([difference])
        if result[0] == -1.0 or grade >= 3:
            return min(grade, 2)
        grade += 1
    # Every reference was surpassed: report the highest grade reached instead
    # of silently returning None.
    return min(grade, 2)
#   The following function receives a testMatrix, its vector classification and a
#   list of random vector. It returns the accuracy of the evaluator.
def randomEvaluator(test_matrix, classification, vector_list, clf):
    """Measure how often the random-vector grader matches the true grade.

    Args:
        test_matrix: Indexable collection of test vectors (printed in full).
        classification: True grade of every test vector.
        vector_list: Reference vectors passed to ``obtainGradeRandom``.
        clf: Trained classifier passed to ``obtainGradeRandom``.

    Returns:
        Fraction of test vectors whose computed grade equals
        ``int(classification[i])``.
    """
    print("Test Matrix: ")
    print(test_matrix)
    print("Length of Random: ", len(test_matrix))
    hits = 0
    for index, sample in enumerate(test_matrix):
        predicted = obtainGradeRandom(sample, vector_list, clf)
        if predicted == int(classification[index]):
            hits += 1
    return hits/len(test_matrix)
# #   The following function receives a testMatrix, its vector classification and a
# #   list of random vector and the plain justifications. It returns the justification
# #   and its respective classification wether it was wrong or not.
def getJustificationEvaluation(test_matrix, classification, vector_list, clf, justificacionTSU, justificacionLic, justificacionMaestria):
    """Grade every test vector and print its justification with the verdict.

    The 1-based position of a test vector decides which collection its text
    is read from: positions 1-11 use ``justificacionTSU``, positions 12-23
    use ``justificacionLic`` and the rest use ``justificacionMaestria``. The
    same position is used as the lookup key in every collection.

    Args:
        test_matrix: Indexable collection of test vectors (printed in full).
        classification: True grade of every test vector.
        vector_list: Reference vectors passed to ``obtainGradeRandom``.
        clf: Trained classifier passed to ``obtainGradeRandom``.
        justificacionTSU, justificacionLic, justificacionMaestria:
            Collections with the plain justifications of each level.

    Returns:
        Fraction of test vectors whose computed grade equals
        ``int(classification[i])``.
    """
    print("Test Matrix: ")
    print(test_matrix)
    print("Length of Random: ", len(test_matrix))
    hits = 0
    for index in range(len(test_matrix)):
        position = index + 1
        predicted = obtainGradeRandom(test_matrix[index], vector_list, clf)
        if position <= 11:
            header, texts = "Justificacion de TSU:", justificacionTSU
        elif position <= 23:
            header, texts = "Justificaciones de Licenciatura", justificacionLic
        else:
            header, texts = "Justificaciones de Maestria: ", justificacionMaestria
        print(header)
        matched = predicted == int(classification[index])
        if matched:
            print("Justificacion Correcta: ")
        else:
            print("Justificacion Incorrecta: ")
        print(texts[position])
        if matched:
            hits += 1

    return hits/len(test_matrix)
#   The following function receives all the justifications for all the different
#   scholar levels and a list for the positions of the random vectors.
#   It returns the random selected vectors and prints them in screen.
def getRandomJustifications(list, A, B, C):
    """Print the justification chosen at random for each level.

    Args:
        list: Positions of the random vectors, in the order TSU, Lic,
            Maestria. (The name shadows the builtin but is kept because it
            is part of the public signature.)
        A: Justifications of TSU.
        B: Justifications of Lic.
        C: Justifications of Maestria.

    Returns:
        None; the three justifications are printed.
    """
    # NOTE(review): the +6 offset between a vector position and the key of its
    # justification comes from the original code; confirm against the data.
    print("TSU random justification:")
    print(A[list[0]+6])
    print("Lic. random justification: ")
    # Each level reads its own collection (all three used to read from A).
    print(B[list[1]+6])
    print("Maestria random justification: ")
    print(C[list[2]+6])


#   The following function receives two matrixes, one related to the Relative
#   Frequency vectors and one related to the Logarithmic Frequency. It concatenates
#   them, tranforms them into a vector instead of a dictionary and returns the
#   centroid of them all.
def centroid(A, B):
    """Compute the centroid of the concatenated RF/LF vectors.

    Rows of ``A`` and ``B`` are paired by position; each pair is flattened
    into one vector (values of ``A``'s row followed by values of ``B``'s row)
    and the centroid is the mean of those vectors.

    Args:
        A: Dictionary row -> {word: value} (relative frequency).
        B: Dictionary row -> {word: value} (logarithmic frequency).

    Returns:
        1-D numpy array with the mean of the concatenated vectors.

    Raises:
        ValueError: If there are no rows to average.
    """
    rows = [
        list(left.values()) + list(right.values())
        for left, right in zip(A.values(), B.values())
    ]
    if not rows:
        raise ValueError("centroid needs at least one vector")
    matrix = np.array(rows, dtype=float)
    # The accumulator is sized from the data (it used to be fixed at 33642
    # entries) and the sum is divided by the NUMBER of vectors; dividing by
    # the vector length does not yield a centroid.
    total = np.zeros(matrix.shape[1])
    for vec in matrix:
        total = np.add(total, vec)
    return total / len(matrix)

def centroidR(A, B):
    """Compute the centroid of the concatenated RF/LF vectors.

    Rows of ``A`` and ``B`` are paired by position; each pair is flattened
    into one vector (values of ``A``'s row followed by values of ``B``'s row)
    and the centroid is the mean of those vectors.

    Args:
        A: Dictionary row -> {word: value} (relative frequency).
        B: Dictionary row -> {word: value} (logarithmic frequency).

    Returns:
        1-D numpy array with the mean of the concatenated vectors.

    Raises:
        ValueError: If there are no rows to average.
    """
    rows = [
        list(left.values()) + list(right.values())
        for left, right in zip(A.values(), B.values())
    ]
    if not rows:
        raise ValueError("centroidR needs at least one vector")
    matrix = np.array(rows, dtype=float)
    # Column sums divided by the NUMBER of vectors. The result is sized from
    # the data instead of the fixed 33642 entries used before, and it is no
    # longer divided by the vector length.
    return matrix.sum(axis=0) / len(matrix)
#   The following function receives a matrix and returns its centroid.
def centroidRR(A):
    """Return the centroid (mean vector) of the matrix ``A``.

    Args:
        A: Non-empty indexable collection of equally sized vectors.

    Returns:
        numpy array with the sum of the vectors divided by their count.
    """
    accumulator = np.zeros(len(A[0]))
    total = sum(A, accumulator)
    return total/(len(A))
#   The following function receives the test matrix, the classification vector,
#   the centroid list and the svm classifier. It returns the accuracy of the centroid
#   evaluator.
def centroidEvaluator(test_matrix, classification, centroids, clf):
    """Measure how often the centroid-based grader matches the true grade.

    Args:
        test_matrix: Indexable collection of test vectors.
        classification: True grade of every test vector.
        centroids: Centroid vectors passed to ``obtainGradeRandom``.
        clf: Trained classifier passed to ``obtainGradeRandom``.

    Returns:
        Fraction of test vectors whose computed grade equals
        ``int(classification[i])``.
    """
    hits = 0
    for index, sample in enumerate(test_matrix):
        predicted = obtainGradeRandom(sample, centroids, clf)
        if predicted == int(classification[index]):
            hits += 1
    return hits/len(test_matrix)
#   The following function receives four matrixes and returns a list of centroids.
#   One for each matrix.
def obtainCentroids(A, B, C, D):
    """Return the centroids of four matrices as a list, in the order A-D."""
    return [centroidRR(matrix) for matrix in (A, B, C, D)]
#   The following function receives three matrixes and returns a list of centroids.
#   One for each matrix.
def obtainCentroids3(A, B, C):
    """Return the centroids of three matrices as a list, in the order A-C."""
    return [centroidRR(matrix) for matrix in (A, B, C)]
#   The following function removes the len zero vectors from the dictionary and
#   returns a dictionary.
def removeZero(GM):
    """Delete the empty vectors of a dictionary, in place.

    Args:
        GM: Dictionary name -> sized vector. It is modified in place.

    Returns:
        The same dictionary object, without the zero-length entries.
    """
    # Collect the keys first: a dictionary cannot change size while iterated.
    empty_keys = [name for name, words in GM.items() if len(words) == 0]
    for name in empty_keys:
        del GM[name]
    return GM

#   The following function receives a matrix and eliminates the zero length vectors
#   from it.
def cleanVector(A):
    """Remove the zero-length vectors of ``A`` by delegating to ``removeZero``.

    ``removeZero`` deletes the entries in place and returns the same
    dictionary, so ``A`` itself is modified.
    """
    return removeZero(A)

#   The following function receives the four grade matrixes and eliminates the
#   zero length vectors from it.
def cleanVectors(A, B, C, D):
    """Remove the zero-length vectors of four grade matrices.

    Each dictionary is cleaned in place by ``removeZero``.

    Returns:
        Tuple with the four cleaned dictionaries, in the order A, B, C, D.
    """
    return tuple(removeZero(matrix) for matrix in (A, B, C, D))
#   The following function gets the size of the smallest vector in the dictionary
#   vector-word matrix.
def smallest(GM):
    """Return the length of the shortest vector in ``GM``.

    The search starts from the sentinel ``1000000``, which is therefore the
    result for an empty dictionary (and the upper bound of the answer).

    Args:
        GM: Dictionary name -> sized vector.
    """
    lengths = [len(words) for words in GM.values()]
    return min([1000000] + lengths)
#   The following function gets the size of the biggest vector in the dictionary
#   vector-word matrix.
def biggest(GM):
    """Return the length of the longest vector in ``GM`` (0 when empty).

    Args:
        GM: Dictionary name -> sized vector.
    """
    lengths = [len(words) for words in GM.values()]
    return max([0] + lengths)
#   The following function receives a grade matrix and returns the average size
#   of the vectors in it.
def average(GM):
    """Return the mean length of the vectors in ``GM``.

    Args:
        GM: Non-empty dictionary name -> sized vector.

    Raises:
        ZeroDivisionError: If ``GM`` is empty.
    """
    total_length = sum(len(words) for words in GM.values())
    return (total_length/len(GM))
#   The following function receives a grade matrix and returns a dictionary with
#   the size of the smallest vector, the biggest vector and the average sizes
#   of the vectors.
def returnSizes(GM):
    """Summarize the vector sizes of a grade matrix.

    Args:
        GM: Dictionary name -> sized vector.

    Returns:
        Dictionary with the keys ``'Smallest'``, ``'Biggest'`` and
        ``'Average'`` computed by the functions of the same name.
    """
    return {
        'Smallest': smallest(GM),
        'Biggest': biggest(GM),
        'Average': average(GM),
    }
#   The following function receives two grade matrixes dimensionalized and concatenated
#   and returns the substraction of both of them.
def subInc(A, B):
    """Subtract every vector of ``B`` from every vector of ``A``.

    Args:
        A: Matrix (sequence of vectors) used as minuend.
        B: Matrix (sequence of vectors) used as subtrahend.

    Returns:
        List with ``len(A) * len(B)`` arrays: for each row of ``A``, its
        difference with each row of ``B``, in order.
    """
    minuends = np.array(A)
    subtrahends = np.array(B)
    return [row_a - row_b for row_a in minuends for row_b in subtrahends]
#   The following function receives two matrices dimensionalized and concatenated
#   and returns their conmutative combination in two differente matrices. The plusOne
#   and the minusOne.
def comData(A, B):
    """Build both orders of the pairwise differences of two matrices.

    Returns:
        Tuple ``(plusOne, minusOne)`` where ``plusOne`` is ``subInc(B, A)``
        and ``minusOne`` is ``subInc(A, B)``.
    """
    return subInc(B, A), subInc(A, B)
#   The following function receives two matrixes dimensionalized and concatenated with
#   vectors with the form: local-global. It returns a matrix which concatenates each vector
#   of the first matrix with eache vector of the second one, thus returning a matrix with 
#   vectors with form: localA-globalA-localB-globalB.
def conMatrixes(A, B):
    """Concatenate every vector of ``A`` with every vector of ``B``.

    Args:
        A: Matrix (sequence of vectors) providing the left part.
        B: Matrix (sequence of vectors) providing the right part.

    Returns:
        List with ``len(A) * len(B)`` arrays: for each row of ``A``, that row
        followed by each row of ``B``, in order.
    """
    left_rows = np.array(A)
    right_rows = np.array(B)
    return [np.hstack((left, right)) for left in left_rows for right in right_rows]
#   The following function receives two matrices dimensionalized and concatenated
#   with the following form of vectors: (Local and Global). It concatenates the matrixes
#   in both orders PlusOne and MinusOne and returns matrixes with vectors with the 
#   following form: localA-globalA-localB-globalB and localB-globalB-localA-globalA.
def comConData(A, B):
    """Build both orders of the pairwise concatenations of two matrices.

    Returns:
        Tuple ``(plusOneMatrix, minusOneMatrix)`` where ``plusOneMatrix`` is
        ``conMatrixes(B, A)`` and ``minusOneMatrix`` is ``conMatrixes(A, B)``.
    """
    return conMatrixes(B, A), conMatrixes(A, B)
#   The following function receives a matrix and returns the 80 percent of the values
#   in one matrix and the other twenty in another matrix.
def eightyTwenty(A):
    """Split a matrix into an 80% part and a 20% part.

    The first ``ceil(len(A) / 5)`` rows form the twenty-percent part and the
    remaining rows form the eighty-percent part.

    Args:
        A: Matrix (array-like of rows).

    Returns:
        Tuple ``(eighty, twenty)`` of numpy arrays.
    """
    A = np.asarray(A)
    twenty = math.ceil(len(A) / 5)
    # np.append / np.delete return new arrays and never modify their input,
    # so the previous loop left A untouched and the second matrix empty.
    # Slicing performs the intended split directly.
    return A[twenty:], A[:twenty]
#   The following function receives the plusOne and minusOne version for all the grades
#   and returns the training matrix along with its classification vector.
def allTogetherNow(POT, MOT, POL, MOL, POM, MOM):
    """Join the plusOne/minusOne matrices of three grades into a training set.

    NOTE: a second ``allTogetherNow`` taking four parameters is defined
    further down in this file; being defined later, it replaces this one.

    Args:
        POT, POL, POM: plusOne matrices (label ``1``).
        MOT, MOL, MOM: minusOne matrices (label ``-1``).

    Returns:
        Tuple ``(M, y)``: the rows stacked in the order POT, POL, POM, MOT,
        MOL, MOM and the vector with their labels.
    """
    M = np.append(POT, POL, 0)
    for block in (POM, MOT, MOL, MOM):
        M = np.append(M, block, 0)
    positives = np.ones(len(POT) + len(POL) + len(POM))
    negatives = np.zeros(len(MOT) + len(MOL) + len(MOM)) - 1
    y = np.append(positives, negatives, 0)
    return M, y
#   The following function receives plusOne and minusOne version for all the grades
#   and returns the training matrix along with its classification vector.
def allTogetherNow(POT, MOT, POL, MOL, POM=None, MOM=None):
    """Join the plusOne/minusOne matrices of the grades into a training set.

    This definition comes after a six-parameter function with the same name
    and therefore replaces it. ``POM`` and ``MOM`` are optional so that both
    the four-argument and the six-argument call shapes work.

    Args:
        POT, POL: plusOne matrices (label ``1``).
        MOT, MOL: minusOne matrices (label ``-1``).
        POM: Optional third plusOne matrix.
        MOM: Optional third minusOne matrix.

    Returns:
        Tuple ``(M, y)``: the plusOne rows (POT, POL, POM) followed by the
        minusOne rows (MOT, MOL, MOM), and the vector with their labels.
    """
    positives = [POT, POL]
    negatives = [MOT, MOL]
    if POM is not None:
        positives.append(POM)
    if MOM is not None:
        negatives.append(MOM)
    M = np.concatenate(positives + negatives, axis=0)
    n_positive = sum(len(block) for block in positives)
    n_negative = sum(len(block) for block in negatives)
    y = np.concatenate((np.ones(n_positive), -np.ones(n_negative)), axis=0)
    return M, y
#   The following function receives a plusOne and a minusOne matrix. It returns a 
#   an appended matrix of the both along with its classification vector.
def allTogetherNowDouble(PO, MO):
    """Join a plusOne and a minusOne matrix into a training set.

    Args:
        PO: plusOne matrix; its rows are labelled ``1``.
        MO: minusOne matrix; its rows are labelled ``0`` here (the other
            joining functions of this file use ``-1`` for this class).

    Returns:
        Tuple ``(M, y)``: the rows of ``PO`` followed by those of ``MO`` and
        the vector with their labels.
    """
    stacked = np.append(PO, MO, axis = 0)
    labels = np.append(np.ones(len(PO)), np.zeros(len(MO)), axis = 0)
    return stacked, labels

#   The following function receives a dictionary-list matrix and returns eighty
#   percent of the vectors in one matrix and the other twenty percent in another
#   matrix.
def divideEightyTwenty(M):
    """Split a dictionary matrix into a small and a large part.

    The first ``int(len(M)/5) + 1`` entries go to the small ("twenty") part
    and every other entry goes to the large ("eighty") part.

    Args:
        M: Dictionary name -> vector.

    Returns:
        Tuple ``(twenty, eighty)`` of dictionaries.
    """
    cutoff = int(len(M)/5)
    twenty, eighty = {}, {}
    for position, (name, vector) in enumerate(M.items()):
        # Positions 0..cutoff (inclusive) belong to the small part.
        target = twenty if position <= cutoff else eighty
        target[name] = vector
    return twenty, eighty
#   The following function receives two matrixes the plusOne and the minusOne and
#   returns it's respective trainingMatrix with their related classification vector.
def togetherNow(A, B):
    """Join a plusOne and a minusOne matrix into a training set.

    Args:
        A: plusOne matrix; its rows are labelled ``1``.
        B: minusOne matrix; its rows are labelled ``-1``.

    Returns:
        Tuple ``(M, y)``: the rows of ``A`` followed by those of ``B`` and
        the vector with their labels.
    """
    stacked = np.append(A,B,0)
    positives = np.ones(len(A))
    negatives = np.zeros(len(B)) - 1
    return stacked, np.append(positives, negatives, 0)
#   The following function divides our whole training set into 80 percent for
#   training and 20 percent for testing. It returns both matrixes.
def getEightyTwenty(M, y, path='traingMatrix.txt'):
    """Split a labelled data set into 80% training and 20% test rows.

    The label vector is attached to ``M`` as its last column, then
    ``int(len(M) / 5)`` distinct rows are drawn at random (generator seeded
    with ``1``) to form the test matrix. The remaining rows form the
    training matrix, which is also written to ``path``.

    Args:
        M: Matrix with one sample per row.
        y: Label of every row of ``M``.
        path: File that receives the training matrix. Defaults to the name
            used by the original code.

    Returns:
        Tuple ``(trainingMatrix, testMatrix)`` of numpy arrays whose last
        column is the label.
    """
    print("Enter Eighty Twenty")
    # np.c_ returns a new array; its result has to be kept.
    data = np.c_[M, y]
    length = len(data)
    n_test = int(length / 5)
    print(n_test)
    seed(1)
    remaining = list(range(length))
    chosen = []
    for i in range(n_test):
        print(i)
        # randint includes both ends, and the pool shrinks on every draw, so
        # the upper bound is the last valid position of what is left.
        pick = randint(0, len(remaining) - 1)
        chosen.append(remaining.pop(pick))
    trainingMatrix = data[np.array(remaining, dtype=int)]
    testMatrix = data[np.array(chosen, dtype=int)]
    with open(path, 'wb') as f:
        for line in trainingMatrix:
            np.savetxt(f, line, fmt = '%.2f')
    return trainingMatrix, testMatrix
#   The following function receives the M matrix which corresponds to the training
#   objects and the test matrix. It first trains the SVM for later test the accuracy
#   of it five times. Finally it returns the five accuracies along with its standard
#   deviation.
def testAccuracy(M, tM):
    """Train an SVM on ``M`` and score it on ``tM`` five times.

    Both matrices carry the class label in their last column.

    Args:
        M: Training matrix (samples plus label column).
        tM: Test matrix (samples plus label column).

    Returns:
        List with five accuracy values (fraction of test rows predicted
        correctly).
    """
    # Separate the label column from the feature columns.
    M = np.array(M)
    tM = np.array(tM)
    y = M[:, -1]
    M = M[:, :-1]
    yt = tM[:, -1]
    tM = tM[:, :-1]
    # Train the svm (fit needs the labels as well as the samples).
    clf = svm.SVC()
    clf.fit(M, y)
    # NOTE(review): the classifier is fitted once, so the five repetitions
    # score the same fitted model; confirm whether re-sampling was intended.
    accuracy = []
    for _ in range(5):
        counter = 0
        for j in range(len(tM)):
            # predict expects a 2-D input; compare with the TEST label of
            # the same row.
            if clf.predict([tM[j]])[0] == yt[j]:
                counter += 1
        accuracy.append(counter/len(tM))
    return accuracy
#   The following function receives four matrixes and constructs a single matrix
#   with the vectors of all the other matrixes and a vector with their grade
#   classification.
def testEvaluatorMatrix(A, B, C, D):
    M = np.array([])
    M = np.concatenate((A, B), axis = 0)
    M = np.concatenate((M, C), axis = 0)
    M = np.concatenate((M, D), axis = 0)
    y = np.array([])
    y1 = np.zeros(len(A))
    y2 = np.zeros(len(B)) + 1
    y3 = np.zeros(len(C))  + 2
    y4 = np.zeros(len(D)) + 3
    y = np.concatenate((y1, y2), axis = 0)
    y = np.concatenate((y, y3), axis = 0)
    y = np.concatenate((y, y4), axis = 0)
    return M, y
#   The following function receives three matrices and constructs a single matrix
#   with the vectors of all of them and a vector with their grade
#   classification.
def testEvaluatorMatrix3(A, B, C):
    M = np.array([])
    M = np.concatenate((A, B), axis = 0)
    M = np.concatenate((M, C), axis = 0)
    y = np.array([])
    y1 = np.zeros(len(A))
    y2 = np.zeros(len(B)) + 1
    y3 = np.zeros(len(C))  + 2
    y = np.concatenate((y1, y2), axis = 0)
    y = np.concatenate((y, y3), axis = 0)
    return M, y
#   The following function receives a number (amount of specific sections to retrieve) and
#   a dictionary of specific sections.
def getNVector(n, A):
    """Return a new dictionary holding the first ``n`` entries of ``A``.

    Parameters
    ----------
    n : int
        Number of entries to keep; values of zero or below keep nothing and
        values above ``len(A)`` keep everything.
    A : dict
        Dictionary of sections; entries are taken in its iteration order.

    Returns
    -------
    dict
        A fresh dictionary with at most ``n`` key/value pairs of ``A``.
    """
    selected = {}
    for position, (name, vector) in enumerate(A.items()):
        if position >= n:
            break
        selected[name] = vector
    return selected
#   The following function receives a number n and four dictionaries of vectors.
#   It returns, for each dictionary, a new dictionary with its first n vectors.
def getNVectors(n, A, B, C, D):
    """Return the first ``n`` entries of each of four dictionaries.

    Parameters
    ----------
    n : int
        Number of entries to keep from every dictionary.
    A, B, C, D : dict
        Dictionaries of vectors; entries are taken in iteration order.

    Returns
    -------
    (rA, rB, rC, rD)
        Four new dictionaries. The dictionaries are walked in lock-step with
        ``zip``, so each result holds at most ``min(n, len(shortest input))``
        entries.
    """
    rA, rB, rC, rD = {}, {}, {}, {}
    rows = zip(A.items(), B.items(), C.items(), D.items())
    for taken, ((a, avec), (b, bvec), (c, cvec), (d, dvec)) in enumerate(rows):
        # Stop once n entries have been copied (the limit was previously
        # ignored, unlike in get3NVectors).
        if taken >= n:
            break
        rA[a] = avec
        rB[b] = bvec
        rC[c] = cvec
        rD[d] = dvec
    return rA, rB, rC, rD
#   The following function receives a number n and three dictionaries of vectors.
#   It returns, for each dictionary, a new dictionary with its first n vectors.
def get3NVectors(n, A, B, C):
    """Return the first ``n`` entries of each of three dictionaries.

    Parameters
    ----------
    n : int
        Number of entries to keep from every dictionary.
    A, B, C : dict
        Dictionaries of vectors; entries are taken in iteration order.

    Returns
    -------
    (rA, rB, rC)
        Three new dictionaries. The inputs are walked in lock-step with
        ``zip``, so each result holds at most ``min(n, len(shortest input))``
        entries.
    """
    firstA, firstB, firstC = {}, {}, {}
    triples = zip(A.items(), B.items(), C.items())
    for index, (itemA, itemB, itemC) in enumerate(triples):
        if not index < n:
            break
        keyA, vecA = itemA
        keyB, vecB = itemB
        keyC, vecC = itemC
        firstA[keyA] = vecA
        firstB[keyB] = vecB
        firstC[keyC] = vecC
    return firstA, firstB, firstC
#   The following function receives two vectors, actual results for the test vector and the
#   predicted vector produced by our classifier, it then returns the accuracy report for
#   our classifier. Accuracy considered as number of correct predictions divided by total
#   number of predictions made.
def obtain_accuracy(a, b):
    """Return the fraction of positions where ``a`` and ``b`` hold equal values.

    Parameters
    ----------
    a : array-like
        Actual labels of the test set.
    b : array-like
        Labels predicted by the classifier; same shape as ``a``.

    Returns
    -------
    float
        Number of matching elements divided by the number of elements in
        ``a``.

    Raises
    ------
    ZeroDivisionError
        If ``a`` is empty.
    """
    # Converting first makes plain lists compare element-wise instead of
    # collapsing to a single True/False.
    actual = np.asarray(a)
    predicted = np.asarray(b)
    # Count matches directly: np.where returns one index array per dimension,
    # so measuring its size over-counts for anything that is not 1-D.
    matches = np.count_nonzero(actual == predicted)
    return matches / actual.size
    

#   Open the stop words file: openFile returns the whole content of
#   stopWords.txt as one string.
sw = openFile("stopWords.txt")
#   Curate the stop words: cleanStop splits the text on whitespace and extends
#   the list with extra punctuation tokens.
sw = cleanStop(sw)
#   Show the resulting list as the cell output.
sw

['a',
 'al',
 'algo',
 'algunas',
 'algunos',
 'ante',
 'antes',
 'como',
 'con',
 'contra',
 'cual',
 'cuando',
 'de',
 'del',
 'desde',
 'donde',
 'durante',
 'e',
 'el',
 'ella',
 'ellas',
 'ellos',
 'en',
 'entre',
 'era',
 'erais',
 'eran',
 'eras',
 'eres',
 'es',
 'esa',
 'esas',
 'ese',
 'eso',
 'esos',
 'esta',
 'estaba',
 'estabais',
 'estaban',
 'estabas',
 'estad',
 'estada',
 'estadas',
 'estado',
 'estados',
 'estamos',
 'estando',
 'estar',
 'estaremos',
 'estará',
 'estarán',
 'estarás',
 'estaré',
 'estaréis',
 'estaría',
 'estaríais',
 'estaríamos',
 'estarían',
 'estarías',
 'estas',
 'este',
 'estemos',
 'esto',
 'estos',
 'estoy',
 'estuve',
 'estuviera',
 'estuvierais',
 'estuvieran',
 'estuvieras',
 'estuvieron',
 'estuviese',
 'estuvieseis',
 'estuviesen',
 'estuvieses',
 'estuvimos',
 'estuviste',
 'estuvisteis',
 'estuviéramos',
 'estuviésemos',
 'estuvo',
 'está',
 'estábamos',
 'estáis',
 'están',
 'estás',
 'esté',
 'estéis',
 'estén',
 'estés',
 'fue',
 'f

In [26]:
!ls

AvancesRecuperacionDeInformacion-3.pdf
Doctorado.xml
DoctoradoCompleto.xml
LegibilityRevisedJustification.py
Licenciatura.xml
LicenciaturaCompleto.xml
Maestria.xml
MaestriaCompleto.xml
ResultsTablesForClassificationI.ipynb
SoftwareRevisionJustification.ipynb
SoftwareRevisionJustificationConcatenation.ipynb
SoftwareRevisionProblem.ipynb
SoftwareRevisionProblemConcatenation.ipynb
SoftwareRevisionResults.ipynb
SoftwareRevisionResultsConcatenation.ipynb
SoftwareRevisionResultsDiv.ipynb
TSU.xml
TSUCompleta.xml
Untitled.ipynb
Untitled1.ipynb
completeTrainingData.txt
frecuencia.txt
justificacionDoctorado.xml
justificacionLic.xml
justificacionMaestria.xml
justificacionTSU.xml
obatinMet9.0.py
obtainMet.py
obtainMet1.0.py
obtainMet10.0.py
obtainMet2.0.py
obtainMet3.0.py
obtainMet4.0.py
obtainMet5.0.py
obtainMet6.0.py
obtainMet6.0py.py
obtainMet7.0.py
obtainMet8.0.py
obtainMet9.0.py
rp_svm.py
stopWords.txt
tesis.xml


# Main

## Obtain Justifications

In [27]:
# Pull every <Resultados> section out of each degree-level XML file. bsMeth
# returns the sections of one file as a single string, one section after
# another separated by newlines.
resultsTSU = bsMeth('TSUCompleta.xml')
resultsLic = bsMeth('LicenciaturaCompleto.xml')
resultsMasters = bsMeth('MaestriaCompleto.xml')
resultsPhd = bsMeth('DoctoradoCompleto.xml')
# Only the last bare expression (resultsPhd) is rendered as the cell output;
# the three names before it are evaluated and discarded.
resultsTSU
resultsLic
resultsMasters
resultsPhd

'<Resultados>En este capítulo se presenta un resumen del proceso seguido en el desarrollo de esta tesis, de los\nresultados más destacables obtenidos, de las ventajas del sistema diseñado y finalmente se\ndescriben los trabajos en curso y las líneas de investigación futuras.\n12.1 Proceso seguido en el desarrollo de la tesis\nNecesidad de evaluación de la usabilidad para sitios Web educativos\nEste trabajo de tesis se inicia con la presentación de los problemas respecto a la evaluación de la\nusabilidad en la Web, encontrándose que no existe una estandarización respecto al qué, cómo y\ncuándo realizarla, sino que se han desarrollado y/o utilizado métodos de manera aislada y con\ncriterios específicos para evaluar un producto particular. En principio se pudo determinar que si bien\nexisten algunas metodologías desarrolladas para la evaluación de usabilidad éstas están orientadas\na las aplicaciones comerciales en la Web, por lo que al ser aplicadas a entornos educativos no\npermiten eva

## Separate Justifications

In [28]:
# Cut each concatenated string into a list with one element per section by
# splitting on the opening tag. The element before the first tag is an empty
# string (visible as the first item of the recorded output).
# NOTE(review): each name is rebound from str to list, so re-running this cell
# without re-running the previous one fails.
resultsTSU = [j for j in resultsTSU.split("<Resultados>")]
resultsLic = [j for j in resultsLic.split("<Resultados>")]
resultsMasters = [j for j in resultsMasters.split('<Resultados>')]
resultsPhd = [j for j in resultsPhd.split('<Resultados>')]
# Only resultsPhd, the last expression, is displayed.
resultsTSU
resultsLic
resultsMasters
resultsPhd

['',
 'En este capítulo se presenta un resumen del proceso seguido en el desarrollo de esta tesis, de los\nresultados más destacables obtenidos, de las ventajas del sistema diseñado y finalmente se\ndescriben los trabajos en curso y las líneas de investigación futuras.\n12.1 Proceso seguido en el desarrollo de la tesis\nNecesidad de evaluación de la usabilidad para sitios Web educativos\nEste trabajo de tesis se inicia con la presentación de los problemas respecto a la evaluación de la\nusabilidad en la Web, encontrándose que no existe una estandarización respecto al qué, cómo y\ncuándo realizarla, sino que se han desarrollado y/o utilizado métodos de manera aislada y con\ncriterios específicos para evaluar un producto particular. En principio se pudo determinar que si bien\nexisten algunas metodologías desarrolladas para la evaluación de usabilidad éstas están orientadas\na las aplicaciones comerciales en la Web, por lo que al ser aplicadas a entornos educativos no\npermiten evaluar t

## Structure the Data

In [29]:
# Turn every list of sections into a nested dictionary with sepMeth (defined
# earlier in the notebook). In the recorded output the outer keys look like
# 'Justificacion<N>' and each maps to {word position: word}; the entry built
# from the leading empty string is an empty dict.
TSU_structured = sepMeth(resultsTSU)
Lic_structured = sepMeth(resultsLic)
Masters_structured = sepMeth(resultsMasters)
Phd_structured = sepMeth(resultsPhd)
# Only Phd_structured, the last expression, is displayed.
TSU_structured
Lic_structured
Masters_structured
Phd_structured

{'Justificacion1': {},
 'Justificacion2': {1: 'En',
  2: 'este',
  3: 'capítulo',
  4: 'se',
  5: 'presenta',
  6: 'un',
  7: 'resumen',
  8: 'del',
  9: 'proceso',
  10: 'seguido',
  11: 'en',
  12: 'el',
  13: 'desarrollo',
  14: 'de',
  15: 'esta',
  16: 'tesis,',
  17: 'de',
  18: 'los',
  19: 'resultados',
  20: 'más',
  21: 'destacables',
  22: 'obtenidos,',
  23: 'de',
  24: 'las',
  25: 'ventajas',
  26: 'del',
  27: 'sistema',
  28: 'diseñado',
  29: 'y',
  30: 'finalmente',
  31: 'se',
  32: 'describen',
  33: 'los',
  34: 'trabajos',
  35: 'en',
  36: 'curso',
  37: 'y',
  38: 'las',
  39: 'líneas',
  40: 'de',
  41: 'investigación',
  42: 'futuras.',
  43: '12.1',
  44: 'Proceso',
  45: 'seguido',
  46: 'en',
  47: 'el',
  48: 'desarrollo',
  49: 'de',
  50: 'la',
  51: 'tesis',
  52: 'Necesidad',
  53: 'de',
  54: 'evaluación',
  55: 'de',
  56: 'la',
  57: 'usabilidad',
  58: 'para',
  59: 'sitios',
  60: 'Web',
  61: 'educativos',
  62: 'Este',
  63: 'trabajo',
  64: 'de

## Remove Empty Dictionaries

In [30]:
# Pass every structure through cleanVector (defined earlier in the notebook).
# In the recorded output the empty 'Justificacion1' entry is no longer present
# afterwards.
TSU_structured = cleanVector(TSU_structured)
Lic_structured = cleanVector(Lic_structured)
Masters_structured = cleanVector(Masters_structured)
Phd_structured = cleanVector(Phd_structured)
# Only Phd_structured, the last expression, is displayed.
TSU_structured
Lic_structured
Masters_structured
Phd_structured

{'Justificacion2': {1: 'En',
  2: 'este',
  3: 'capítulo',
  4: 'se',
  5: 'presenta',
  6: 'un',
  7: 'resumen',
  8: 'del',
  9: 'proceso',
  10: 'seguido',
  11: 'en',
  12: 'el',
  13: 'desarrollo',
  14: 'de',
  15: 'esta',
  16: 'tesis,',
  17: 'de',
  18: 'los',
  19: 'resultados',
  20: 'más',
  21: 'destacables',
  22: 'obtenidos,',
  23: 'de',
  24: 'las',
  25: 'ventajas',
  26: 'del',
  27: 'sistema',
  28: 'diseñado',
  29: 'y',
  30: 'finalmente',
  31: 'se',
  32: 'describen',
  33: 'los',
  34: 'trabajos',
  35: 'en',
  36: 'curso',
  37: 'y',
  38: 'las',
  39: 'líneas',
  40: 'de',
  41: 'investigación',
  42: 'futuras.',
  43: '12.1',
  44: 'Proceso',
  45: 'seguido',
  46: 'en',
  47: 'el',
  48: 'desarrollo',
  49: 'de',
  50: 'la',
  51: 'tesis',
  52: 'Necesidad',
  53: 'de',
  54: 'evaluación',
  55: 'de',
  56: 'la',
  57: 'usabilidad',
  58: 'para',
  59: 'sitios',
  60: 'Web',
  61: 'educativos',
  62: 'Este',
  63: 'trabajo',
  64: 'de',
  65: 'tesis',
  66:

## Obtain Size for the Justification Data

In [31]:
# Collect size statistics per degree level with returnSizes (defined earlier
# in the notebook). The recorded output is a dict with the keys 'Smallest',
# 'Biggest' and 'Average'.
sizesTSU = returnSizes(TSU_structured)
sizesLic = returnSizes(Lic_structured)
sizesMasters = returnSizes(Masters_structured)
sizesPhd = returnSizes(Phd_structured)
# Only sizesPhd, the last expression, is displayed.
sizesTSU
sizesLic 
sizesMasters
sizesPhd

{'Smallest': 58, 'Biggest': 3764, 'Average': 723.4545454545455}

## Remove Stop Words and Cleaning

In [32]:
# Clean every structure with cleanMeth, passing the stop-word list sw.
# Comparing the recorded outputs before and after: stop words are gone,
# the remaining words are lower-cased and trailing punctuation is stripped,
# while the original word positions are kept as keys.
TSU_structured = cleanMeth(sw, TSU_structured)
Lic_structured = cleanMeth(sw, Lic_structured)
Masters_structured = cleanMeth(sw, Masters_structured)
Phd_structured = cleanMeth(sw, Phd_structured)
# Only Phd_structured, the last expression, is displayed.
TSU_structured
Lic_structured
Masters_structured
Phd_structured

{'Justificacion2': {3: 'capítulo',
  5: 'presenta',
  7: 'resumen',
  9: 'proceso',
  10: 'seguido',
  13: 'desarrollo',
  16: 'tesis',
  19: 'resultados',
  21: 'destacables',
  22: 'obtenidos',
  25: 'ventajas',
  27: 'sistema',
  28: 'diseñado',
  30: 'finalmente',
  32: 'describen',
  34: 'trabajos',
  36: 'curso',
  39: 'líneas',
  41: 'investigación',
  42: 'futuras',
  44: 'proceso',
  45: 'seguido',
  48: 'desarrollo',
  51: 'tesis',
  52: 'necesidad',
  54: 'evaluación',
  57: 'usabilidad',
  59: 'sitios',
  60: 'web',
  61: 'educativos',
  63: 'trabajo',
  65: 'tesis',
  67: 'inicia',
  70: 'presentación',
  73: 'problemas',
  74: 'respecto',
  77: 'evaluación',
  80: 'usabilidad',
  83: 'web',
  84: 'encontrándose',
  87: 'existe',
  89: 'estandarización',
  90: 'respecto',
  93: 'cómo',
  95: 'cuándo',
  96: 'realizarla',
  97: 'sino',
  101: 'desarrollado',
  103: 'utilizado',
  104: 'métodos',
  106: 'manera',
  107: 'aislada',
  110: 'criterios',
  111: 'específicos',
  

## Create a Dictionary: Justification#-List of Words (Easier to handle)

In [33]:
# Replace each inner {position: word} dictionary by a plain list of words
# (see the recorded output); listMeth is defined earlier in the notebook.
TSU_structured = listMeth(TSU_structured)
Lic_structured = listMeth(Lic_structured)
Masters_structured = listMeth(Masters_structured)
Phd_structured = listMeth(Phd_structured)

# Only Phd_structured, the last expression, is displayed.
TSU_structured
Lic_structured
Masters_structured
Phd_structured

{'Justificacion2': ['capítulo',
  'presenta',
  'resumen',
  'proceso',
  'seguido',
  'desarrollo',
  'tesis',
  'resultados',
  'destacables',
  'obtenidos',
  'ventajas',
  'sistema',
  'diseñado',
  'finalmente',
  'describen',
  'trabajos',
  'curso',
  'líneas',
  'investigación',
  'futuras',
  'proceso',
  'seguido',
  'desarrollo',
  'tesis',
  'necesidad',
  'evaluación',
  'usabilidad',
  'sitios',
  'web',
  'educativos',
  'trabajo',
  'tesis',
  'inicia',
  'presentación',
  'problemas',
  'respecto',
  'evaluación',
  'usabilidad',
  'web',
  'encontrándose',
  'existe',
  'estandarización',
  'respecto',
  'cómo',
  'cuándo',
  'realizarla',
  'sino',
  'desarrollado',
  'utilizado',
  'métodos',
  'manera',
  'aislada',
  'criterios',
  'específicos',
  'evaluar',
  'producto',
  'particular',
  'principio',
  'pudo',
  'determinar',
  'si',
  'bien',
  'existen',
  'metodologías',
  'desarrolladas',
  'evaluación',
  'usabilidad',
  'éstas',
  'orientadas',
  'aplicac

## Select a specific number of vectors to work with

In [34]:
# getNVector(n, d) copies the first n entries of d. With n = len(d) every
# entry is kept, so this cell currently selects everything; lower the first
# argument to work with a subset.
TSU_structured = getNVector(len(TSU_structured), TSU_structured)
Lic_structured = getNVector(len(Lic_structured), Lic_structured)
Masters_structured = getNVector(len(Masters_structured), Masters_structured)
Phd_structured = getNVector(len(Phd_structured), Phd_structured)

# Only the last length, len(Phd_structured), is displayed (55 in the recorded run).
len(TSU_structured)
len(Lic_structured)
len(Masters_structured)
len(Phd_structured)

55

## Divide our data into eighty and twenty percent (in order to have some experimental data)

In [35]:
# Split each degree level into two dictionaries. divideEightyTwenty returns
# them in the order (twenty, eighty), matching the unpacking below.
twenty_TSU, eighty_TSU = divideEightyTwenty(TSU_structured)
twenty_Lic, eighty_Lic = divideEightyTwenty(Lic_structured)
twenty_Masters, eighty_Masters = divideEightyTwenty(Masters_structured)
twenty_Phd, eighty_Phd = divideEightyTwenty(Phd_structured)

# Only the last length, len(eighty_Phd), is displayed (43 in the recorded run).
len(twenty_TSU)
len(eighty_TSU)
len(twenty_Lic)
len(eighty_Lic)
len(twenty_Masters)
len(eighty_Masters)
len(twenty_Phd)
len(eighty_Phd)

43

## Obtain the Relative Frequency

In [36]:
# Apply relFreq (defined earlier in the notebook) to every split.
# Presumably this yields the relative frequency of each word per section -
# TODO confirm against relFreq.
twentyRFTSU = relFreq(twenty_TSU)
eightyRFTSU = relFreq(eighty_TSU)
twentyRFLic = relFreq(twenty_Lic)
eightyRFLic = relFreq(eighty_Lic)
twentyRFMasters = relFreq(twenty_Masters)
eightyRFMasters = relFreq(eighty_Masters)
twentyRFPhd = relFreq(twenty_Phd)
eightyRFPhd = relFreq(eighty_Phd)

# Only the last length, len(eightyRFPhd), is displayed (43 in the recorded run).
len(twentyRFTSU)
len(eightyRFTSU)
len(twentyRFLic)
len(eightyRFLic)
len(twentyRFMasters)
len(eightyRFMasters)
len(twentyRFPhd)
len(eightyRFPhd)

43

## Most Common Spanish Words

In [37]:
# Load the table of most common Spanish words through mostCommon (defined
# earlier in the notebook). In the recorded output it is a dict mapping each
# word to a count stored as a string.
# NOTE(review): the first entry, 'Frec.absoluta': 'Frec.normalizada', looks
# like the header row of the source file ending up in the data - confirm.
mostCommonDic = mostCommon()
mostCommonDic

{'Frec.absoluta': 'Frec.normalizada',
 'de': '9999518',
 'la': '6277560',
 'que': '4681839',
 'el': '4569652',
 'en': '4234281',
 'y': '4180279',
 'a': '3260939',
 'los': '2618657',
 'se': '2022514',
 'del': '1857225',
 'las': '1686741',
 'un': '1659827',
 'por': '1561904',
 'con': '1481607',
 'no': '1465503',
 'una': '1347603',
 'su': '1103617',
 'para': '1062152',
 'es': '1019669',
 'al': '951054',
 'lo': '866955',
 'como': '773465',
 'más': '661696',
 'o': '542284',
 'pero': '450512',
 'sus': '449870',
 'le': '413241',
 'ha': '380339',
 'me': '374368',
 'si': '327480',
 'sin': '298383',
 'sobre': '289704',
 'este': '285461',
 'ya': '274177',
 'entre': '267493',
 'cuando': '257272',
 'todo': '247340',
 'esta': '238841',
 'ser': '232924',
 'son': '232415',
 'dos': '228439',
 'también': '227411',
 'fue': '223791',
 'había': '223430',
 'era': '219933',
 'muy': '208540',
 'años': '203027',
 'hasta': '202935',
 'desde': '198647',
 'está': '194168',
 'mi': '186360',
 'porque': '185700',
 '

## Obtain the Log of the Common Value

In [38]:
# Apply logFreq (defined earlier in the notebook) to every split together
# with mostCommonDic. The recorded output maps each section to {word: float};
# many words carry 0.0, presumably those absent from mostCommonDic - TODO
# confirm against logFreq.
twentyLFTSU = logFreq(twenty_TSU, mostCommonDic)
eightyLFTSU = logFreq(eighty_TSU, mostCommonDic)
twentyLFLic = logFreq(twenty_Lic, mostCommonDic)
eightyLFLic = logFreq(eighty_Lic, mostCommonDic)
twentyLFMasters = logFreq(twenty_Masters, mostCommonDic)
eightyLFMasters = logFreq(eighty_Masters, mostCommonDic)
twentyLFPhd = logFreq(twenty_Phd, mostCommonDic)
eightyLFPhd = logFreq(eighty_Phd, mostCommonDic)

# Only eightyLFPhd, the last expression, is displayed.
twentyLFTSU
eightyLFTSU
twentyLFLic
eightyLFLic
twentyLFMasters
eightyLFMasters
twentyLFPhd
eightyLFPhd

{'Justificacion14': {'conclusiones': 0.0,
  'respecto': 10.47429759323113,
  'objetivos': 0.0,
  'investigación': 10.165890277802454,
  'partir': 10.52615861929175,
  'análisis': 9.880935148065616,
  'resultados': 10.181611592175802,
  'obtenidos': 0.0,
  'través': 10.696751524430846,
  'diversos': 9.593628035139645,
  'instrumentos': 0.0,
  'puntos': 10.221504824783981,
  'fuertes': 0.0,
  'débiles': 0.0,
  'vistos': 0.0,
  'anteriormente': 0.0,
  'podemos': 9.996886816794678,
  'extraer': 0.0,
  'siguientes': 9.580178302441196,
  'propuesto': 0.0,
  'inicio': 0.0,
  'específicos': 0.0,
  'afirmar': 0.0,
  'utilización': 0.0,
  'tic': 0.0,
  'vida': 11.723923557900767,
  'diaria': 0.0,
  'profesorado': 0.0,
  'ef': 0.0,
  'generalizado': 0.0,
  'colectivo': 0.0,
  'docente': 0.0,
  'familiarizado': 0.0,
  'uso': 10.212258189817774,
  'nntt': 0.0,
  'aspecto': 9.650400124848845,
  'contribuye': 0.0,
  'favorablemente': 0.0,
  'hecho': 11.337357054268905,
  'gran': 11.580499997756885,
 

## Create the Vocabulary

In [39]:
# Build the vocabulary. voc is assigned three times in a row, so only the
# last call (all four degree levels) determines the value used afterwards.
# NOTE(review): the first two calls look like leftovers from smaller
# experiments; confirm they have no needed side effects before removing them.
voc = obtainVocSingle(TSU_structured)
voc = obtainVocDouble(TSU_structured, Lic_structured)
voc = obtainVoc(TSU_structured, Lic_structured, Masters_structured, Phd_structured)
# Vocabulary size (17125 in the recorded run).
len(voc)

17125

## Padding (Dimensionate the Vectors based on the size of the Vocabulary)

In [40]:
# Pad every relative-frequency (RF) and log-frequency (LF) structure against
# the vocabulary with dimRelFreq (defined earlier in the notebook), so that,
# per the section title, all vectors share the dimension of the vocabulary.
# The bare expressions in this cell produce no output because the cell ends
# with an assignment.
# For TSU
twentyPadRFTSU = dimRelFreq(twentyRFTSU, voc)
twentyPadLFTSU = dimRelFreq(twentyLFTSU, voc)
eightyPadRFTSU = dimRelFreq(eightyRFTSU, voc)
eightyPadLFTSU = dimRelFreq(eightyLFTSU, voc)
# NOTE(review): this lookup raises KeyError if 'Justificacion2' is not part of
# the twenty-percent split - presumably a leftover inspection; verify.
twentyPadRFTSU['Justificacion2']
eightyPadRFTSU
twentyPadLFTSU
eightyPadLFTSU
# For Lic
twentyPadRFLic = dimRelFreq(twentyRFLic, voc)
twentyPadLFLic = dimRelFreq(twentyLFLic, voc)
eightyPadRFLic = dimRelFreq(eightyRFLic, voc)
eightyPadLFLic = dimRelFreq(eightyLFLic, voc)
twentyPadRFLic
twentyPadLFLic
eightyPadRFLic
eightyPadLFLic
# For Masters
twentyPadRFMasters = dimRelFreq(twentyRFMasters, voc)
twentyPadLFMasters = dimRelFreq(twentyLFMasters, voc)
eightyPadRFMasters = dimRelFreq(eightyRFMasters, voc)
eightyPadLFMasters = dimRelFreq(eightyLFMasters, voc)
# For Phd
twentyPadRFPhd = dimRelFreq(twentyRFPhd, voc)
twentyPadLFPhd = dimRelFreq(twentyLFPhd, voc)
eightyPadRFPhd = dimRelFreq(eightyRFPhd, voc)
eightyPadLFPhd = dimRelFreq(eightyLFPhd, voc)

## Concatenate Local and Global Vectors

In [41]:
# TSU, Lic, Masters and Phd: join the padded RF structure with the padded LF
# structure of the same split through concatenateDictionaries (defined
# earlier in the notebook).
twentyConTSU = concatenateDictionaries(twentyPadRFTSU, twentyPadLFTSU)
eightyConTSU = concatenateDictionaries(eightyPadRFTSU, eightyPadLFTSU)
twentyConLic = concatenateDictionaries(twentyPadRFLic, twentyPadLFLic)
eightyConLic = concatenateDictionaries(eightyPadRFLic, eightyPadLFLic)
twentyConMasters = concatenateDictionaries(twentyPadRFMasters, twentyPadLFMasters)
eightyConMasters = concatenateDictionaries(eightyPadRFMasters, eightyPadLFMasters)
twentyConPhd = concatenateDictionaries(twentyPadRFPhd, twentyPadLFPhd)
eightyConPhd = concatenateDictionaries(eightyPadRFPhd, eightyPadLFPhd)
# Sanity sums over the concatenated data; only the last one (eightyConPhd) is
# displayed, and the two TSU results are not summed here.
np.array((twentyConLic)).sum()
np.array((eightyConLic)).sum()
np.array((twentyConMasters)).sum()
np.array((eightyConMasters)).sum()
np.array((twentyConPhd)).sum()
np.array((eightyConPhd)).sum()

28574.605202468814

## Expand and Classify Data

In [51]:
# Build the two-class data for one pair of degree levels with comConData
# (defined earlier in the notebook); it returns two collections that are
# named "POne" and "MOne" here - TODO confirm what each one holds.
# The "HST" pair below is built from the TSU variables and "Undergraduate"
# from the Lic variables.
# NOTE(review): only the HST/Undergraduate pair is active. Every other pair
# is commented out, so the later sections that use TMM_*, TPM_*, UMM_*, UPM_*
# and MPM_* cannot run on a fresh kernel (a NameError for UMM_train is
# recorded further down); their stored outputs come from earlier runs.
# HST and Undergraduate
twentyPOneTL, twentyMOneTL = comConData(twentyConTSU, twentyConLic)
eightyPOneTL, eightyMOneTL = comConData(eightyConTSU, eightyConLic)
# np.array((twentyPOneTL)).sum()
# np.array((twentyMOneTL)).sum()
# # HST and Masters
# twentyPOneTM, twentyMOneTM = comConData(twentyConTSU, twentyConMasters)
# eightyPOneTM, eightyMOneTM = comConData(eightyConTSU, eightyConMasters)
# HST and Phd
# twentyPOneTP, twentyMOneTP = comConData(twentyConTSU, twentyConPhd)
# eightyPOneTP, eightyMOneTP = comConData(eightyConTSU, eightyConPhd)
# # Undergraduate and Masters
# twentyPOneUM, twentyMOneUM = comConData(twentyConLic, twentyConMasters)
# eightyPOneUM, eightyMOneUM = comConData(eightyConLic, eightyConMasters)
# # Undergraduate and Phd
# twentyPOneUP, twentyMOneUP = comConData(twentyConLic, twentyConPhd)
# eightyPOneUP, eightyMOneUP = comConData(eightyConLic, eightyConPhd)
# Masters and Phd
# twentyPOneMP, twentyMOneMP = comConData(twentyConMasters, twentyConPhd)
# eightyPOneMP, eightyMOneMP = comConData(eightyConMasters, eightyConPhd)

## Append the Data and Create Classification Vector

In [54]:
# Merge the two collections of a pair into one sample matrix plus its label
# vector with allTogetherNowDouble (defined earlier in the notebook). The
# twenty-percent data becomes the test set, the eighty-percent data the
# training set.
# NOTE(review): only the HST/Undergraduate pair is active; the commented-out
# pairs below are what would define TMM_*, TPM_*, UMM_*, UPM_* and MPM_*
# for the later sections.
# HST and Undergraduate
TLM_test, tly_test = allTogetherNowDouble(twentyPOneTL, twentyMOneTL)
len(TLM_test)
len(tly_test)
TLM_train, tly_train = allTogetherNowDouble(eightyPOneTL, eightyMOneTL)
len(TLM_train)
# Last expression of the cell: number of training labels (35404 in the recorded run).
len(tly_train)
# # HST and Masters
# TMM_test, tmy_test = allTogetherNowDouble(twentyPOneTM, twentyMOneTM)
# TMM_train, tmy_train = allTogetherNowDouble(eightyPOneTM, eightyMOneTM)
#HST and Phd
# TPM_test, tpy_test = allTogetherNowDouble(twentyPOneTP, twentyMOneTP)
# TPM_train, tpy_train = allTogetherNowDouble(eightyPOneTP, eightyMOneTP)
# #Undergraduate and Masters
# UMM_test, umy_test = allTogetherNowDouble(twentyPOneUM, twentyMOneUM)
# UMM_train, umy_train = allTogetherNowDouble(eightyPOneUM, eightyMOneUM)
# #Undergraduate and PhD
# UPM_test, upy_test = allTogetherNowDouble(twentyPOneUP, twentyMOneUP)
# UPM_train, upy_train = allTogetherNowDouble(eightyPOneUP, eightyMOneUP)
# #Masters and PhD
# MPM_test, mpy_test = allTogetherNowDouble(twentyPOneMP, twentyMOneMP)
# MPM_train, mpy_train = allTogetherNowDouble(eightyPOneMP, eightyMOneMP)

35404

# HST and Undergraduate

## SVM Classifier

In [55]:
# Fit an SVC with library-default parameters on the HST/Undergraduate
# training matrix and its label vector.
svclassifier = svm.SVC()
svclassifier.fit(TLM_train, tly_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [56]:
# Predict the HST/Undergraduate test rows and report the fraction of matching
# labels (obtain_accuracy) as a percentage.
y_pred = svclassifier.predict(TLM_test)

print("The accuracy value for the SVM classifier is: {} %".format(obtain_accuracy(tly_test, y_pred)*100))

The accuracy value for the SVM classifier is: 45.41446208112875 %


## K Nearest Neighbors (KNN)

In [57]:
# Fit a 3-nearest-neighbours classifier on the HST/Undergraduate training data.
# NOTE(review): this import is repeated in every KNN cell; the notebook keeps
# its other imports in the first cell.
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors = 3)
neigh.fit(TLM_train, tly_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [58]:
# Predict the HST/Undergraduate test rows with the KNN model and print its
# accuracy as a percentage (y_pred is reused from the SVM cell).
y_pred = neigh.predict(TLM_test)
print('The accurracy value for the KNN classifier is: {} %'.format(obtain_accuracy(tly_test, y_pred)*100))

The accurracy value for the KNN classifier is: 61.19929453262787 %


## Multilayer Perceptron

In [59]:
# Fit a multilayer perceptron (lbfgs solver, hidden layers of 5 and 2 units,
# alpha 1e-5, fixed random_state) on the HST/Undergraduate training data.
# NOTE(review): this import is repeated in every perceptron cell.
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha = 1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(TLM_train, tly_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 2), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [60]:
# Predict the HST/Undergraduate test rows with the perceptron and print its
# accuracy as a percentage.
y_pred = clf.predict(TLM_test)
print("The accuracy value for the Multilayer Perceptron is: {} %".format(obtain_accuracy(tly_test, y_pred)*100))

The accuracy value for the Multilayer Perceptron is: 44.22398589065256 %


# HST and Masters

## SVM Classifier

In [47]:
# Fit a default SVC on the HST/Masters training data.
# NOTE(review): TMM_train / tmy_train are only produced by lines that are
# commented out in the earlier cells, and this cell's execution count (47) is
# lower than the preceding cell's (60) - the stored output stems from an
# earlier run and the cell fails on a fresh kernel.
svclassifier = svm.SVC()
svclassifier.fit(TMM_train, tmy_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [48]:
# Predict the HST/Masters test rows with the SVC and print the accuracy as a
# percentage. Needs TMM_test / tmy_test, which the active cells do not define.
y_pred = svclassifier.predict(TMM_test)
print("The accuracy value for the SVM classifier is: {} %".format(obtain_accuracy(tmy_test, y_pred)*100))

The accuracy value for the SVM classifier is: 61.76470588235294 %


## K Nearest Neighbors (KNN)

In [49]:
# Fit a 3-nearest-neighbours classifier on the HST/Masters training data
# (requires TMM_train / tmy_train, see the commented-out pair upstream).
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors = 3)
neigh.fit(TMM_train, tmy_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [50]:
# Predict the HST/Masters test rows with the KNN model and print its accuracy
# as a percentage.
y_pred = neigh.predict(TMM_test)
print('The accurracy value for the KNN classifier is: {} %'.format(obtain_accuracy(tmy_test, y_pred)*100))

The accurracy value for the KNN classifier is: 53.77073906485671 %


## Multilayer Perceptron

In [51]:
# Fit the multilayer perceptron (lbfgs solver, hidden layers of 5 and 2
# units, fixed random_state) on the HST/Masters training data.
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha = 1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(TMM_train, tmy_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 2), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [52]:
# Predict the HST/Masters test rows with the perceptron and print its
# accuracy as a percentage.
y_pred = clf.predict(TMM_test)
print("The accuracy value for the Multilayer Perceptron is: {} %".format(obtain_accuracy(tmy_test, y_pred)*100))

The accuracy value for the Multilayer Perceptron is: 56.90045248868778 %


# HST and PhD

## SVM Classifier

In [44]:
# Fit a default SVC on the HST/PhD training data.
# NOTE(review): TPM_train / tpy_train come from lines that are commented out
# in the earlier cells; the execution count (44) shows this output was
# recorded in a different run order than the notebook's top-to-bottom layout.
svclassifier = svm.SVC()
svclassifier.fit(TPM_train, tpy_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [45]:
# Predict the HST/PhD test rows with the SVC and print the accuracy as a
# percentage. Needs TPM_test / tpy_test, which the active cells do not define.
y_pred = svclassifier.predict(TPM_test)
print("The accuracy value for the SVM classifier is: {} %".format(obtain_accuracy(tpy_test, y_pred)*100))

The accuracy value for the SVM classifier is: 80.35714285714286 %


## K Nearest Neighbors (KNN)

In [46]:
# Fit a 3-nearest-neighbours classifier on the HST/PhD training data
# (requires TPM_train / tpy_train, see the commented-out pair upstream).
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors = 3)
neigh.fit(TPM_train, tpy_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [47]:
# Predict the HST/PhD test rows with the KNN model and print its accuracy as
# a percentage.
y_pred = neigh.predict(TPM_test)
print('The accurracy value for the KNN classifier is: {} %'.format(obtain_accuracy(tpy_test, y_pred)*100))

The accurracy value for the KNN classifier is: 68.25396825396825 %


## Multilayer Perceptron

In [48]:
# Fit the multilayer perceptron (lbfgs solver, hidden layers of 5 and 2
# units, fixed random_state) on the HST/PhD training data.
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha = 1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(TPM_train, tpy_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 2), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [49]:
# Predict the HST/PhD test rows with the perceptron and print its accuracy as
# a percentage.
# NOTE(review): the recorded result is exactly 50.0 %; presumably the model
# predicts a single class on a balanced test set - verify.
y_pred = clf.predict(TPM_test)
print("The accuracy value for the Multilayer Perceptron is: {} %".format(obtain_accuracy(tpy_test, y_pred)*100))

The accuracy value for the Multilayer Perceptron is: 50.0 %


# Undergraduate and Masters

## SVM Classifier

In [50]:
# Fit a default SVC on the Undergraduate/Masters training data.
# NOTE(review): the recorded result of this cell is
# "NameError: name 'UMM_train' is not defined" - the lines that would create
# UMM_train / umy_train are commented out in the earlier cells.
svclassifier = svm.SVC()
svclassifier.fit(UMM_train, umy_train)

NameError: name 'UMM_train' is not defined

In [None]:
# Predict the Undergraduate/Masters test rows with the SVC and print the
# accuracy as a percentage. Never executed in the saved notebook (no
# execution count); needs UMM_test / umy_test.
y_pred = svclassifier.predict(UMM_test)
print("The accuracy value for the SVM classifier is: {} %".format(obtain_accuracy(umy_test, y_pred)*100))

## K Nearest Neighbors (KNN)

In [None]:
# Fit a 3-nearest-neighbours classifier on the Undergraduate/Masters training
# data. Never executed in the saved notebook; needs UMM_train / umy_train.
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors = 3)
neigh.fit(UMM_train, umy_train)

In [None]:
# Predict the Undergraduate/Masters test rows with the KNN model and print
# its accuracy as a percentage. Never executed in the saved notebook.
y_pred = neigh.predict(UMM_test)
print('The accurracy value for the KNN classifier is: {} %'.format(obtain_accuracy(umy_test, y_pred)*100))

## Multilayer Perceptron

In [None]:
# Fit the multilayer perceptron (lbfgs solver, hidden layers of 5 and 2
# units, fixed random_state) on the Undergraduate/Masters training data.
# Never executed in the saved notebook; needs UMM_train / umy_train.
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha = 1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(UMM_train, umy_train)

In [None]:
# Predict the Undergraduate/Masters test rows with the perceptron and print
# its accuracy as a percentage. Never executed in the saved notebook.
y_pred = clf.predict(UMM_test)
print("The accuracy value for the Multilayer Perceptron is: {} %".format(obtain_accuracy(umy_test, y_pred)*100))

# Undergraduate and PhD

## SVM Classifier

In [30]:
# Fit a default SVC on the Undergraduate/PhD training data.
# NOTE(review): UPM_train / upy_train come from lines that are commented out
# in the earlier cells; the execution count (30) shows the stored output was
# recorded in an earlier run.
svclassifier = svm.SVC()
svclassifier.fit(UPM_train, upy_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [31]:
# Predict the Undergraduate/PhD test rows with the SVC and print the accuracy
# as a percentage. Needs UPM_test / upy_test, which the active cells do not
# define.
y_pred = svclassifier.predict(UPM_test)
print("The accuracy value for the SVM classifier is: {} %".format(obtain_accuracy(upy_test, y_pred)*100))

The accuracy value for the SVM classifier is: 80.24691358024691 %


## K Nearest Neighbors (KNN)

In [32]:
# Fit a 3-nearest-neighbours classifier on the Undergraduate/PhD training
# data (requires UPM_train / upy_train, see the commented-out pair upstream).
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors = 3)
neigh.fit(UPM_train, upy_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [33]:
# Predict the Undergraduate/PhD test rows with the KNN model and print its
# accuracy as a percentage.
y_pred = neigh.predict(UPM_test)
print('The accurracy value for the KNN classifier is: {} %'.format(obtain_accuracy(upy_test, y_pred)*100))

The accurracy value for the KNN classifier is: 66.35802469135803 %


## Multilayer Perceptron

In [34]:
# Fit the multilayer perceptron (lbfgs solver, hidden layers of 5 and 2
# units, fixed random_state) on the Undergraduate/PhD training data.
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha = 1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(UPM_train, upy_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 2), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [35]:
# Predict the Undergraduate/PhD test rows with the perceptron and print its
# accuracy as a percentage.
# NOTE(review): the recorded result is exactly 50.0 %; presumably the model
# predicts a single class on a balanced test set - verify.
y_pred = clf.predict(UPM_test)
print("The accuracy value for the Multilayer Perceptron is: {} %".format(obtain_accuracy(upy_test, y_pred)*100))

The accuracy value for the Multilayer Perceptron is: 50.0 %


# Masters and PhD

## SVM Classifier

In [22]:
# Fit a default SVC on the Masters/PhD training data.
# NOTE(review): MPM_train / mpy_train come from lines that are commented out
# in the earlier cells; the execution count (22) is even lower than that of
# the import cell at the top (23), so this output belongs to an earlier
# session.
svclassifier = svm.SVC()
svclassifier.fit(MPM_train, mpy_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [23]:
# Predict the Masters/PhD test rows with the SVC and print the accuracy as a
# percentage. Needs MPM_test / mpy_test, which the active cells do not define.
y_pred = svclassifier.predict(MPM_test)
print("The accuracy value for the SVM classifier is: {} %".format(obtain_accuracy(mpy_test, y_pred)*100))

The accuracy value for the SVM classifier is: 67.78846153846155 %


## K Nearest Neighbors (KNN)

In [24]:
# Fit a 3-nearest-neighbours classifier on the Masters/PhD training data
# (requires MPM_train / mpy_train, see the commented-out pair upstream).
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors = 3)
neigh.fit(MPM_train, mpy_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [25]:
# Predict the Masters/PhD test rows with the KNN model and print its accuracy
# as a percentage.
y_pred = neigh.predict(MPM_test)
print('The accurracy value for the KNN classifier is: {} %'.format(obtain_accuracy(mpy_test, y_pred)*100))

The accurracy value for the KNN classifier is: 56.25 %


## Multilayer Perceptron

In [26]:
# Fit the multilayer perceptron (lbfgs solver, hidden layers of 5 and 2
# units, fixed random_state) on the Masters/PhD training data.
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha = 1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(MPM_train, mpy_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 2), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [27]:
# Predict the Masters/PhD test rows with the perceptron and print its
# accuracy as a percentage.
# NOTE(review): the recorded result is exactly 50.0 %; presumably the model
# predicts a single class on a balanced test set - verify.
y_pred = clf.predict(MPM_test)
print("The accuracy value for the Multilayer Perceptron is: {} %".format(obtain_accuracy(mpy_test, y_pred)*100))

The accuracy value for the Multilayer Perceptron is: 50.0 %
