In [None]:
import os
import json
import pandas as pd
import random
import math
import shutil
from svm import *
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import KFold

In [None]:
def getFolds(nPartitions, base):
    """Split the ids in ``base['id']`` into ``nPartitions`` contiguous folds.

    The ids keep the order returned by ``base['id'].tolist()``. Fold sizes
    differ by at most one element: the first ``len(ids) % nPartitions`` folds
    receive one extra id. This guarantees that exactly ``nPartitions`` folds
    are produced, which fixed-size chunking with ``ceil`` cannot do (9 ids in
    4 partitions would only yield 3 chunks).

    Args:
        nPartitions: number of folds to create (positive integer).
        base: object whose ``base['id']`` exposes ``tolist()`` (e.g. a pandas
            DataFrame with an ``id`` column).

    Returns:
        dict mapping fold number (0 .. nPartitions - 1) to its list of ids.

    Raises:
        ValueError: if ``nPartitions`` is not positive, or if there are fewer
            ids than partitions (some folds would be empty).
    """
    if nPartitions < 1:
        raise ValueError("nPartitions must be a positive integer")

    listIndex = base['id'].tolist()
    if len(listIndex) < nPartitions:
        raise ValueError(
            "cannot split " + str(len(listIndex)) + " ids into "
            + str(nPartitions) + " non-empty folds"
        )

    baseSize, remainder = divmod(len(listIndex), nPartitions)
    folds = {}
    start = 0
    for count in range(nPartitions):
        # The first `remainder` folds absorb the leftover ids, one each.
        size = baseSize + (1 if count < remainder else 0)
        folds[count] = listIndex[start:start + size]
        start += size
    return folds

In [None]:
def getPartitionsForTrainingValidationAndTest(nPartitions):
    """Assign fold numbers to training / validation / test roles, rotating per round.

    For each round ``offset`` in ``0 .. nPartitions - 1`` the folds are taken
    cyclically (modulo ``nPartitions``) starting at ``offset``: the first
    ``nPartitions - 2`` become training folds, the next one is the validation
    fold and the one after that is the test fold.

    Args:
        nPartitions: total number of folds.

    Returns:
        list with one dict per round, each holding ``'training'`` (list of
        fold numbers), ``'validation'`` (fold number) and ``'test'`` (fold
        number).
    """
    # Two partitions are held out in every round: one for validation, one for test.
    trainingCount = nPartitions - 2
    return [
        {
            'training': [(offset + step) % nPartitions for step in range(trainingCount)],
            'validation': (offset + trainingCount) % nPartitions,
            'test': (offset + trainingCount + 1) % nPartitions,
        }
        for offset in range(nPartitions)
    ]

In [None]:
def writeDefinedFolds(foldsPath, folds, foldsWithDefinedPartitions):
    """Write the id lists of every cross-validation round to disk as JSON.

    ``foldsPath`` is deleted (if it exists) and recreated, then one
    sub-directory per round is created, named after the round's position
    (``foldsPath + "0/"``, ``foldsPath + "1/"``, ...). Each sub-directory
    receives three JSON files, named by the module-level constants
    ``TRAINING_IDS_FILE``, ``VALIDATION_IDS_FILE`` and ``TEST_IDS_FILE``.

    Args:
        foldsPath: output directory prefix; it is concatenated directly with
            the round number, so it must end with a path separator.
        folds: mapping from fold number to the list of ids in that fold.
        foldsWithDefinedPartitions: iterable of dicts with keys ``'training'``
            (list of fold numbers), ``'validation'`` and ``'test'`` (single
            fold numbers).
    """
    # Always start from a clean output directory.
    if os.path.exists(foldsPath):
        shutil.rmtree(foldsPath)
    os.makedirs(foldsPath)

    def dumpIds(target, ids):
        # Serialise one id list with the same 2-space indentation for all files.
        with open(target, 'w') as handle:
            handle.write(json.dumps(ids, indent=2))

    for position, partition in enumerate(foldsWithDefinedPartitions):
        foldDir = foldsPath + str(position) + "/"
        if not os.path.exists(foldDir):
            os.makedirs(foldDir)

        # The training set is the concatenation of all training folds, in order.
        trainingIds = []
        for foldKey in partition['training']:
            trainingIds += folds[foldKey]

        dumpIds(foldDir + TRAINING_IDS_FILE, trainingIds)
        dumpIds(foldDir + VALIDATION_IDS_FILE, folds[partition['validation']])
        dumpIds(foldDir + TEST_IDS_FILE, folds[partition['test']])

In [None]:
def generateFolds(nPartitions, base, foldsPathToWrite):
    """Build the folds for ``base`` and persist every round under ``foldsPathToWrite``.

    Chains three steps: split the ids into ``nPartitions`` folds, assign the
    folds to training / validation / test roles for each round, and write the
    resulting id lists to disk.

    Args:
        nPartitions: number of folds (and of rounds).
        base: dataset passed through to ``getFolds``.
        foldsPathToWrite: output location passed through to ``writeDefinedFolds``.
    """
    idsByFold = getFolds(nPartitions, base)
    roundAssignments = getPartitionsForTrainingValidationAndTest(nPartitions)
    writeDefinedFolds(foldsPathToWrite, idsByFold, roundAssignments)

In [None]:
def getDataset(foldsPath, fold, partitionName, base):
    """Return the rows of ``base`` whose ``id`` belongs to one stored partition.

    The id list is read from the JSON file
    ``foldsPath + str(fold) + '/' + partitionName + '.json'``.

    Args:
        foldsPath: directory prefix holding the per-fold sub-directories; it
            is concatenated directly with the fold number.
        fold: fold number (sub-directory name).
        partitionName: file name without the ``.json`` extension.
        base: DataFrame with an ``id`` column.

    Returns:
        The subset of ``base`` (original index and row order preserved) whose
        ``id`` appears in the stored list.
    """
    idsFile = foldsPath + str(fold) + '/' + partitionName + '.json'
    with open(idsFile, 'r') as handle:
        wantedIds = json.load(handle)
    belongsToPartition = base['id'].isin(wantedIds)
    return base.loc[belongsToPartition]

In [None]:
def getVocabulario(baseToGenerateVocabularyPath, BOWAndNgram, fileToWriteVectorizer):
    """Fit a ``CountVectorizer`` on the ``text`` column of an Excel file and save it.

    Args:
        baseToGenerateVocabularyPath: path of the Excel file, read with its
            first column as the index; it must contain a ``text`` column.
        BOWAndNgram: if truthy, the vocabulary includes 1- to 3-grams;
            otherwise only single tokens (plain bag of words).
        fileToWriteVectorizer: destination handed to ``writeObjectInFile``
            together with the fitted vectorizer.

    Returns:
        The fitted ``CountVectorizer`` (case is preserved: ``lowercase=False``).
    """
    corpus = pd.read_excel(baseToGenerateVocabularyPath, index_col=0)
    # Force every cell to a unicode string so non-text values can be tokenised.
    corpus['text'] = corpus['text'].values.astype('U')

    if BOWAndNgram:
        ngramRange, tag = (1, 3), " [BOW+Ngram]"
    else:
        # (1, 1) is CountVectorizer's default: unigrams only.
        ngramRange, tag = (1, 1), " [BOW]"

    vectorizer = CountVectorizer(lowercase=False, ngram_range=ngramRange)
    vectorizer.fit_transform(corpus["text"])
    print("N features: " + str(len(vectorizer.vocabulary_)) + tag)
    writeObjectInFile(fileToWriteVectorizer, vectorizer)
    return vectorizer

In [None]:
def svmClassifier(root, filename, kernel, c, bestScore, trainingFeatures, trainingDF, testFeatures, testDF, writeObject):
    """Train one SVM, evaluate it, append a JSON log entry and track the best C.

    Args:
        root: directory prefix for all outputs; concatenated directly with
            file names.
        filename: log file name (relative to ``root``); entries are appended.
        kernel: kernel name passed to ``svm.SVC``.
        c: regularisation parameter ``C`` passed to ``svm.SVC``.
        bestScore: dict with keys ``'C'`` and ``'score'`` that is updated in
            place when this run scores strictly higher, or ``None`` to skip
            the comparison.
        trainingFeatures: feature matrix used to fit the classifier.
        trainingDF: DataFrame whose ``rotulo`` column holds the training labels.
        testFeatures: feature matrix to predict on.
        testDF: DataFrame whose ``rotulo`` column holds the expected labels.
        writeObject: if truthy, the fitted classifier is persisted to
            ``root + PYTHON_OBJECT_FILE`` through ``writeObjectInFile``.

    Returns:
        ``bestScore`` (the same object that was passed in, possibly updated).
    """
    print("C: " + str(c))
    log = {}
    log["Parametros"] = {"kernel" : kernel, "c" : str(c) }

    classifier = svm.SVC(kernel=kernel, C=c)
    classifier.fit(trainingFeatures, trainingDF['rotulo'])
    predict = classifier.predict(testFeatures)
    # NOTE(review): savePrediction is defined elsewhere; presumably it stores
    # the predictions under `root` - confirm against its definition.
    savePrediction(testDF, predict, root)
    log["classifier"] = classifier.get_params(True)

    if writeObject:
        writeObjectInFile(root + PYTHON_OBJECT_FILE, classifier)

    report = classification_report(testDF['rotulo'], predict, output_dict=True)
    log["report"] = report

    # Model selection is driven by the macro-averaged precision.
    selectedScore = report['macro avg']['precision']
    # Identity check: `!= None` would invoke a custom __ne__ on the argument.
    if bestScore is not None and selectedScore > bestScore['score']:
        bestScore['C'] = c
        bestScore['score'] = selectedScore

    log["score"] = selectedScore
    log["bestScore"] = bestScore

    confusionMatrix = confusion_matrix(testDF['rotulo'], predict)
    log["confusionMatrix"] = str(confusionMatrix)

    # The context manager closes the file; no explicit close() is needed.
    with open(root + filename, 'a') as filelog:
        filelog.write(json.dumps(log, indent=1))
    return bestScore

In [None]:
def crossValidationFold(kernel, Clist, foldsPath, fold, vectorizerVocabulary, base):
    """Run model selection and final testing for a single fold.

    Every value in ``Clist`` is tried on the validation partition (logged to
    ``VALIDATION_FILE``); the C with the best score is then used for one run
    on the test partition (logged to ``TEST_LOG_FILE``), whose classifier is
    persisted. Both phases fit on the training partition only.

    Args:
        kernel: kernel name forwarded to ``svmClassifier``.
        Clist: iterable of candidate C values.
        foldsPath: directory prefix holding the per-fold sub-directories.
        fold: fold number to process.
        vectorizerVocabulary: vectorizer forwarded to ``getFeatures``.
        base: full dataset from which the partitions are selected.
    """
    print("-- Fold: " + str(fold))

    def loadPartition(partitionName):
        # Fetch the rows of one partition together with their feature matrix.
        frame = getDataset(foldsPath, fold, partitionName, base)
        return frame, getFeatures(vectorizerVocabulary, frame)

    trainingDF, trainingFeatures = loadPartition('training')
    validationDF, validationFeatures = loadPartition('validation')

    foldDir = foldsPath + str(fold) + "/"
    best = {'C' : -1, 'score' : -1}
    print("Validation ...")
    for candidate in Clist:
        best = svmClassifier(
            foldDir, VALIDATION_FILE, kernel, candidate, best,
            trainingFeatures, trainingDF, validationFeatures, validationDF, False,
        )

    print("Test ...")
    testDF, testFeatures = loadPartition('test')
    # bestScore=None: the test run must not influence model selection.
    svmClassifier(
        foldDir, TEST_LOG_FILE, kernel, best['C'], None,
        trainingFeatures, trainingDF, testFeatures, testDF, True,
    )
    

In [None]:
def crossValidationKFold(k, root, dataBasePath, baseToGenerateVocabularyPath, BOWAndNgramIfTrueAndBOWIfFalse,
                         kernel='linear', Clist=None):
    """Run the complete k-fold experiment: folds, vocabulary, then every fold.

    Args:
        k: number of folds.
        root: output directory prefix; results go under
            ``root + str(k) + 'folds/'`` (concatenated directly, so ``root``
            must end with a path separator).
        dataBasePath: dataset location forwarded to ``getBaseDF``.
        baseToGenerateVocabularyPath: Excel file used to build the vocabulary.
        BOWAndNgramIfTrueAndBOWIfFalse: truthy for bag-of-words plus n-grams,
            falsy for plain bag-of-words.
        kernel: SVM kernel to evaluate; defaults to ``'linear'``.
        Clist: candidate C values; defaults to the powers of two
            ``2**-5, 2**-3, ..., 2**15``.
    """
    if Clist is None:
        Clist = [2**x for x in range(-5, 17, 2)]

    print("Obtendo a base ...")
    base = getBaseDF(dataBasePath)
    foldsPath = root + str(k) + 'folds/'
    print("Gerando os folds ...")
    generateFolds(k, base, foldsPath)
    # The vocabulary must be built after the folds: generateFolds recreates
    # foldsPath, which is where the vectorizer file is written.
    print("obtendo o vocabulario ...")
    vocabularyVectorizer = getVocabulario(baseToGenerateVocabularyPath,
                                          BOWAndNgramIfTrueAndBOWIfFalse, foldsPath + VECTORIZER_OBJECT_FILE)
    for fold in range(k):
        crossValidationFold(kernel, Clist, foldsPath, fold, vocabularyVectorizer, base)
    print("--- END ---")
    

**Example execution**

ABSOLUTE_PATH = '../'

crossValidationKFold(5, ABSOLUTE_PATH + 'SVM/BOWNgram/', ['class1.xlsx', 'class2.xlsx'], 'preprocess.xlsx', True)