In [None]:
import os
import json
import pandas as pd
import random
import math
import shutil
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import KFold

In [None]:
def getBase(ironicDataPath, naoIronicDataPath):
    """Load two labelled spreadsheets and return them as one shuffled DataFrame.

    Parameters
    ----------
    ironicDataPath : str
        Path of the first Excel file (its first column is used as the index).
    naoIronicDataPath : str
        Path of the second Excel file (its first column is used as the index).

    Returns
    -------
    pandas.DataFrame
        All rows of both files, with 'text' converted to unicode strings and
        'rotulo' converted to int, in a reproducible shuffled order. Every
        input row appears exactly once.
    """
    ironicosDF = pd.read_excel(ironicDataPath, index_col=0)
    naorotuladoDF = pd.read_excel(naoIronicDataPath, index_col=0)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    base = pd.concat([ironicosDF, naorotuladoDF], ignore_index=True)
    base['text'] = base['text'].values.astype('U')
    base['rotulo'] = base['rotulo'].values.astype(int)
    # Shuffle WITHOUT replacement. Sampling with replacement would duplicate some
    # rows and drop others, so the same id could land in several folds.
    # The fixed random_state keeps the shuffle reproducible.
    base = base.sample(frac=1, replace=False, random_state=17)
    return base

In [None]:
def getVocabulario(baseToGenerateVocabularyPath, BOWAndNgram):
    """Fit a CountVectorizer on the 'text' column of an Excel file.

    Parameters
    ----------
    baseToGenerateVocabularyPath : str
        Path of the Excel file whose 'text' column defines the vocabulary.
    BOWAndNgram : bool
        If truthy, the vocabulary contains 1- to 3-grams; otherwise only
        single tokens.

    Returns
    -------
    CountVectorizer
        The fitted vectorizer (case is preserved: lowercase=False).
    """
    vocabularyDF = pd.read_excel(baseToGenerateVocabularyPath, index_col=0)
    vocabularyDF['text'] = vocabularyDF['text'].values.astype('U')
    vocabularyDF['rotulo'] = vocabularyDF['rotulo'].values.astype(int)
    if BOWAndNgram:
        vectorizer = CountVectorizer(lowercase=False, ngram_range=(1, 3))
        modeLabel = " [BOW+Ngram]"
    else:
        vectorizer = CountVectorizer(lowercase=False)
        modeLabel = " [BOW]"
    # Only the vocabulary is needed here, so the document-term matrix is not built.
    vectorizer.fit(vocabularyDF["text"])
    # The label is chosen beforehand: an inline conditional expression would bind
    # to the whole concatenation and drop the feature count in the BOW case.
    print("N features: " + str(len(vectorizer.vocabulary_)) + modeLabel)
    return vectorizer

In [None]:
def getFolds(nPartitions, base):
    """Split the values of base['id'] into nPartitions contiguous folds.

    The ids are taken in the row order of ``base``. Fold sizes differ by at
    most one element: the first ``len(ids) % nPartitions`` folds receive one
    extra id. Exactly ``nPartitions`` folds are always returned (some may be
    empty when there are fewer ids than partitions).

    Parameters
    ----------
    nPartitions : int
        Number of folds to create; must be at least 1.
    base : pandas.DataFrame
        Frame with an 'id' column.

    Returns
    -------
    dict
        Maps fold number (0 .. nPartitions-1) to its list of ids.

    Raises
    ------
    ValueError
        If nPartitions is smaller than 1.
    """
    if nPartitions < 1:
        raise ValueError("nPartitions must be a positive integer")

    listIndex = base['id'].tolist()
    # Fixed-size chunks of ceil(len / n) can produce fewer than n folds
    # (e.g. 9 ids in 4 folds); a balanced split avoids that.
    baseSize, remainder = divmod(len(listIndex), nPartitions)

    folds = {}
    start = 0
    for count in range(nPartitions):
        size = baseSize + (1 if count < remainder else 0)
        folds[count] = listIndex[start:start + size]
        start += size
    return folds

In [None]:
def getPartitionsForTrainingValidationAndTest(nPartitions):
    """Build the rotating training/validation/test assignment for each fold.

    For every starting offset in 0 .. nPartitions-1 the partitions are
    assigned cyclically: the first nPartitions-2 go to training, the next one
    to validation and the last one to test.

    Returns a list of dicts with keys 'training' (list of partition numbers),
    'validation' (partition number) and 'test' (partition number).
    """
    # Two partitions are held out of training: one for validation, one for test.
    trainingCount = nPartitions - 2
    return [
        {
            'training': [(offset + step) % nPartitions for step in range(trainingCount)],
            'validation': (offset + trainingCount) % nPartitions,
            'test': (offset + trainingCount + 1) % nPartitions,
        }
        for offset in range(nPartitions)
    ]

In [None]:
def writeDefinedFolds(foldsPath, folds, foldsWithDefinedPartitions):
    """Write the id lists of every fold to disk as JSON files.

    Any existing directory at ``foldsPath`` is deleted and recreated. For the
    i-th entry of ``foldsWithDefinedPartitions`` a sub-directory
    ``foldsPath + str(i) + "/"`` is created containing training.json,
    validation.json and test.json. ``foldsPath`` is concatenated as-is, so it
    is expected to end with a path separator.

    Parameters
    ----------
    foldsPath : str
        Output directory prefix.
    folds : mapping
        Partition number -> list of ids.
    foldsWithDefinedPartitions : iterable of dict
        Each dict has 'training' (list of partition numbers), 'validation'
        and 'test' (single partition numbers).
    """
    # Always start from an empty output directory.
    if os.path.exists(foldsPath):
        shutil.rmtree(foldsPath)
    os.makedirs(foldsPath)

    for position, partition in enumerate(foldsWithDefinedPartitions):
        foldDir = foldsPath + str(position) + "/"
        if not os.path.exists(foldDir):
            os.makedirs(foldDir)

        # The training set is the concatenation of all its partitions, in order.
        trainingIds = [
            sampleId
            for partitionNumber in partition['training']
            for sampleId in folds[partitionNumber]
        ]
        with open(foldDir + 'training.json', 'w') as handle:
            handle.write(json.dumps(trainingIds, indent=2))

        # Validation and test each consist of exactly one partition.
        for partitionName in ('validation', 'test'):
            with open(foldDir + partitionName + '.json', 'w') as handle:
                handle.write(json.dumps(folds[partition[partitionName]], indent=2))

In [None]:
def generateFolds(nPartitions, base, foldsPathToWrite):
    """Partition the ids of ``base`` and write every fold layout to disk.

    Splits the ids with getFolds, builds the rotating training/validation/test
    assignment with getPartitionsForTrainingValidationAndTest and hands both to
    writeDefinedFolds, which writes them under ``foldsPathToWrite``.
    """
    writeDefinedFolds(
        foldsPathToWrite,
        getFolds(nPartitions, base),
        getPartitionsForTrainingValidationAndTest(nPartitions),
    )

In [None]:
def getDataset(foldsPath, fold, partitionName, base):
    """Return the rows of ``base`` that belong to one partition of a fold.

    Reads the id list stored in
    ``foldsPath + str(fold) + '/' + partitionName + '.json'`` and keeps the
    rows of ``base`` whose 'id' value is in that list.
    """
    partitionFile = foldsPath + str(fold) + '/' + partitionName + '.json'
    with open(partitionFile, 'r') as handle:
        wantedIds = json.load(handle)
    belongsToPartition = base['id'].isin(wantedIds)
    return base.loc[belongsToPartition]

In [None]:
def getFeaturesAndClass(vectorizerVocabulary, dataset):
    """Vectorize the 'text' column of ``dataset`` with a fitted vectorizer.

    Despite the name, only the feature matrix produced by
    ``vectorizerVocabulary.transform`` is returned; no class labels are.
    """
    return vectorizerVocabulary.transform(dataset['text'])

In [None]:
def svmClassifier(filelog, kernel, c, bestScore, trainingFeatures, trainingDF, testFeatures, testDF):
    """Train an SVM, evaluate it and append the results to a log file.

    Parameters
    ----------
    filelog : str
        Path of the log file; results are appended to it.
    kernel : str
        Kernel name passed to svm.SVC.
    c : number
        Regularisation parameter C passed to svm.SVC.
    bestScore : dict or None
        Running best result with keys 'C' and 'score'. It is updated in place
        when this run scores strictly higher. Pass None to skip the tracking.
    trainingFeatures, trainingDF
        Feature matrix and frame of the training data; labels are read from
        trainingDF['rotulo'].
    testFeatures, testDF
        Feature matrix and frame of the evaluation data; labels are read from
        testDF['rotulo'].

    Returns
    -------
    dict or None
        The (possibly updated) ``bestScore`` object.
    """
    print("C: " + str(c))
    # The context manager closes the log even if fitting or scoring raises.
    with open(filelog, 'a') as f:
        f.write("kernel: " + kernel + " c: " + str(c) + "\n")

        classifier = svm.SVC(kernel=kernel, C=c)
        classifier.fit(trainingFeatures, trainingDF['rotulo'])
        predict = classifier.predict(testFeatures)

        f.write(json.dumps(classifier.get_params(True), indent=1) + "\n")

        report = classification_report(testDF['rotulo'], predict, output_dict=True)
        f.write(json.dumps(report, indent=1) + "\n")

        # Recent scikit-learn releases omit 'micro avg' for single-label problems
        # and report the equivalent value under 'accuracy' instead.
        if 'micro avg' in report:
            selectedScore = report['micro avg']['precision']
        else:
            selectedScore = report['accuracy']

        if bestScore is not None and selectedScore > bestScore['score']:
            bestScore['C'] = c
            bestScore['score'] = selectedScore

        f.write(json.dumps({ 'score': selectedScore, 'bestSocore': bestScore}, indent=1) + "\n")
        f.write("confusion Matrix" + "\n")
        confusionMatrix = confusion_matrix(testDF['rotulo'], predict)
        f.write(str(confusionMatrix) + "\n")
    return bestScore

In [None]:
def crossValidationFold(kernel, Clist, foldsPath, fold, vectorizerVocabulary, base):
    """Run model selection and the final test for a single fold.

    Every value of ``Clist`` is trained on the fold's training partition and
    scored on its validation partition (logged to logValidation.txt inside the
    fold directory). The C kept in the running best-score dict is then used
    for one more run scored on the test partition (logged to logTest.txt).
    """
    foldDir = foldsPath + str(fold) + '/'

    def loadPartition(partitionName):
        # Returns the partition's rows together with their feature matrix.
        frame = getDataset(foldsPath, fold, partitionName, base)
        return frame, getFeaturesAndClass(vectorizerVocabulary, frame)

    print("-- Fold: " + str(fold))
    trainingDF, trainingFeatures = loadPartition('training')
    validationDF, validationFeatures = loadPartition('validation')

    # Sentinel values: any real score replaces them on the first comparison.
    bestScore = {'C' : -1, 'score' : -1}
    print("Validation ...")
    validationLog = foldDir + 'logValidation.txt'
    for candidateC in Clist:
        bestScore = svmClassifier(
            validationLog, kernel, candidateC, bestScore,
            trainingFeatures, trainingDF, validationFeatures, validationDF,
        )

    print("Test ...")
    testDF, testFeatures = loadPartition('test')
    # bestScore=None: the test run is only logged, it does not take part in model selection.
    svmClassifier(
        foldDir + 'logTest.txt', kernel, bestScore['C'], None,
        trainingFeatures, trainingDF, testFeatures, testDF,
    )
    

In [None]:
def crossValidationKFold(k, root, ironicDataPath, naoIronicDataPath, baseToGenerateVocabularyPath, BOWAndNgramIfTrueAndBOWIfFalse):
    """Run the whole k-fold experiment with a linear-kernel SVM.

    Loads the data set, fits the vocabulary, writes the fold definitions under
    ``root + str(k) + 'folds/'`` and then evaluates each of the ``k`` folds
    with C values 2**-5, 2**-3, ..., 2**15.

    Parameters
    ----------
    k : int
        Number of folds.
    root : str
        Output directory prefix (concatenated as-is, so it should end with a
        path separator).
    ironicDataPath, naoIronicDataPath : str
        Excel files passed to getBase.
    baseToGenerateVocabularyPath : str
        Excel file passed to getVocabulario.
    BOWAndNgramIfTrueAndBOWIfFalse : bool
        Passed to getVocabulario to choose between 1-3-gram and single-token
        features.
    """
    print("Obtendo a base ...")
    base = getBase(ironicDataPath, naoIronicDataPath)

    print("obtendo o vocabulario ...")
    vocabularyVectorizer = getVocabulario(
        baseToGenerateVocabularyPath, BOWAndNgramIfTrueAndBOWIfFalse
    )

    foldsPath = root + str(k) + 'folds/'
    print("gerando os folds ...")
    generateFolds(k, base, foldsPath)

    # Candidate C values on a logarithmic grid: 2**-5, 2**-3, ..., 2**15.
    exponents = range(-5, 17, 2)
    Clist = [2**exponent for exponent in exponents]

    for foldNumber in range(k):
        crossValidationFold('linear', Clist, foldsPath, foldNumber, vocabularyVectorizer, base)
    print("--- END ---")
    

Base rotulada + coletada

In [None]:
# Run the 5-fold experiment with unigram-to-trigram features; results go under 'SVM/'.
crossValidationKFold(
    k=5,
    root='SVM/',
    ironicDataPath='data/ironic.xlsx',
    naoIronicDataPath='data/notIronic.xlsx',
    baseToGenerateVocabularyPath='data/preprocess.xlsx',
    BOWAndNgramIfTrueAndBOWIfFalse=True,
)