# Resultados

## Métricas

### Diferença de Tempo de Resposta

In [52]:
def calculateExpectedTimeDifference(expectedTime, actualTime):
    """Return the absolute gap between an expected and an actual time.

    The two values are subtracted and the sign is discarded with ``abs``,
    so for numeric inputs the result is never negative.
    """
    gap = expectedTime - actualTime
    return abs(gap)

## Cálculo do Tempo de Resposta e Número de Entradas por Tag

In [5]:
from datetime import datetime

def calculateResponseTime(timeArray):
    """Return the number of hours elapsed between two timestamps.

    Args:
        timeArray: Indexable holding the start timestamp at position 0 and
            the end timestamp at position 1, both as strings in the
            ``%Y-%m-%d %H:%M:%S`` format.

    Returns:
        The elapsed time from start to end in hours, as a float. The value
        is negative when the end timestamp precedes the start timestamp.

    Raises:
        ValueError: If either string does not match the expected format
            (raised by ``datetime.strptime``).
    """
    timestampFormat = "%Y-%m-%d %H:%M:%S"
    start = datetime.strptime(timeArray[0], timestampFormat)
    end = datetime.strptime(timeArray[1], timestampFormat)

    # ``timedelta.seconds`` only holds the sub-day remainder (0..86399), so
    # it silently drops whole days; ``total_seconds()`` covers the full span.
    return (end - start).total_seconds() / 3600

In [6]:
def calculateTagsEntriesAndTime(df):
    """Count posts and accumulate response time for every tag in ``df``.

    Args:
        df: pandas DataFrame with the columns ``Tags`` (a string such as
            ``<python><pandas>``), ``CreationDate`` and ``Answer Date``
            (timestamp strings accepted by ``calculateResponseTime``).

    Returns:
        A dict mapping each tag to ``{'entries': n, 'time': total}``, where
        ``n`` is the number of rows carrying the tag and ``total`` is the
        sum of those rows' response times as returned by
        ``calculateResponseTime``.
    """
    count = {}

    # Columns are read by label. Looking a row Series up with integer keys
    # (row[0], row[1], ...) relies on a positional fallback that pandas 2.x
    # deprecates and pandas 3 no longer performs for label-indexed Series.
    rows = zip(df['Tags'], df['CreationDate'], df['Answer Date'])
    for rawTags, createdAt, answeredAt in rows:
        # Drop the outer "<" and ">" and split on the "><" separators.
        tags = rawTags[1:-1].split('><')
        responseTime = calculateResponseTime([createdAt, answeredAt])

        for tag in tags:
            if tag in count:
                count[tag]['entries'] += 1
                count[tag]['time'] += responseTime
            else:
                count[tag] = {'entries': 1, 'time': responseTime}

    return count

In [7]:
def calculateMeanTimePerTag(count):
    """Add a ``meanTime`` entry to every tag's statistics, in place.

    ``count`` maps tags to dicts holding ``time`` and ``entries``; each of
    those dicts gains ``meanTime = time / entries``. Nothing is returned.
    """
    for stats in count.values():
        totalTime = stats['time']
        entries = stats['entries']
        stats['meanTime'] = totalTime / entries

In [8]:
def sortTags(count):
    """Return a new dict with the tags ordered by ``entries``, largest first.

    Tags with the same number of entries keep their original relative
    order because the sort is stable.
    """
    def negatedEntries(pair):
        _, stats = pair
        return -stats['entries']

    ordered = sorted(count.items(), key=negatedEntries)
    return dict(ordered)

## Validação do Modelo

In [4]:
from joblib import load
import os
import pandas as pd

# Data files are located relative to the current working directory.
cwd = os.getcwd()
# Validation posts and train/test posts (judging by the file names, the
# already-cleaned versions), loaded as two separate frames.
df_validation = pd.read_csv(cwd + '/data/processed/posts_val_clean.csv')
df_train_test = pd.read_csv(cwd + '/data/processed/posts_train_test_clean.csv')

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Text of every train/test post, coerced to str so that non-string cells
# are turned into their string form before vectorizing.
words_post = df_train_test.complete_text.apply(lambda x: str(x))
#words_post = df_train_test.normalized_text.apply(lambda x: str(x)) #FOR RANDOM FOREST MODEL

# min_df=20: terms that appear in fewer than 20 documents are left out of
# the vocabulary.
vectorizer = TfidfVectorizer(min_df=20)
#vectorizer = TfidfVectorizer() #FOR RANDOM FOREST MODEL

# Fit on the train/test text, then build the feature matrix X from that same
# text: row i of X describes row i of df_train_test. X and vectorizer are
# module-level names read by later cells.
vectorizer.fit(words_post)
X = vectorizer.transform(words_post)

In [103]:
def split_tags(tags):
    """Turn a tag string such as ``<a><b>`` into the list ``['a', 'b']``.

    The first and last characters are discarded and the remainder is split
    on the ``><`` separator.
    """
    return tags[1:-1].split("><")

In [137]:
def testModel(df, model, timeTagDict, features=None, textColumn='complete_text'):
    """Score ``model`` on the rows of ``df`` and return a frame with the results.

    For every row the model predicts one tag. The row's real response time
    is compared with the mean response time stored for the predicted tag,
    and the predicted tag is checked against the row's real tags.

    Args:
        df: DataFrame with ``Tags``, ``CreationDate`` and ``Answer Date``
            columns, plus ``textColumn`` when ``features`` is not given.
        model: Fitted estimator; ``model.predict`` is called once per row.
        timeTagDict: Mapping from tag to a dict with a ``meanTime`` entry,
            as built by ``calculateTimeMeanByTag``.
        features: Optional feature matrix whose i-th row describes the i-th
            row of ``df``. ``features[i]`` must be accepted by
            ``model.predict`` as a single sample (a scipy sparse matrix row
            slice is). When omitted, the matrix is built from
            ``df[textColumn]`` with the module-level fitted ``vectorizer``.
        textColumn: Name of the text column to vectorize when ``features``
            is omitted.

    Returns:
        A new DataFrame (``df`` itself is not modified) with three extra
        columns: ``timeDifference``, ``tagWasCorrect`` and ``predictedTag``.

    Raises:
        KeyError: If the model predicts a tag that has no entry in
            ``timeTagDict``.
    """
    if features is None:
        # The module-level ``X`` is built from the training frame, so
        # indexing it with the labels of ``df`` scored training documents
        # instead of the rows being evaluated. Vectorize this frame's own
        # text with the already-fitted vectorizer instead.
        # NOTE(review): assumes ``df`` carries the same text column the
        # vectorizer was fitted on - confirm for the validation CSV.
        features = vectorizer.transform(df[textColumn].apply(lambda x: str(x)))

    timeDifferenceResults = []
    tagWasCorrectResults = []
    predictedTagResults = []

    # Rows are addressed by position so that frames with a non-default index
    # still line up with the rows of ``features``.
    for position, (_, document) in enumerate(df.iterrows()):
        predictedTag = model.predict(features[position])[0]
        actualTime = calculateResponseTime([document['CreationDate'], document['Answer Date']])
        expectedTime = timeTagDict[predictedTag]['meanTime']
        timeDifferenceResults.append(calculateExpectedTimeDifference(expectedTime, actualTime))
        tagWasCorrectResults.append(predictedTag in split_tags(document['Tags']))
        predictedTagResults.append(predictedTag)

    return df.assign(
        timeDifference=timeDifferenceResults,
        tagWasCorrect=tagWasCorrectResults,
        predictedTag=predictedTagResults,
    )

In [105]:
def calculateTimeMeanByTag(df, model):
    """Compute the mean response time of the posts grouped by predicted tag.

    Each row of ``df`` is assigned the tag that ``model`` predicts for the
    row of the module-level feature matrix ``X`` selected by the row's index
    label. The row's response time (from ``calculateResponseTime``) is then
    accumulated under that tag.

    Returns a dict mapping each predicted tag to a dict with ``totalTime``,
    ``count`` and ``meanTime`` (``totalTime / count``).
    """
    statsByTag = {}

    for rowLabel in df.index:
        row = df.loc[rowLabel]
        predicted = model.predict(X[rowLabel])[0]
        elapsed = calculateResponseTime([row['CreationDate'], row['Answer Date']])

        # First sighting of a tag creates its record; later ones add to it.
        if predicted not in statsByTag:
            statsByTag[predicted] = {'totalTime': elapsed, 'count': 1}
            continue
        stats = statsByTag[predicted]
        stats['totalTime'] += elapsed
        stats['count'] += 1

    for stats in statsByTag.values():
        stats['meanTime'] = stats['totalTime'] / stats['count']

    return statsByTag

In [146]:
from joblib import load
import os
import pandas as pd

# Data files are located relative to the current working directory.
cwd = os.getcwd()

# (Re)load the validation frame from disk, replacing any df_validation
# already defined in the session.
df_validation = pd.read_csv(cwd + '/data/processed/posts_val_clean.csv')

# Exactly one model is loaded; the commented-out lines are the alternative
# saved models. Model paths are relative (no cwd prefix), unlike the CSV path.
# NOTE(review): the "FOR RANDOM FOREST MODEL" comments in the vectorizer cell
# suggest the rf_* models need the alternative vectorizer settings - confirm
# before switching models.
#model = load('models/dummy_model.joblib')
#model = load('models/dummy_model_less_tags.joblib')
#model = load('models/lr_model_less_tags.joblib')
model = load('models/lr_model.joblib')
#model = load('models/rf_model.joblib')
#model = load('models/rf_model_less_tags.joblib')

In [None]:
# Mean response time per predicted tag, computed on the train/test posts.
timeTagDict = calculateTimeMeanByTag(df_train_test, model)

In [None]:
# Replace df_validation with the frame returned by testModel, which adds the
# timeDifference, tagWasCorrect and predictedTag columns.
df_validation = testModel(df_validation, model, timeTagDict)

In [None]:
# Share of validation rows whose predicted tag is one of the post's real
# tags. Despite the name this is a fraction (0..1), not multiplied by 100.
accuracyPercentage = df_validation[df_validation['tagWasCorrect'] == True].shape[0] / df_validation.shape[0]
# Mean of the timeDifference column over all validation rows.
meanTimeDiferenceError = sum(list(df_validation['timeDifference'])) / df_validation.shape[0]
print(accuracyPercentage)
print(meanTimeDiferenceError)