In [1]:
import pandas as pd
from tensorflow import keras
import pickle
from time import time

import sys
sys.path.append('../..')
from smartFAQ.src.cleanTxt import brief_cleaning
from smartFAQ.src.tokenPad import tokenization_padding

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load test data and model

In [2]:
with open('../data/x_test.pickle', 'rb') as handle:
    X_test = pickle.load(handle)

In [3]:
MAX_NUM_WORDS = 20000
MAX_LEN = 3000

In [4]:
# Get the best answer of the question
# @param df : Dataframe
# @param model : a previous saved model
#
# return answer, score
def best_answer(df, model):
    
    df_clean = brief_cleaning(df, ['question', 'answer'])
#     df_pad = tokenization_padding(df_clean, 'answer', ['question'], MAX_NUM_WORDS)
    df_pad = tokenization_padding(df_clean, 'answer', [], MAX_NUM_WORDS, MAX_LEN)
    print(df_pad.shape)
    
    pred = model.predict(df_pad)
    print(pred)
    return pred

## Selection a question with its answers

In [5]:
X_test.sort_values(by=['question'])

Unnamed: 0,id,question,question_body,answer
10084,1196886,'Connection reset peer ' socket error When clo...,We 're developing Python web service client we...,I realize using python I found Java article us...
4701,186636,'from X import ' versus 'import X X ',I 've seen Python programmers use following st...,I prefer import X use X much possible My excep...
46702,1683591,'getattr attribute name must string ' error ad...,I following model set class UserProfile models...,Your problem height field width field contain ...
19531,19185936,'import module ' 'from module import ',I 've tried find comprehensive guide whether b...,There 's another detail mentioned related writ...
40995,15942290,'s good way combinate set,Given set b c 's good way produce b c ab ac ad...,This wild none answers actually provide return...
...,...,...,...,...
16154,597107,xml dom minidom Getting CDATA values,I 'm able get value image tag see XML Category...,p getElementsByTagName 'Category ' firstChild ...
46222,1662458,xml etree ElementTree equivalent Java,I 've quite bit simple XML processing python g...,You might look following alternatives dom j xo...
46225,1663217,xml etree ElementTree equivalent Java,I 've quite bit simple XML processing python g...,It true Java XML APIs greatest terms usability...
46224,1662607,xml etree ElementTree equivalent Java,I 've quite bit simple XML processing python g...,To honest XML APIs Java suck vary level suckag...


# Main

## Try with CNN Model

In [6]:
xml_q = X_test['question'] == 'xml etree ElementTree equivalent Java'
xml_q = X_test[xml_q]
xml_q = xml_q[['question', 'answer']]

In [7]:
xml_q

Unnamed: 0,question,answer
46225,xml etree ElementTree equivalent Java,It true Java XML APIs greatest terms usability...
46222,xml etree ElementTree equivalent Java,You might look following alternatives dom j xo...
46224,xml etree ElementTree equivalent Java,To honest XML APIs Java suck vary level suckag...


In [8]:
CNN_model = keras.models.load_model('../model_builder/DL_models/saved_model/CNN_model_5')

In [9]:
t = time()
answer = best_answer(xml_q, CNN_model)
print('Time to predict: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 0.0 mins
(3, 3000)
[[0.]
 [0.]
 [0.]]
Time to predict: 0.0 mins


## Try with Decision Tree Model

In [10]:
with open('../data/decision_tree_model.pickle', 'rb') as handle:
    decision_tree_model = pickle.load(handle)
with open('../data/decision_tree_enc.pickle', 'rb') as handle:
    decision_tree_enc = pickle.load(handle)

In [11]:
t = time()

df_selected_question = X_test['question'] == 'xml etree ElementTree equivalent Java'
df_test = X_test[df_selected_question]
df_test = brief_cleaning(df_test, ['question', 'question_body', 'answer'])

df_test = decision_tree_enc.transform(df_test)
pred = decision_tree_model.predict(df_test)
print(pred)
print('Time to predict: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 0.0 mins


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = sentences


[5.05583827 5.05583827 5.05583827]
Time to predict: 0.0 mins


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e80043e2-6875-4b65-a196-a0ffb97a1282' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>