In [1]:
import ast
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def get_html(url, timeout=30):
    """Download ``url`` and return the page parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` may block forever on a stalled connection.

    Returns
    -------
    bs4.BeautifulSoup
        Parse tree built with the built-in ``'html.parser'``.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a 4xx/5xx response.
    """
    site = requests.get(url, timeout=timeout)
    # Fail here on an HTTP error instead of parsing an error page and
    # crashing later on a missing element.
    site.raise_for_status()
    return BeautifulSoup(site.text, 'html.parser')

<h2>Parsing Wrike's knowledge base</h2>

In [3]:
def get_list_of_articles(url):
    """Return the absolute URLs of all articles listed on a section page.

    Parameters
    ----------
    url : str
        Address of a help-center section page containing a
        ``<ul class="article-list">`` element.

    Returns
    -------
    list of str
        One absolute URL per ``<li>`` that contains a link with an ``href``.
        Empty when the page has no article list.
    """
    help_url = 'https://help.wrike.com'
    html = get_html(url)
    articles = html.find('ul', class_='article-list')
    if articles is None:
        # Page without an article list: nothing to collect.
        return []

    res = []
    for li in articles.find_all('li'):
        link = li.find('a')
        href = link.get('href') if link is not None else None
        if not href:
            # List item without a usable link.
            continue
        # urljoin gives the same result as concatenation for root-relative
        # hrefs and also handles absolute or slash-less ones correctly.
        res.append(urljoin(help_url, href))
    return res

In [4]:
# Shared Porter stemmer instance; text_normalization calls porter.stem on every kept token.
porter = PorterStemmer()

def text_normalization(text):
    """Lower-case, strip punctuation, tokenize on spaces and stem ``text``.

    Parameters
    ----------
    text : object
        Text to normalize; anything that is not a plain ``str`` is converted
        with ``str()`` first.

    Returns
    -------
    list of str
        Stemmed tokens in their original order, without the filler words
        ``a``, ``an``, ``the``, ``or`` and without empty tokens.

    Notes
    -----
    Tokens are split on the space character only, so other whitespace
    (newlines, tabs, non-breaking spaces) stays inside the tokens.
    """
    # Characters deleted before tokenizing. The backslash is spelled "\\"
    # explicitly: the previous literal relied on the invalid escape "\|",
    # which triggers a SyntaxWarning on recent Python versions.
    trash_symbols = "!”#$%&’'()*+-,.:;<=>?@[]\\|/^_`{}~\""
    # Tokens carrying no meaning; '' absorbs runs of consecutive spaces.
    trash_words = {'', 'a', 'an', 'the', 'or'}

    # Text cleaning: one pass that removes every unwanted character.
    cleaned = str(text).lower().translate(str.maketrans('', '', trash_symbols))

    # Tokenizing, then stemming only the useful words.
    return [
        porter.stem(word)
        for word in cleaned.split(' ')
        if word not in trash_words
    ]

In [43]:
def knowledge_base_parsing(url):
    """Scrape the titled sections of one help-center article into a DataFrame.

    Starting at the first ``<h2>`` (or, failing that, the first ``<h3>``) of
    the ``div.article-body`` element, the parse tree is walked node by node
    with ``next_element``. Within each section the node reached at step 1 is
    taken as the title when it is plain text, and later plain-text nodes
    longer than one character are joined into the answer. A section ends at a
    node whose markup contains ``<h2>``; the walk stops at a node containing
    ``Facebook`` or at the end of the document.

    Parameters
    ----------
    url : str
        Address of the article to parse.

    Returns
    -------
    pandas.DataFrame
        Columns ``url``, ``type``, ``title``, ``norm_title``, ``answer``; one
        row per titled section. Empty when the page has no article body, no
        heading, or no titled section. A leading section whose first
        normalized title token is ``'overview'`` is dropped.
    """
    cols = ['url', 'type', 'title', 'norm_title', 'answer']

    html = get_html(url)
    body = html.find('div', class_='article-body')
    if body is None:
        return pd.DataFrame(columns=cols)

    # Prefer the first <h2>; fall back to the first <h3>.
    heading = body.h2 if body.h2 is not None else body.h3
    if heading is None:
        return pd.DataFrame(columns=cols)
    text = heading.next_element

    # Rows are collected as dicts and turned into a frame once at the end
    # (DataFrame.append no longer exists in pandas 2.x).
    records = []
    for _ in range(100):  # upper bound on sections per article
        if text is None:
            break
        title = None
        norm_title = None
        answer = []
        for i in range(100):  # upper bound on nodes per section
            text = text.next_element
            if text is None:
                # Ran off the end of the document.
                break
            node_str = str(text)
            if '<h2>' in node_str or 'Facebook' in node_str:  # end of answer or page
                break
            is_plain_text = '<' not in node_str
            if i == 1 and is_plain_text:
                title = node_str.lower()  # raw title
                norm_title = text_normalization(text)
            if i > 1 and len(text) > 1 and is_plain_text:
                answer.append(node_str)

        # Fixing some artifacts in text
        answer = (
            ' '.join(answer)
            .replace(" .", ".")
            .replace("  ", " ")
            .replace("\xa0", "")
        )

        if title is not None:
            records.append({
                'url': url,
                'type': 'How to',
                'title': title,
                'norm_title': norm_title,
                'answer': answer,
            })

        if text is None or 'Facebook' in str(text):
            break
        text = text.next_element

    df = pd.DataFrame(records, columns=cols)
    # Guard against an empty result or an empty token list before inspecting
    # the first title.
    first_norm_title = records[0]['norm_title'] if records else []
    if first_norm_title and first_norm_title[0] == 'overview':
        df = df.iloc[1:]
    return df

In [47]:
# Column layout shared with the frames returned by knowledge_base_parsing.
cols = [
    'url',
    'type',
    'title',
    'norm_title',
    'answer',
]
# Empty accumulator; the scraping loop concatenates parsed articles onto it.
data = pd.DataFrame(columns=cols)

In [41]:
knowledgebase1 = [
    # Section URLs to scrape. Use the link behind 'See all N articles'
    # inside a category; the category URL itself will not work.
    
    # From category: Tasks, Folders, Projects and Spaces
    'https://help.wrike.com/hc/en-us/sections/201893035-Tasks',
    'https://help.wrike.com/hc/en-us/sections/201834249-Folders-and-Projects',
    'https://help.wrike.com/hc/en-us/sections/202366329-Proofing-and-Approvals',
    'https://help.wrike.com/hc/en-us/sections/201893055-Communication',
    'https://help.wrike.com/hc/en-us/sections/201834259-Advanced',
    # Disabled: articles behind this link have an unusual HTML tree.
    #'https://help.wrike.com/hc/en-us/sections/201893045-Organization',
    
    # From category: Types of Accounts and Licenses
    'https://help.wrike.com/hc/en-us/sections/201893095-Accounts',
    
    # From category: Work Views
    'https://help.wrike.com/hc/en-us/sections/201893075-Gantt-Chart',
    
    # From category: Monitoring Panel
    'https://help.wrike.com/hc/en-us/sections/360004618833-Wrike-Resource',
    'https://help.wrike.com/hc/en-us/sections/201893085-Reporting',
    
    # From category: Integrations
    'https://help.wrike.com/hc/en-us/sections/202365749-Wrike-GitHub-and-JIRA-Syncs',
    'https://help.wrike.com/hc/en-us/sections/202423365-Everything-Else',
    
    # From category: Account Management
    'https://help.wrike.com/hc/en-us/sections/360000935413-Troubleshooting',
    'https://help.wrike.com/hc/en-us/sections/201825379-General-Account-Management',
    'https://help.wrike.com/hc/en-us/sections/201828169-Personal-User-License',
    
    # From category: Security
    'https://help.wrike.com/hc/en-us/sections/201893105-Single-Sign-On',
    'https://help.wrike.com/hc/en-us/sections/201834349-More-Security-Features',
    
]

In [48]:
# Articles skipped because their pages have an unusual HTML tree.
# Defined once instead of being rebuilt for every chapter.
excluded_urls = [
    'https://help.wrike.com/hc/en-us/articles/210323225-Task-View',
    'https://help.wrike.com/hc/en-us/articles/360024767374-Wrike-Resource-Overview',
]

# Collect the per-article frames and concatenate once at the end; calling
# pd.concat inside the loop copies the growing table on every iteration.
frames = [data]
for chapter in knowledgebase1:
    articles = get_list_of_articles(chapter)
    for url in articles:
        if url in excluded_urls:
            continue
        print(url + ' added')
        frames.append(knowledge_base_parsing(url))
data = pd.concat(frames)

https://help.wrike.com/hc/en-us/articles/209603709-Tasks added
https://help.wrike.com/hc/en-us/articles/210323245-Subtasks added
https://help.wrike.com/hc/en-us/articles/115005055469-Subtask-Alignment- added
https://help.wrike.com/hc/en-us/articles/209603309-Attachments added
https://help.wrike.com/hc/en-us/articles/209603689-Milestones added
https://help.wrike.com/hc/en-us/articles/210323205-Backlogged-Tasks added
https://help.wrike.com/hc/en-us/articles/210323325-List-View-Sorting-and-Prioritizing-Tasks added
https://help.wrike.com/hc/en-us/articles/209603889-Mass-Editing added
https://help.wrike.com/hc/en-us/articles/209603849-Sharing-Tasks added
https://help.wrike.com/hc/en-us/articles/209603869-Duplicate-Tasks-and-Subtasks added
https://help.wrike.com/hc/en-us/articles/115005179609-Star-a-Task added
https://help.wrike.com/hc/en-us/articles/209603829-Importance added
https://help.wrike.com/hc/en-us/articles/209603789-Task-Status added
https://help.wrike.com/hc/en-us/articles/209603

https://help.wrike.com/hc/en-us/articles/209605189-Wrike-and-JIRA-Sync-Setup-Guide added
https://help.wrike.com/hc/en-us/articles/360016043273-Connect-Tableau-and-Wrike added
https://help.wrike.com/hc/en-us/articles/115001825869-Microsoft-Teams added
https://help.wrike.com/hc/en-us/articles/211587165-Slack added
https://help.wrike.com/hc/en-us/articles/210409445-Developer-Portal added
https://help.wrike.com/hc/en-us/articles/209605169-Zapier added
https://help.wrike.com/hc/en-us/articles/210324205-G-Suite added
https://help.wrike.com/hc/en-us/articles/209605089-G-Suite-Implementation-Guide added
https://help.wrike.com/hc/en-us/articles/214413365-Google-Calendar-Integration added
https://help.wrike.com/hc/en-us/articles/210324105-Integrated-Cloud-Storage-Apps added
https://help.wrike.com/hc/en-us/articles/360023074273-I-Sent-an-Email-to-Wrike-but-It-Didn-t-Create-or-Update-a-Task-Tasks-Aren-t-Created-out-of-Emails added
https://help.wrike.com/hc/en-us/articles/360021787894-Finding-a-Tas

In [49]:
# Concatenation kept each article's own row labels; renumber rows 0..n-1.
data = data.reset_index(drop=True)
data

Unnamed: 0,url,type,title,norm_title,answer
0,https://help.wrike.com/hc/en-us/articles/20960...,How to,create a task,"[creat, task]",Select a Folder or Project from the left-hand ...
1,https://help.wrike.com/hc/en-us/articles/20960...,How to,assign a task,"[assign, task]",Choose the names of people to whom you want to...
2,https://help.wrike.com/hc/en-us/articles/20960...,How to,schedule a task,"[schedul, task]","In the List view, you can set or change the st..."
3,https://help.wrike.com/hc/en-us/articles/20960...,How to,tag a task (organize tasks into folders),"[tag, task, organ, task, into, folder]","When you tag a task, you're adding it to a Fol..."
4,https://help.wrike.com/hc/en-us/articles/20960...,How to,follow a task,"[follow, task]",Following a task is agreat way to stay up-to-d...
...,...,...,...,...,...
817,https://help.wrike.com/hc/en-us/articles/21032...,How to,authentication apps,"[authent, app]",To use 2-step verification you must have an au...
818,https://help.wrike.com/hc/en-us/articles/21032...,How to,enable 2-step verification (for a user license),"[enabl, 2step, verif, for, user, licens]",Download an authentication app on your mobile ...
819,https://help.wrike.com/hc/en-us/articles/21032...,How to,enable 2-step verification (for all users on a...,"[enabl, 2step, verif, for, all, user, on, subs...",This action must be done by an account admin. ...
820,https://help.wrike.com/hc/en-us/articles/21032...,How to,reconfigure 2-step verification,"[reconfigur, 2step, verif]",If you would like to use 2-step verification w...


In [50]:
# Clean data: keep the first row per title, drop titles of 50 characters
# or more, then renumber the remaining rows from 0.
data = (
    data
    .drop_duplicates(subset="title")
    .loc[lambda frame: frame['title'].map(len) < 50]
    .reset_index(drop=True)
)
data

Unnamed: 0,url,type,title,norm_title,answer
0,https://help.wrike.com/hc/en-us/articles/20960...,How to,create a task,"[creat, task]",Select a Folder or Project from the left-hand ...
1,https://help.wrike.com/hc/en-us/articles/20960...,How to,assign a task,"[assign, task]",Choose the names of people to whom you want to...
2,https://help.wrike.com/hc/en-us/articles/20960...,How to,schedule a task,"[schedul, task]","In the List view, you can set or change the st..."
3,https://help.wrike.com/hc/en-us/articles/20960...,How to,tag a task (organize tasks into folders),"[tag, task, organ, task, into, folder]","When you tag a task, you're adding it to a Fol..."
4,https://help.wrike.com/hc/en-us/articles/20960...,How to,follow a task,"[follow, task]",Following a task is agreat way to stay up-to-d...
...,...,...,...,...,...
384,https://help.wrike.com/hc/en-us/articles/20960...,How to,perform an account backup (with attachments),"[perform, account, backup, with, attach]","You can run a backup, which includes attachmen..."
385,https://help.wrike.com/hc/en-us/articles/21032...,How to,authentication apps,"[authent, app]",To use 2-step verification you must have an au...
386,https://help.wrike.com/hc/en-us/articles/21032...,How to,enable 2-step verification (for a user license),"[enabl, 2step, verif, for, user, licens]",Download an authentication app on your mobile ...
387,https://help.wrike.com/hc/en-us/articles/21032...,How to,reconfigure 2-step verification,"[reconfigur, 2step, verif]",If you would like to use 2-step verification w...


In [83]:
# Persist the cleaned table to CSV; index=False leaves the row index out of the file.
data.to_csv('how_to_data.csv', index=False)

<h2>TF-IDF model</h2>

In [None]:
# Reload the saved table. CSV stores the norm_title lists as their text
# representation, so they are parsed back into lists here; otherwise the
# ' '.join over norm_title in the next cell would join single characters.
# keep_default_na=False keeps empty or 'NA'-like strings as text, not NaN.
data = pd.read_csv(
    'how_to_data.csv',
    converters={'norm_title': ast.literal_eval},
    keep_default_na=False,
)

In [51]:
# One space-joined document per row of norm_title. Degenerate titles (a lone
# newline or nothing at all) are filtered out; unlike list.remove this does
# not raise when such an entry is absent and it drops every occurrence.
corpus = [
    doc
    for doc in map(' '.join, data['norm_title'].values)
    if doc not in ('\n', '')
]

In [52]:
# Turn non-breaking spaces into ordinary spaces in every document.
corpus = [doc.replace('\xa0', ' ') for doc in corpus]

In [53]:
# Prefix every document with 'How to '. Note: re-running this cell prepends the prefix again.
corpus = ['How to ' + s for s in corpus]

In [54]:
corpus

['How to creat task',
 'How to assign task',
 'How to schedul task',
 'How to tag task organ task into folder',
 'How to follow task',
 'How to unfollow task',
 'How to print task',
 'How to add depend to task',
 'How to view task',
 'How to import inform',
 'How to creat a subtask',
 'How to view task subtask',
 'How to edit subtask',
 'How to convert task into subtask',
 'How to convert subtask into task',
 'How to add subtask to multipl parent task',
 'How to chang subtask parent task',
 'How to reorder subtask on list view',
 'How to expand parent task date',
 'How to shift task date',
 'How to except',
 'How to disabl align popup',
 'How to attach file',
 'How to edit attach file',
 'How to creat googl drive file from wrike',
 'How to download attach',
 'How to delet attach',
 'How to more info',
 'How to creat mileston task',
 'How to mileston on gantt chart',
 'How to sort option',
 'How to chang how task are sort',
 'How to mass edit task',
 'How to avail mass edit option',
 'H

In [56]:
# Fit the TF-IDF model on the title corpus; trans_matrix holds one row per document.
vectorizer = TfidfVectorizer()
trans_matrix = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2. Listing the fitted
# vocabulary ordered by column index gives the same list on every version.
print(sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get))

['2step', '365', 'about', 'access', 'account', 'accountwid', 'action', 'ad', 'add', 'addit', 'adjust', 'admin', 'advanc', 'aggreg', 'align', 'all', 'and', 'anoth', 'app', 'appli', 'approv', 'are', 'arrang', 'as', 'assign', 'attach', 'attribut', 'authent', 'autoassign', 'automat', 'avail', 'avatar', 'back', 'backlog', 'backup', 'base', 'be', 'befor', 'benefit', 'best', 'between', 'bill', 'bitbucket', 'brand', 'browser', 'builder', 'busi', 'by', 'bynder', 'calcul', 'calendar', 'can', 'case', 'caus', 'center', 'chain', 'chang', 'charact', 'chart', 'check', 'choos', 'collaps', 'column', 'command', 'comment', 'compani', 'comparison', 'compat', 'configur', 'confirm', 'constraint', 'contact', 'convert', 'copi', 'creat', 'credenti', 'critic', 'csv', 'custom', 'daili', 'dashboard', 'data', 'date', 'day', 'deactiv', 'decid', 'default', 'delet', 'depend', 'detail', 'disabl', 'document', 'domain', 'down', 'download', 'drive', 'duplic', 'durat', 'edit', 'editor', 'effort', 'email', 'empti', 'enabl'

In [57]:
# Number of distinct terms learned by the vectorizer (get_feature_names()
# was removed in scikit-learn 1.2; vocabulary_ exists on every version).
len(vectorizer.vocabulary_)

378

In [79]:
def predict(request):
    """Return the knowledge-base entry whose title best matches ``request``.

    The request is normalized with ``text_normalization``, transformed with
    the fitted ``vectorizer`` and compared with every row of ``trans_matrix``
    by dot product. The highest-scoring entry of ``corpus`` wins (the first
    one on ties) and is mapped back to its row in ``data``.

    Parameters
    ----------
    request : str
        Free-form user question.

    Returns
    -------
    tuple of (str, str) or str
        ``(title, answer)`` of the best match, or the string
        ``"Couldn't find answer"`` when no document scores above zero or the
        winning title has no matching row in ``data``.
    """
    not_found = "Couldn\'t find answer"

    norm_request = text_normalization(request)
    request_vector = vectorizer.transform([' '.join(norm_request)])

    # Similarity of the request to every document in one sparse product,
    # instead of densifying each row inside a Python loop.
    scores = (trans_matrix @ request_vector.T).toarray().ravel()
    if scores.size == 0 or scores.max() <= 0:
        return not_found
    pred_title = corpus[int(scores.argmax())]  # argmax keeps the first maximum

    # Rebuild the corpus strings from data, including the '\xa0' -> ' '
    # replacement that was applied to corpus; without it titles containing a
    # non-breaking space could never be matched.
    corpus_titles = 'How to ' + (
        data['norm_title'].map(' '.join).str.replace('\xa0', ' ', regex=False)
    )
    pred_row = data[corpus_titles == pred_title]
    if len(pred_row) == 0:
        return not_found

    best = pred_row.iloc[0]
    return 'How to ' + best['title'], best['answer']

<h2>Examples of requests</h2>

In [80]:
predict('How to create a task')

('How to create a task',
 'Select a Folder or Project from the left-hand Folder tree (this is where your task will be created). Click the green plus sign in the Workspace\'s upper left-hand corner and select "Task". Type a task name and press "Enter". Your task is created and you can begin adding task attributes from the Task View (assignee, due dates, and more) or right-click on a task from the List View to access quick edit options.')

In [81]:
predict('I need create a task')

('How to create a task',
 'Select a Folder or Project from the left-hand Folder tree (this is where your task will be created). Click the green plus sign in the Workspace\'s upper left-hand corner and select "Task". Type a task name and press "Enter". Your task is created and you can begin adding task attributes from the Task View (assignee, due dates, and more) or right-click on a task from the List View to access quick edit options.')

In [82]:
predict('I need change view')

('How to change the default view',
 'Regular and External Users on all account types can change the default view of Folders and Projects. Change the default view for existing Folders and Projects. Right click on a Folder/Project from the Folder tree in the left-hand Navigation panel. Hover over “Set default view”. Select a view. The Folder/Project opens in this view whenever you (or anyone who the Folder is shared with) opens it.')