In [1]:
import pandas as pd
import nltk

In [2]:
from joblib import dump, load

## Data for validation

In [3]:
data = pd.read_csv('all_dataset.csv')
data = data.drop(columns='Unnamed: 0')

In [4]:
data.groupby(by='source').count()

Unnamed: 0_level_0,answer_text,questions,url_1,url_2,url_3,url_4,section_1,section_2,section_3,section_4,section_text,keywords,text_len
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
from_alex,0,0,1093,1093,1093,1093,1093,1093,1093,1090,1093,0,0
from_amir,0,0,1272,1272,1272,1272,1272,1272,1272,1269,1260,1261,1261
from_crawl_spider,168,168,168,168,168,27,168,168,168,27,0,0,0
from_dirty_questions,207,191,207,207,207,60,207,207,207,60,0,0,0
from_wrike_org,0,168,168,168,168,41,168,168,168,35,0,0,0


In [96]:
validate_df = data[(data['source'] == 'from_wrike_org') & (data['questions'] != '')]

In [97]:
validate_df = validate_df[['questions', 'section_1', 
                           'section_2', 'section_3', 'section_4']].rename(columns={'questions':'text'})

In [6]:
generated = pd.read_csv('dataset_text_with_questions.csv')
generated = generated.drop(columns='Unnamed: 0')

In [7]:
generated.groupby(by='source').count()

Unnamed: 0_level_0,answer_text,questions,url_1,url_2,url_3,url_4,section_1,section_2,section_3,section_4,section_text,keywords,text_len,col_of_questions
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
from_alex,0,2797,2797,2797,2797,2797,2797,2797,2797,2791,2797,0,0,2797
from_amir,0,3562,3562,3562,3562,3562,3562,3562,3562,3551,3562,3562,3562,3562


In [306]:
from_alex = data[data['source'] == 'from_alex'][['section_text','section_1','section_2', 
                                                 'section_3', 'section_4']].rename(columns={'section_text':'text'})
from_amir = data[data['source'] == 'from_amir'][['section_text','section_1','section_2', 
                                                 'section_3', 'section_4']].rename(columns={'section_text':'text'})
from_crawl_spider = data[data['source'] == 'from_crawl_spider'][['questions','section_1','section_2', 
                                                 'section_3', 'section_4']].rename(columns={'questions':'text'})
from_dirty_question = data[data['source'] == 'from_dirty_question'][['questions','section_1','section_2', 
                                                 'section_3', 'section_4']].rename(columns={'questions':'text'})
#generated_train = generated[['questions','section_1','section_2', 
#                             'section_3', 'section_4']].rename(columns={'questions':'text'})

In [307]:
train_data = pd.concat([from_amir, from_crawl_spider, from_dirty_question], 
                       ignore_index=True)

In [308]:
train_data.head()

Unnamed: 0,text,section_1,section_2,section_3,section_4
0,Create a Task\r\n\r\nSelect a Folder or Projec...,"Tasks, Folders, Projects and Spaces",Tasks,Tasks,Create a Task
1,Assign a Task\r\nChoose the names of people to...,"Tasks, Folders, Projects and Spaces",Tasks,Tasks,Assign a Task
2,"Schedule a Task\r\nIn the List view, you can s...","Tasks, Folders, Projects and Spaces",Tasks,Tasks,Schedule a Task
3,Tag a Task (Organize Tasks into Folders)\r\nWh...,"Tasks, Folders, Projects and Spaces",Tasks,Tasks,Tag a Task (Organize Tasks into Folders)
4,Follow a Task\r\nFollowing a task is a great w...,"Tasks, Folders, Projects and Spaces",Tasks,Tasks,Follow a Task


In [309]:
train_data.shape

(1440, 5)

In [310]:
information_section_list = [
       'Wrike and Bitbucket Sync: Overview',
       'Wrike and GitLab Sync: Overview',
       'Wrike and GitHub Sync: Overview',
       'Wrike and JIRA Sync: Overview',
       'Wrike and JIRA Sync: Setup Guide','Step 1: Learning the Fundamentals',
       'Step 2: Setting Up Your Workspace',
       'Step 3: Team Onboarding and Collaboration',
       'Step 4: Monitoring Work Progress',
       'Step 5: Tailoring Wrike to Your Team’s Needs',
       'Step 6. Making Remote Work Productive', 'Step 1: Getting Settled',
       'Step 2: Launching New Projects', 'Step 3: Collaborating in Wrike',
       'Step 4: Work from Home Productively']

In [311]:
train_data = train_data[~train_data['section_3'].isin(information_section_list)]
train_data = train_data[train_data['section_4'] != 'Overview']
train_data = train_data[train_data['text'] != '']

In [312]:
train_data.shape

(1163, 5)

In [313]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')

In [314]:
from nltk.corpus import stopwords

english_stopwords = stopwords.words("english")

In [315]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [316]:
#Preprocess function
def preprocess_text(text, tokenizer=RegexpTokenizer(r'\w+'), 
                    lemmatizer=WordNetLemmatizer(), stopwords=stopwords.words("english")):
    tokens = tokenizer.tokenize(text)
    tokens = [token.lower() for token in tokens]
    tokens = [token for token in tokens if token not in english_stopwords\
              and token != " " ]
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    clean_text = " ".join(lemmas)
    return clean_text

In [317]:
train_data['text_cleaned'] = train_data['text'].apply(lambda x: preprocess_text(str(x)))

In [318]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,1))
TF_X = vectorizer.fit_transform(train_data['text_cleaned'])

In [319]:
section_4_data = train_data.dropna(subset=['section_4'])
section_4_data = section_4_data[section_4_data['section_4'] != 'Important Information']
TF_X_4 = vectorizer.transform(section_4_data['text_cleaned'])

## Dumping vectorizer

In [320]:
dump(vectorizer, 'vectorizer.joblib')

['vectorizer.joblib']

In [321]:
from sklearn.preprocessing import OneHotEncoder

section_1_encoder = OneHotEncoder(handle_unknown='ignore')
section_2_encoder = OneHotEncoder(handle_unknown='ignore')
section_3_encoder = OneHotEncoder(handle_unknown='ignore')
section_4_encoder = OneHotEncoder(handle_unknown='ignore')

section_1_ohe = section_1_encoder.fit_transform(train_data['section_1'].values.reshape(-1, 1))
section_2_ohe = section_2_encoder.fit_transform(train_data['section_2'].values.reshape(-1, 1))
section_3_ohe = section_3_encoder.fit_transform(train_data['section_3'].values.reshape(-1, 1))
section_4_ohe = section_4_encoder.fit_transform(section_4_data['section_4'].values.reshape(-1, 1))

## Dumping encoders

In [322]:
dump(section_1_encoder, 'section_1_encoder.joblib')
dump(section_2_encoder, 'section_2_encoder.joblib')
dump(section_3_encoder, 'section_3_encoder.joblib')
dump(section_4_encoder, 'section_4_encoder.joblib')

['section_4_encoder.joblib']

In [323]:
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.multioutput import ClassifierChain
from sklearn.multiclass import OneVsRestClassifier

In [324]:
def predict_one_class(train_data, submit_data, y, model, n_splits=3, silent=False):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    to_check = pd.DataFrame(train_data.copy())
    to_check[f'predict_proba'] = 0
    pred = np.zeros(len(submit_data))
    y_arr = y
    for i, (train_index, valid_index) in enumerate(skf.split(train_data, y_arr)):        
        X_train = train_data[train_index]
        X_valid = train_data[valid_index]
        y_train = y_arr[train_index]
        y_valid = y_arr[valid_index] 
        
        model.fit(X_train, y_train)
        
        prediction_skf = model.predict_proba(submit_data)[:,1]
        pred += prediction_skf / skf.n_splits  
        
        prediction = model.predict_proba(X_valid)[:,1]
        to_check.loc[valid_index, f'predict_proba'] = prediction
                
    local_auc = roc_auc_score(y, to_check[f'predict_proba'])
    
    if not silent: 
        print(f"ROC_AUC_local: {local_auc}")
    
    return pred,to_check[f'predict_proba'],local_auc

In [325]:
import numpy as np

In [326]:
Y = section_1_ohe.toarray()
X_train, X_test, Y_train, Y_test = train_test_split(TF_X.toarray(), Y, test_size=0.2)

cv_scores = []
probas = []
cross_vals = []

columns = Y.shape[1]
for i in range(columns):
    print(section_1_encoder.categories_[0][i])
    y_train = Y_train[:, i]
    clf = LogisticRegression()
    proba, cross_val, cv_score = predict_one_class(X_train, X_test, y_train, clf, n_splits=3)  
    probas.append(proba)
    cross_vals.append(cross_val)
    cv_scores.append(cv_score)
print ()
print(f'TOTAL CV:{np.mean(cv_scores)}')

Account Management
ROC_AUC_local: 0.9332879420162531
Apps
ROC_AUC_local: 0.9458931199411333
Integrations
ROC_AUC_local: 0.9576719576719577
Monitoring Panel
ROC_AUC_local: 0.945242178406092
Security
ROC_AUC_local: 0.9575737218837591
Tasks, Folders, Projects and Spaces
ROC_AUC_local: 0.9323251110815877
Types of Accounts and Licenses
ROC_AUC_local: 0.9561003719419561
Work Views
ROC_AUC_local: 0.9487903670230353

TOTAL CV:0.9471105962457218


In [327]:
import warnings
warnings.filterwarnings("ignore")

In [328]:
Y = section_2_ohe.toarray()

X_train, X_test, Y_train, Y_test = train_test_split(TF_X.toarray(), Y, test_size=0.2)

cv_scores = []
probas = []
cross_vals = []

columns = Y.shape[1]
for i in range(columns):
    print(section_2_encoder.categories_[0][i])
    y_train = Y_train[:, i]
    clf = LogisticRegression()
    try:
        proba, cross_val, cv_score = predict_one_class(X_train, X_test, y_train, clf, n_splits=3)
    except:
        pass
    probas.append(proba)
    cross_vals.append(cross_val)
    cv_scores.append(cv_score)
print ()
print(f'TOTAL CV:{np.mean(cv_scores)}')

Accounts
ROC_AUC_local: 0.9732969034608379
Add-ins and Extensions
ROC_AUC_local: 0.9297737306843268
Advanced
ROC_AUC_local: 0.9496131766473743
All the Other Views
ROC_AUC_local: 0.8522709413798523
Automated Monitoring
ROC_AUC_local: 0.665948275862069
Billing
ROC_AUC_local: 0.8950416214259862
Calendars
ROC_AUC_local: 0.9801695842450766
Communication
ROC_AUC_local: 0.9068456181778753
Desktop
ROC_AUC_local: 0.9906024334751211
Email
ROC_AUC_local: 0.988537037037037
Everything Else
ROC_AUC_local: 0.9638030888030888
Folders and Projects
ROC_AUC_local: 0.9754189944134077
Gantt Chart
ROC_AUC_local: 0.9615096837319059
General Account Management
ROC_AUC_local: 0.9477582846003898
Import and Export
ROC_AUC_local: 0.983138998406174
Licenses
ROC_AUC_local: 0.988095238095238
Mobile
ROC_AUC_local: 0.9174499089253187
More Monitoring Panel Views
ROC_AUC_local: 0.990718232044199
More Security Features
ROC_AUC_local: 0.958574299659571
Organization
ROC_AUC_local: 0.8105494505494506
Personal User License
RO

In [329]:
from tqdm import tqdm

In [330]:
Y = section_3_ohe.toarray()

X_train, X_test, Y_train, Y_test = train_test_split(TF_X.toarray(), Y, test_size=0.2)

cv_scores = []
probas = []
cross_vals = []

columns = Y.shape[1]
for i in tqdm(range(columns)):
    y_train = Y_train[:, i]
    clf = LogisticRegression()
    try:
        proba, cross_val, cv_score = predict_one_class(X_train, X_test, y_train, clf, n_splits=3, silent=True)  
    except ValueError: ## ошибка недостаточного кол-ва примеров
        pass
    probas.append(proba)
    cross_vals.append(cross_val)
    cv_scores.append(cv_score)
print ()
print(f'TOTAL CV:{np.mean(cv_scores)}')

100%|██████████| 224/224 [01:37<00:00,  2.30it/s]


TOTAL CV:0.8926643034522124





In [331]:
Y = section_1_ohe.toarray()
base_lr = LogisticRegression()
section_1_ovr = OneVsRestClassifier(base_lr)
section_1_ovr.fit(TF_X.toarray(), Y)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [332]:
Y = section_2_ohe.toarray()
base_lr = LogisticRegression()
section_2_ovr = OneVsRestClassifier(base_lr)
section_2_ovr.fit(TF_X.toarray(), Y)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [333]:
Y = section_3_ohe.toarray()
base_lr = LogisticRegression()
section_3_ovr = OneVsRestClassifier(base_lr)
section_3_ovr.fit(TF_X.toarray(), Y)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [334]:
Y = section_4_ohe.toarray()
base_lr = LogisticRegression()
section_4_ovr = OneVsRestClassifier(base_lr)
section_4_ovr.fit(TF_X_4.toarray(), Y)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

## Dumping models

In [335]:
dump(section_1_ovr, 'section_1_ovr.joblib')
dump(section_2_ovr, 'section_2_ovr.joblib')
dump(section_3_ovr, 'section_3_ovr.joblib')
dump(section_4_ovr, 'section_4_ovr.joblib')

['section_4_ovr.joblib']

In [336]:
def predict_question_classes(text, vectorizer, encoders=[], models=[]):
    probable_classes = []
    clean_text = preprocess_text(text)
    X = [clean_text]
    vectorized = vectorizer.transform(X)
    for encoder, model in zip(encoders, models):
        prediction = model.predict_proba(vectorized)[0]
        categories = encoder.categories_[0]
        answer = sorted(zip(prediction, categories), reverse=True)[0]
        probable_classes.append(f'{answer[1]}: {answer[0]}')
    return probable_classes

In [339]:
predict_question_classes('how export to excel', vectorizer=vectorizer, 
                         encoders=[section_1_encoder, section_2_encoder, section_3_encoder], 
                         models=[section_1_ovr, section_2_ovr, section_3_ovr])

['Integrations: 0.33547559084839956',
 'Import and Export: 0.14725507237002888',
 'Wrike Analyze: 0.014321643501742852']

In [340]:
def get_validate_results(df, vectorizer, encoders=[], models=[], sections=[]):
    data = df.reset_index(drop=True)
    data['clean_text'] = data['text'].apply(lambda x: preprocess_text(str(x)))
    X = vectorizer.transform(data['clean_text'])
    for i, row in enumerate(X):
        for encoder, model, section in zip(encoders, models, sections):
            prediction = model.predict_proba(row)[0]
            categories = encoder.categories_[0]
            answer = sorted(zip(prediction, categories), reverse=True)[0]
            data.at[i, section] = answer[1]
    return data

In [341]:
answer_df = get_validate_results(validate_df, vectorizer=vectorizer, 
                         encoders=[section_1_encoder, section_2_encoder, section_3_encoder], 
                         models=[section_1_ovr, section_2_ovr, section_3_ovr],
                         sections=['section_1_pred', 'section_2_pred', 'section_3_pred'])

In [342]:
answer_df.head()

Unnamed: 0,text,section_1,section_2,section_3,section_4,clean_text,section_1_pred,section_2_pred,section_3_pred
0,"Hi again, is there a way to auto-assign based...","Tasks, Folders, Projects and Spaces",Advanced,Advanced Custom Workflows,,hi way auto assign based status project,"Tasks, Folders, Projects and Spaces",Advanced,Custom Statuses and Workflows
1,Hi there! How do I access the workload feature?,Monitoring Panel,Wrike Resource,Workload Charts,,hi access workload feature,"Tasks, Folders, Projects and Spaces",Wrike Resource,Workload Charts
2,How about the attachments (if i back up with a...,Security,More Security Features,Account Backup,Perform an Account Backup (with attachments),attachment back attachment available,"Tasks, Folders, Projects and Spaces",Tasks,Attachments
3,"How can me allocate a ""'budget"" on a task ?","Tasks, Folders, Projects and Spaces",Advanced,Custom Fields,"Create, Add, and View Custom Fields",allocate budget task,"Tasks, Folders, Projects and Spaces",Tasks,Milestones
4,"Thanks, when it expires, what features are goi...",Types of Accounts and Licenses,Accounts,Free,,thanks expires feature going left free use may...,"Tasks, Folders, Projects and Spaces",Advanced,Log In as Another User


In [243]:
from sklearn.metrics import classification_report
from sklearn import preprocessing

In [343]:
for true, pred in zip(['section_1', 'section_2', 'section_3'],
                      ['section_1_pred', 'section_2_pred', 'section_3_pred']):
    le = preprocessing.LabelEncoder()
    le.fit(answer_df[true].tolist() + answer_df[pred].tolist())
    true_labels = le.transform(answer_df[true])
    pred_labels = le.transform(answer_df[pred])
    print(classification_report(true_labels, pred_labels, target_names=le.classes_))

                                     precision    recall  f1-score   support

                 Account Management       0.60      0.35      0.44        26
                               Apps       0.00      0.00      0.00         1
                       Integrations       0.77      0.53      0.62        19
                   Monitoring Panel       0.78      0.32      0.45        22
                           Security       0.00      0.00      0.00         2
Tasks, Folders, Projects and Spaces       0.54      0.94      0.69        70
     Types of Accounts and Licenses       0.00      0.00      0.00        10
                         Work Views       0.78      0.39      0.52        18

                           accuracy                           0.59       168
                          macro avg       0.43      0.32      0.34       168
                       weighted avg       0.59      0.59      0.54       168

                             precision    recall  f1-score   support

   

In [344]:
import eli5
from eli5 import explain_weights, explain_prediction
from eli5.formatters import format_as_html, format_as_text, format_html_styles, fields

In [345]:
from IPython.core.display import display, HTML
show_html = lambda html: display(HTML(html))
show_html_expl = lambda expl, **kwargs: show_html(format_as_html(expl, include_styles=False, **kwargs))
show_html(format_html_styles())

In [346]:
Y = section_1_ohe.toarray()

columns = Y.shape[1]
for i in range(columns):
    print(section_1_encoder.categories_[0][i])
    y_train = Y[:, i]
    clf = LogisticRegression()
    clf.fit(TF_X.toarray(), y_train)
    display(eli5.show_weights(clf, vec=vectorizer, horizontal_layout=False))

Account Management


Weight?,Feature
+2.813,group
+2.811,account
+2.499,email
+2.057,admin
+1.912,profile
+1.775,situation
+1.597,support
+1.467,see
+1.464,setting
+1.332,scim


Apps


Weight?,Feature
+3.650,app
+2.619,tap
+1.563,bottom
+1.387,window
+1.274,open
+1.189,submit
+1.164,view
+1.040,offline
+0.943,desktop
+0.929,update


Integrations


Weight?,Feature
+3.102,wrike
+2.249,add
+2.216,sync
+2.215,integration
+2.094,google
+2.057,extension
+1.961,outlook
+1.769,import
+1.750,gmail
+1.720,slack


Monitoring Panel


Weight?,Feature
+3.891,report
+2.943,widget
+2.716,calendar
+2.545,workload
+2.271,dashboard
+2.163,board
+2.018,effort
+1.953,layer
+1.947,chart
+1.765,job


Security


Weight?,Feature
+2.441,password
+1.959,single
+1.689,verification
+1.609,sign
+1.459,important
+1.323,365
+1.288,office
+1.133,azure
+1.093,deactivate
+1.059,domain


Tasks, Folders, Projects and Spaces


Weight?,Feature
+1.906,form
+1.755,workflow
+1.680,request
+1.669,space
+1.543,comment
+1.523,field
+1.501,subtask
+1.456,project
+1.392,milestone
+1.387,guest


Types of Accounts and Licenses


Weight?,Feature
+1.921,feature
+1.837,
+1.581,license
+0.968,performance
+0.954,contact
+0.902,professional
+0.857,tracking
+0.819,work
+0.818,permission
+0.785,salesforce


Work Views


Weight?,Feature
+2.752,dependency
+2.746,chart
+2.687,gantt
+2.479,view
+2.020,table
+1.943,timelog
+1.783,date
+1.686,snapshot
+1.584,category
+1.411,sorting
