In [1]:
import pandas as pd
import nltk

In [2]:
from joblib import dump, load

## Data for validation

In [3]:
data = pd.read_csv('all_dataset.csv')
data = data.drop(columns='Unnamed: 0')

In [4]:
data.groupby(by='source').count()

Unnamed: 0_level_0,answer_text,questions,url_1,url_2,url_3,url_4,section_1,section_2,section_3,section_4,section_text,keywords,text_len
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
from_alex,0,0,1093,1093,1093,1093,1093,1093,1093,1090,1093,0,0
from_amir,0,0,1272,1272,1272,1272,1272,1272,1272,1269,1260,1261,1261
from_crawl_spider,168,168,168,168,168,27,168,168,168,27,0,0,0
from_dirty_questions,207,191,207,207,207,60,207,207,207,60,0,0,0
from_wrike_org,0,168,168,168,168,41,168,168,168,35,0,0,0


In [60]:
validate_df = data[(data['source'] == 'from_dirty_questions') & (data['questions'] != '')]

In [61]:
validate_df = validate_df[['questions', 'section_1', 
                           'section_2', 'section_3', 'section_4']].rename(columns={'questions':'text'})

In [6]:
generated = pd.read_csv('dataset_text_with_questions.csv')
generated = generated.drop(columns='Unnamed: 0')

In [7]:
generated.groupby(by='source').count()

Unnamed: 0_level_0,answer_text,questions,url_1,url_2,url_3,url_4,section_1,section_2,section_3,section_4,section_text,keywords,text_len,col_of_questions
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
from_alex,0,2797,2797,2797,2797,2797,2797,2797,2797,2791,2797,0,0,2797
from_amir,0,3562,3562,3562,3562,3562,3562,3562,3562,3551,3562,3562,3562,3562


In [8]:
from_alex = data[data['source'] == 'from_alex'][['section_text','section_1','section_2', 
                                                 'section_3', 'section_4']].rename(columns={'section_text':'text'})
from_amir = data[data['source'] == 'from_amir'][['section_text','section_1','section_2', 
                                                 'section_3', 'section_4']].rename(columns={'section_text':'text'})
from_crawl_spider = data[data['source'] == 'from_crawl_spider'][['questions','section_1','section_2', 
                                                 'section_3', 'section_4']].rename(columns={'questions':'text'})
from_wrike_org = data[data['source'] == 'from_wrike_org'][['questions','section_1','section_2', 
                                                 'section_3', 'section_4']].rename(columns={'questions':'text'})
generated_train = generated[['questions','section_1','section_2', 
                             'section_3', 'section_4']].rename(columns={'questions':'text'})

In [9]:
train_data = pd.concat([from_alex, from_amir, from_crawl_spider, 
                        from_wrike_org, generated_train], 
                       ignore_index=True)

In [10]:
train_data.head()

Unnamed: 0,text,section_1,section_2,section_3,section_4
0,A personalized and up-to-date profile adds a p...,Team Member Guide (for account users),Team Member Guide (for account users),Step 2: Setting Up Your Workspace,Setting Up Your Profile
1,Update the account setting that will affect al...,Team Member Guide (for account users),Team Member Guide (for account users),Step 2: Setting Up Your Workspace,Basic Account Administration
2,Set up SpacesSpaces are hubs that store all wo...,Team Member Guide (for account users),Team Member Guide (for account users),Step 2: Setting Up Your Workspace,Organizing Work With Spaces and Folders
3,"Whether you’re launching a new product, writin...",Team Member Guide (for account users),Team Member Guide (for account users),Step 2: Setting Up Your Workspace,Launching New Projects
4,You’ve spent some time choosing the best solut...,Team Member Guide (for account users),Team Member Guide (for account users),Step 3: Team Onboarding and Collaboration,Team Onboarding


In [11]:
train_data.shape

(9060, 5)

In [12]:
information_section_list = [
       'Wrike and Bitbucket Sync: Overview',
       'Wrike and GitLab Sync: Overview',
       'Wrike and GitHub Sync: Overview',
       'Wrike and JIRA Sync: Overview',
       'Wrike and JIRA Sync: Setup Guide','Step 1: Learning the Fundamentals',
       'Step 2: Setting Up Your Workspace',
       'Step 3: Team Onboarding and Collaboration',
       'Step 4: Monitoring Work Progress',
       'Step 5: Tailoring Wrike to Your Team’s Needs',
       'Step 6. Making Remote Work Productive', 'Step 1: Getting Settled',
       'Step 2: Launching New Projects', 'Step 3: Collaborating in Wrike',
       'Step 4: Work from Home Productively']

In [13]:
train_data = train_data[~train_data['section_3'].isin(information_section_list)]
train_data = train_data[train_data['section_4'] != 'Overview']
train_data = train_data[train_data['text'] != '']

In [14]:
train_data.shape

(7112, 5)

In [15]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')

In [16]:
from nltk.corpus import stopwords

english_stopwords = stopwords.words("english")

In [17]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [18]:
#Preprocess function
def preprocess_text(text, tokenizer=RegexpTokenizer(r'\w+'), 
                    lemmatizer=WordNetLemmatizer(), stopwords=stopwords.words("english")):
    tokens = tokenizer.tokenize(text)
    tokens = [token.lower() for token in tokens]
    tokens = [token for token in tokens if token not in english_stopwords\
              and token != " " ]
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    clean_text = " ".join(lemmas)
    return clean_text

In [19]:
train_data['text_cleaned'] = train_data['text'].apply(lambda x: preprocess_text(str(x)))

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
TF_X = vectorizer.fit_transform(train_data['text_cleaned'])

In [67]:
section_4_data = train_data.dropna(subset=['section_4'])
section_4_data = section_4_data[section_4_data['section_4'] != 'Important Information']
TF_X_4 = vectorizer.transform(section_4_data['text_cleaned'])

## Dumping vectorizer

In [22]:
dump(vectorizer, 'vectorizer.joblib')

['vectorizer.joblib']

In [68]:
from sklearn.preprocessing import OneHotEncoder

section_1_encoder = OneHotEncoder(handle_unknown='ignore')
section_2_encoder = OneHotEncoder(handle_unknown='ignore')
section_3_encoder = OneHotEncoder(handle_unknown='ignore')
section_4_encoder = OneHotEncoder(handle_unknown='ignore')

section_1_ohe = section_1_encoder.fit_transform(train_data['section_1'].values.reshape(-1, 1))
section_2_ohe = section_2_encoder.fit_transform(train_data['section_2'].values.reshape(-1, 1))
section_3_ohe = section_3_encoder.fit_transform(train_data['section_3'].values.reshape(-1, 1))
section_4_ohe = section_4_encoder.fit_transform(section_4_data['section_4'].values.reshape(-1, 1))

## Dumping encoders

In [69]:
dump(section_1_encoder, 'section_1_encoder.joblib')
dump(section_2_encoder, 'section_2_encoder.joblib')
dump(section_3_encoder, 'section_3_encoder.joblib')
dump(section_4_encoder, 'section_4_encoder.joblib')

['section_4_encoder.joblib']

In [70]:
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.multioutput import ClassifierChain
from sklearn.multiclass import OneVsRestClassifier

In [26]:
def predict_one_class(train_data, submit_data, y, model, n_splits=3, silent=False):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    to_check = pd.DataFrame(train_data.copy())
    to_check[f'predict_proba'] = 0
    pred = np.zeros(len(submit_data))
    y_arr = y
    for i, (train_index, valid_index) in enumerate(skf.split(train_data, y_arr)):        
        X_train = train_data[train_index]
        X_valid = train_data[valid_index]
        y_train = y_arr[train_index]
        y_valid = y_arr[valid_index] 
        
        model.fit(X_train, y_train)
        
        prediction_skf = model.predict_proba(submit_data)[:,1]
        pred += prediction_skf / skf.n_splits  
        
        prediction = model.predict_proba(X_valid)[:,1]
        to_check.loc[valid_index, f'predict_proba'] = prediction
                
    local_auc = roc_auc_score(y, to_check[f'predict_proba'])
    
    if not silent: 
        print(f"ROC_AUC_local: {local_auc}")
    
    return pred,to_check[f'predict_proba'],local_auc

In [27]:
import numpy as np

In [94]:
Y = section_1_ohe.toarray()
X_train, X_test, Y_train, Y_test = train_test_split(TF_X.toarray(), Y, test_size=0.2)

cv_scores = []
probas = []
cross_vals = []

columns = Y.shape[1]
for i in range(columns):
    print(section_1_encoder.categories_[0][i])
    y_train = Y_train[:, i]
    clf = LogisticRegression()
    proba, cross_val, cv_score = predict_one_class(X_train, X_test, y_train, clf, n_splits=3)  
    probas.append(proba)
    cross_vals.append(cross_val)
    cv_scores.append(cv_score)
print ()
print(f'TOTAL CV:{np.mean(cv_scores)}')

Account Management
ROC_AUC_local: 0.9289677334569808
Apps
ROC_AUC_local: 0.9534211693995682
Integrations
ROC_AUC_local: 0.9381296627540013
Monitoring Panel
ROC_AUC_local: 0.9408293472320739
Security
ROC_AUC_local: 0.9657055428547798
Tasks, Folders, Projects and Spaces
ROC_AUC_local: 0.9156215633882305
Types of Accounts and Licenses
ROC_AUC_local: 0.8967225110838859
Work Views
ROC_AUC_local: 0.9339772307851445

TOTAL CV:0.9341718451193332


In [95]:
import warnings
warnings.filterwarnings("ignore")

In [96]:
Y = section_2_ohe.toarray()

X_train, X_test, Y_train, Y_test = train_test_split(TF_X.toarray(), Y, test_size=0.2)

cv_scores = []
probas = []
cross_vals = []

columns = Y.shape[1]
for i in range(columns):
    print(section_2_encoder.categories_[0][i])
    y_train = Y_train[:, i]
    clf = LogisticRegression()
    try:
        proba, cross_val, cv_score = predict_one_class(X_train, X_test, y_train, clf, n_splits=3)
    except:
        pass
    probas.append(proba)
    cross_vals.append(cross_val)
    cv_scores.append(cv_score)
print ()
print(f'TOTAL CV:{np.mean(cv_scores)}')

Accounts
ROC_AUC_local: 0.8836409479048439
Add-ins and Extensions
ROC_AUC_local: 0.9539016038925934
Advanced
ROC_AUC_local: 0.9359112681521692
All the Other Views
ROC_AUC_local: 0.940168140626706
Automated Monitoring
ROC_AUC_local: 0.9858601866525797
Billing
ROC_AUC_local: 0.9452006659297771
Calendars
ROC_AUC_local: 0.9488030813407698
Communication
ROC_AUC_local: 0.9387662701463173
Desktop
ROC_AUC_local: 0.9415550615741677
Email
ROC_AUC_local: 0.9475111005948662
Everything Else
ROC_AUC_local: 0.9391929561872296
Folders and Projects
ROC_AUC_local: 0.9303118703997342
Gantt Chart
ROC_AUC_local: 0.9521430436363749
General Account Management
ROC_AUC_local: 0.9318799222559482
Import and Export
ROC_AUC_local: 0.9484758915878848
Licenses
ROC_AUC_local: 0.9120445960765201
Mobile
ROC_AUC_local: 0.9536533055096658
More Monitoring Panel Views
ROC_AUC_local: 0.9582557032910899
More Security Features
ROC_AUC_local: 0.9654067262164272
Organization
ROC_AUC_local: 0.9241027314425616
Personal User Licen

In [98]:
from tqdm import tqdm

In [None]:
Y = section_3_ohe.toarray()

X_train, X_test, Y_train, Y_test = train_test_split(TF_X.toarray(), Y, test_size=0.2)

cv_scores = []
probas = []
cross_vals = []

columns = Y.shape[1]
for i in tqdm(range(columns)):
    y_train = Y_train[:, i]
    clf = LogisticRegression()
    try:
        proba, cross_val, cv_score = predict_one_class(X_train, X_test, y_train, clf, n_splits=3, silent=True)  
    except ValueError: ## ошибка недостаточного кол-ва примеров
        pass
    probas.append(proba)
    cross_vals.append(cross_val)
    cv_scores.append(cv_score)
print ()
print(f'TOTAL CV:{np.mean(cv_scores)}')

In [None]:
Y = section_4_ohe.toarray()

X_train, X_test, Y_train, Y_test = train_test_split(TF_X_4.toarray(), Y, test_size=0.2)

cv_scores = []
probas = []
cross_vals = []

columns = Y.shape[1]
for i in tqdm(range(columns)):
    y_train = Y_train[:, i]
    clf = LogisticRegression()
    try:
        proba, cross_val, cv_score = predict_one_class(X_train, X_test, y_train, clf, n_splits=3, silent=True)  
    except ValueError: ## ошибка недостаточного кол-ва примеров
        pass
    probas.append(proba)
    cross_vals.append(cross_val)
    cv_scores.append(cv_score)
print ()
print(f'TOTAL CV:{np.mean(cv_scores)}')

In [28]:
Y = section_1_ohe.toarray()
base_lr = LogisticRegression()
section_1_ovr = OneVsRestClassifier(base_lr)
section_1_ovr.fit(TF_X.toarray(), Y)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [29]:
Y = section_2_ohe.toarray()
base_lr = LogisticRegression()
section_2_ovr = OneVsRestClassifier(base_lr)
section_2_ovr.fit(TF_X.toarray(), Y)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [30]:
Y = section_3_ohe.toarray()
base_lr = LogisticRegression()
section_3_ovr = OneVsRestClassifier(base_lr)
section_3_ovr.fit(TF_X.toarray(), Y)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [71]:
Y = section_4_ohe.toarray()
base_lr = LogisticRegression()
section_4_ovr = OneVsRestClassifier(base_lr)
section_4_ovr.fit(TF_X_4.toarray(), Y)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

## Dumping models

In [72]:
dump(section_1_ovr, 'section_1_ovr.joblib')
dump(section_2_ovr, 'section_2_ovr.joblib')
dump(section_3_ovr, 'section_3_ovr.joblib')
dump(section_4_ovr, 'section_4_ovr.joblib')

['section_4_ovr.joblib']

In [73]:
def predict_question_classes(text, vectorizer, encoders=[], models=[]):
    probable_classes = []
    clean_text = preprocess_text(text)
    X = [clean_text]
    vectorized = vectorizer.transform(X)
    for encoder, model in zip(encoders, models):
        prediction = model.predict_proba(vectorized)[0]
        categories = encoder.categories_[0]
        answer = sorted(zip(prediction, categories), reverse=True)[0]
        probable_classes.append(f'{answer[1]}: {answer[0]}')
    return probable_classes

In [76]:
predict_question_classes('how to export a excel', vectorizer=vectorizer, 
                         encoders=[section_1_encoder, section_2_encoder, section_3_encoder, section_4_encoder], 
                         models=[section_1_ovr, section_2_ovr, section_3_ovr, section_4_ovr])

['Integrations: 0.57606976533149',
 'Import and Export: 0.5814191057955604',
 'Import Data From Excel: 0.050189920236327706',
 'Export an Analytics Board: 0.015005520376626962']

In [77]:
def get_validate_results(df, vectorizer, encoders=[], models=[], sections=[]):
    data = df.reset_index(drop=True)
    data['clean_text'] = data['text'].apply(lambda x: preprocess_text(str(x)))
    X = vectorizer.transform(data['clean_text'])
    for i, row in enumerate(X):
        for encoder, model, section in zip(encoders, models, sections):
            prediction = model.predict_proba(row)[0]
            categories = encoder.categories_[0]
            answer = sorted(zip(prediction, categories), reverse=True)[0]
            data.at[i, section] = answer[1]
    return data

In [78]:
answer_df = get_validate_results(validate_df, vectorizer=vectorizer, 
                         encoders=[section_1_encoder, section_2_encoder, section_3_encoder, section_4_encoder], 
                         models=[section_1_ovr, section_2_ovr, section_3_ovr, section_4_ovr],
                         sections=['section_1_pred', 'section_2_pred', 'section_3_pred', 'section_4_pred'])

In [79]:
answer_df.head()

Unnamed: 0,text,section_1,section_2,section_3,section_4,clean_text,section_1_pred,section_2_pred,section_3_pred,section_4_pred
0,user profile colourhi little disappointed ever...,Account Management,General Account Management,User Groups,Edit a Group's Avatar,user profile colourhi little disappointed ever...,"Tasks, Folders, Projects and Spaces",General Account Management,Microsoft Teams,More Info
1,deferred task set become active specified date...,"Tasks, Folders, Projects and Spaces",Advanced,Custom Statuses and Workflows,,deferred task set become active specified date...,"Tasks, Folders, Projects and Spaces",Tasks,Custom Statuses and Workflows,Create a Task
2,timeline change colour barcolour code timeline...,"Tasks, Folders, Projects and Spaces",Communication,Permalink,,timeline change colour barcolour code timeline...,"Tasks, Folders, Projects and Spaces",Communication,Gantt Chart Overview,More Info
3,comment image uploads getting buried task subt...,"Tasks, Folders, Projects and Spaces",Proofing and Approvals,Proofing,,comment image uploads getting buried task subt...,"Tasks, Folders, Projects and Spaces",Communication,Comments,More Info
4,set task start day another tasktrying set temp...,Work Views,Gantt Chart,Dependencies on the Gantt Chart,,set task start day another tasktrying set temp...,"Tasks, Folders, Projects and Spaces",Tasks,Tasks,Create a Task


In [80]:
from sklearn.metrics import classification_report
from sklearn import preprocessing

In [93]:
for true, pred in zip(['section_1', 'section_2', 'section_3'],
                      ['section_1_pred', 'section_2_pred', 'section_3_pred']):
    le = preprocessing.LabelEncoder()
    le.fit(answer_df[true].tolist() + answer_df[pred].tolist())
    true_labels = le.transform(answer_df[true])
    pred_labels = le.transform(answer_df[pred])
    print(classification_report(true_labels, pred_labels, target_names=le.classes_))
    #le_dict = dict(zip(le.classes_, le.transform(le.classes_)))
    #print(classification_report(true_labels, answer_df[pred].apply(lambda x: le_dict.get(x, '<unknown_value>')), 
    #                            target_names=list(le.classes_+'<unknown_value>')))

                                     precision    recall  f1-score   support

                 Account Management       0.60      0.40      0.48        30
                               Apps       1.00      0.60      0.75         5
                       Integrations       0.71      0.56      0.63        18
                   Monitoring Panel       0.64      0.50      0.56        32
                           Security       0.50      1.00      0.67         2
Tasks, Folders, Projects and Spaces       0.57      0.85      0.68        89
     Types of Accounts and Licenses       0.00      0.00      0.00        16
                         Work Views       0.57      0.27      0.36        15

                           accuracy                           0.59       207
                          macro avg       0.57      0.52      0.52       207
                       weighted avg       0.56      0.59      0.55       207

                               precision    recall  f1-score   support

 

In [None]:
accuracy_score()