In [2]:
import pandas as pd
import numpy as np
import pickle
import os

# Pin pandas display limits: 50 characters per cell, up to 300 columns shown.
pd.set_option("display.max_colwidth", 50)
pd.set_option("display.max_columns", 300)


import string
import re

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Module-level resources read by spacy_tokenizer.
# English language object; calling it on a string yields the tokens.
parser = English()
# Same set object as spacy.lang.en.stop_words.STOP_WORDS (imported above).
stop_words = STOP_WORDS
# ASCII punctuation characters, kept as one string.
punctuations = string.punctuation
from sklearn.model_selection import train_test_split


import warnings
warnings.filterwarnings('ignore')

In [3]:
def check_path(path):
    """Make sure the directory ``path`` exists, creating missing parents.

    Uses ``os.makedirs`` instead of a shell command assembled by string
    concatenation, so paths containing spaces or shell metacharacters are
    handled correctly and nothing is passed to a shell.

    Parameters
    ----------
    path : str or os.PathLike
        Directory to create if it does not exist yet.

    Raises
    ------
    FileExistsError
        If ``path`` already exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

def clear_text(text):
    """Normalise a piece of text for downstream tokenisation.

    Steps: lowercase, turn newlines/carriage returns/tabs into spaces,
    replace every character that is not a word character, whitespace or a
    dot with a space, then trim and collapse runs of spaces.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The cleaned text; dots are preserved.
    """
    text = text.lower()
    # Raw strings keep the regex escapes valid (no invalid-escape warnings).
    text = re.sub(r"[\n\r\t]", " ", text)
    text = re.sub(r"[^\w\s.]", " ", text)
    text = re.sub(r" +", " ", text.strip())
    return text

def weighted_accuracy(true, predicted, weights=(1, 1, 2)):
    """Accuracy in which each sample counts with the weight of its true class.

    Parameters
    ----------
    true : iterable of int
        Ground-truth class labels; each label is used as an index into
        ``weights``.
    predicted : iterable of int
        Predicted class labels, paired with ``true`` position by position
        (pairing stops at the shorter of the two).
    weights : sequence of numbers, optional
        Weight per class label. Defaults to ``(1, 1, 2)``, the values that
        were previously hard-coded.

    Returns
    -------
    float
        Sum of weights of correctly predicted samples divided by the sum of
        weights of all samples; ``0.0`` when the total weight is zero
        (e.g. empty input) instead of raising ``ZeroDivisionError``.
    """
    matched_weight = 0
    total_weight = 0
    for t, p in zip(true, predicted):
        w = weights[t]
        total_weight += w
        if t == p:
            matched_weight += w
    if total_weight == 0:
        return 0.0
    return matched_weight / total_weight

def make_one_text(texts):
    """Clean every text with ``clear_text`` and glue them into one string.

    The cleaned pieces are separated by single spaces and the result is
    stripped of leading/trailing whitespace.
    """
    cleaned_parts = (clear_text(piece) for piece in texts)
    return " ".join(cleaned_parts).strip()

def get_embed_df(train, column_name):
    """Expand a column of vectors into one DataFrame column per component.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame holding the vectors.
    column_name : str
        Name of the column whose cells are iterables (one vector per row).

    Returns
    -------
    pandas.DataFrame
        One row per input row and columns named
        ``<column_name>_emb_<i>``; the width is taken from the first row.
        The result carries a fresh default index, not ``train``'s index.
        An empty input yields an empty DataFrame instead of raising
        ``IndexError``.
    """
    rows = [list(row) for row in train[column_name].values]
    if not rows:
        # Nothing to expand: there is no first row to read the width from.
        return pd.DataFrame()

    width = len(rows[0])
    emb_columns = [column_name + "_emb_" + str(i) for i in range(width)]
    return pd.DataFrame(rows, columns=emb_columns)

def spacy_tokenizer(sentence):
    """Tokenise ``sentence`` with the module-level ``parser`` and filter tokens.

    Returns the lowercased tokens that are longer than two characters, are
    not purely digits, are not in ``stop_words`` and do not occur inside the
    ``punctuations`` string.
    """
    kept = []
    for token in parser(sentence):
        word = token.lower_
        if len(word) <= 2 or word.isdigit():
            continue
        # `punctuations` is a str, so this is a substring test.
        if word in stop_words or word in punctuations:
            continue
        kept.append(word)
    return kept

def make_list_of_texts(texts):
    """Split raw texts into cleaned, tokenised sentences.

    The texts are joined with newlines, then split on newlines, ``.`` and
    ``!``. Each piece is cleaned with ``clear_text`` and tokenised with
    ``spacy_tokenizer``; pieces with more than three remaining tokens are
    kept and returned as space-joined strings.

    Parameters
    ----------
    texts : iterable of str, or str
        Raw text fragments. A single string is treated as one fragment;
        previously it was joined character by character, which silently
        produced an empty result.

    Returns
    -------
    list of str
        One space-joined token string per retained sentence.
    """
    if isinstance(texts, str):
        texts = [texts]
    joined = '\n'.join(texts)
    # Raw string: "." and "!" need no escaping inside a character class.
    joined = re.sub(r"[.!]", "\n", joined)
    tokenized = [spacy_tokenizer(clear_text(chunk)) for chunk in joined.split('\n')]
    return [' '.join(tokens) for tokens in tokenized if len(tokens) > 3]

def combine_sentences(list_of_sent):
    """Join sentences into one string, ending each with ``". "``.

    The result is stripped, so it ends with a dot and is empty for an empty
    input.
    """
    return "".join(sentence + ". " for sentence in list_of_sent).strip()
    
def replace_nan(text):
    """Remove every occurrence of the literal placeholder ``_nan_`` from ``text``."""
    placeholder = re.compile("_nan_")
    return placeholder.sub("", text)

In [10]:
# Read the train and test objects back from their pickle files.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('data/train_df.pkl', 'rb') as train_file, open('data/test_df.pkl', 'rb') as test_file:
    train = pickle.load(train_file)
    test = pickle.load(test_file)

In [11]:
# Add a `clear_text` column (list of cleaned sentences) to both frames.
for frame in (train, test):
    frame["clear_text"] = frame["text"].apply(make_list_of_texts)

In [12]:
# Peek at the first two rows, including the new clear_text column.
train.head(n=2)

Unnamed: 0,id,html,text,keywords,accepted_function,rejected_function,accepted_product,rejected_product,target,clear_text
0,b811ea46-1bc5-4a18-a64c-7504c382da44,"[b'<!DOCTYPE html>\n<html lang=""en"" class=""pre...",[Release\n2020\n2019\n2018\n2017\n2020\n2019\n...,"[tools, services, projects, incubates, investm...","accounting, bookeeping, auditing, finance, HR,...","manufacture production processing assembly, re...",,,2,"[net asset value nav, annual general meeting s..."
1,b5e3f6d4-30bc-443c-a9d6-8ed41d5c33b3,"[b'<!DOCTYPE html>\n<html lang=""en-US"">\n <he...",[SAP Integrated Business Planning (IBP)\nSAP A...,"[budgeting, decision, cycles, closing, Simplif...","IT consultancy, IT support services, IT manage...","distribution, sale, trade, wholesale, retail s...",,,2,"[sap integrated business planning ibp, sap adv..."


In [13]:
# Replace the raw `text` with the cleaned sentences joined into one string.
# The source is `clear_text`, so re-running these two lines gives the same result.
train["text"] = train.clear_text.apply(combine_sentences)
test["text"] = test.clear_text.apply(combine_sentences)

# Flatten each keyword list into a comma-separated string.
# Fixed: the cell referenced `train_clear` / `test_clear`, which are never
# defined in this notebook (NameError on a fresh kernel); the frames are
# `train` / `test`.
# NOTE: this overwrites `keywords` in place, so running the cell twice would
# join the characters of the already-flattened string.
train["keywords"] = train["keywords"].apply(lambda x: ", ".join(x))
test["keywords"] = test["keywords"].apply(lambda x: ", ".join(x))

In [14]:
# Keep only the text-like columns (plus `target` for train).
# Fixed: `test_clear` is never defined in this notebook; the frame is `test`.
# `.copy()` makes the new frames independent, so the column assignments in
# the following cell do not act on a slice of `train` / `test`.
new_train = train[["text", "accepted_function", "rejected_function", "accepted_product", "rejected_product", 'keywords', "target"]].copy()
new_test = test[["text", "accepted_function", "rejected_function", "accepted_product", 'keywords', "rejected_product"]].copy()

In [42]:
# Strip the "_nan_" placeholder from every text column.
# For train the last column is skipped (it is `target` in the selection above).
for frame, text_columns in ((new_train, new_train.columns[:-1]), (new_test, new_test.columns)):
    for col in text_columns:
        frame[col] = frame[col].apply(replace_nan)

In [29]:
# Checkpoint the text-only frames so later steps can start from here.
for frame, out_path in (
    (new_train, 'data/train_only_text.pickle'),
    (new_test, 'data/test_only_text.pickle'),
):
    with open(out_path, 'wb') as out_file:
        pickle.dump(frame, out_file)

# To resume from this checkpoint, read the two files back with pickle.load
# (the previously commented-out reload pointed at 'dumps/', not 'data/').

In [43]:
# Inspect the first four rows of the text-only training frame.
new_train.head(n=4)

Unnamed: 0,text,accepted_function,rejected_function,accepted_product,rejected_product,keywords,target
0,net asset value nav. annual general meeting sh...,"accounting, bookeeping, auditing, finance, HR,...","manufacture production processing assembly, re...",,,"tools, services, projects, incubates, investme...",2
1,sap integrated business planning ibp. sap adva...,"IT consultancy, IT support services, IT manage...","distribution, sale, trade, wholesale, retail s...",,,"budgeting, decision, cycles, closing, Simplifi...",2
2,ams media group fully accredited independent m...,"general accounting, auditing, book keeping act...","manufacture, production, processing, distribut...",,,,1
3,specialist independent business travel agent. ...,"insurance brokerage, insurance agents, travel ...","software development, wholesale, retail sale, ...",,,"provide, service, financial, arranging, travel...",2


## Create product dataset for training BERT

In [126]:
# Build (text, criteria, score) pairs from the product criteria.
# `DataFrame.append` was removed in pandas 2.0, so the two parts are built
# separately and combined once with `pd.concat` (original row labels kept).
fin_columns = ["text", "criteria", "score"]

# Accepted criteria: score is 1 only when the row's target equals 2.
accepted = new_train[["text", "accepted_product", "target"]].copy()
accepted.columns = fin_columns
accepted["score"] = (accepted["score"] == 2).astype("int")
accepted = accepted[(accepted["criteria"].str.len() > 0) & (accepted["text"].str.len() > 0)]

# Rejected criteria: score is 1 when the row's target is below 2.
rejected = new_train[["text", "rejected_product", "target"]].copy()
rejected.columns = fin_columns
rejected["score"] = (rejected["score"] < 2).astype("int")
rejected = rejected[(rejected["criteria"].str.len() > 0) & (rejected["text"].str.len() > 0)]

final_df = pd.concat([accepted, rejected])
final_df.to_csv("data/product_df.csv", index=False)

In [98]:
# Show the first two rows of the dataset currently held in final_df.
final_df.head(n=2)

Unnamed: 0,text,criteria,score
4,amari ireland largest independent metal stockh...,"product, motor vehicles, buses and trucks, mot...",0
5,davies chemists established john glynne davies...,"product, pharmaceutical products medicines dru...",1


## Create function dataset for training BERT

In [99]:
# Build (text, criteria, score) pairs from the function criteria.
# `DataFrame.append` was removed in pandas 2.0, so the two parts are built
# separately and combined once with `pd.concat` (original row labels kept).
fin_columns = ["text", "criteria", "score"]

# Accepted criteria: score is 1 only when the row's target equals 2.
accepted = new_train[["text", "accepted_function", "target"]].copy()
accepted.columns = fin_columns
accepted["score"] = (accepted["score"] == 2).astype("int")
accepted = accepted[(accepted["criteria"].str.len() > 0) & (accepted["text"].str.len() > 0)]

# Rejected criteria: score is 1 when the row's target is below 2.
rejected = new_train[["text", "rejected_function", "target"]].copy()
rejected.columns = fin_columns
rejected["score"] = (rejected["score"] < 2).astype("int")
rejected = rejected[(rejected["criteria"].str.len() > 0) & (rejected["text"].str.len() > 0)]

final_df = pd.concat([accepted, rejected])
final_df.to_csv("data/function_df.csv", index=False)

## Build the same product and function datasets using keywords instead of text

In [94]:
# Build (text, criteria, score) pairs from keywords and the product criteria;
# the `keywords` column takes the place of `text`.
# `DataFrame.append` was removed in pandas 2.0, so the two parts are built
# separately and combined once with `pd.concat` (original row labels kept).
fin_columns = ["text", "criteria", "score"]

# Accepted criteria: score is 1 only when the row's target equals 2.
accepted = new_train[["keywords", "accepted_product", "target"]].copy()
accepted.columns = fin_columns
accepted["score"] = (accepted["score"] == 2).astype("int")
accepted = accepted[(accepted["criteria"].str.len() > 0) & (accepted["text"].str.len() > 0)]

# Rejected criteria: score is 1 when the row's target is below 2.
rejected = new_train[["keywords", "rejected_product", "target"]].copy()
rejected.columns = fin_columns
rejected["score"] = (rejected["score"] < 2).astype("int")
rejected = rejected[(rejected["criteria"].str.len() > 0) & (rejected["text"].str.len() > 0)]

final_df = pd.concat([accepted, rejected])
final_df.to_csv("data/key_product_df.csv", index=False)

In [144]:
# Build (text, criteria, score) pairs from keywords and the function criteria;
# the `keywords` column takes the place of `text`.
# `DataFrame.append` was removed in pandas 2.0, so the two parts are built
# separately and combined once with `pd.concat` (original row labels kept).
fin_columns = ["text", "criteria", "score"]

# Accepted criteria: score is 1 only when the row's target equals 2.
accepted = new_train[["keywords", "accepted_function", "target"]].copy()
accepted.columns = fin_columns
accepted["score"] = (accepted["score"] == 2).astype("int")
accepted = accepted[(accepted["criteria"].str.len() > 0) & (accepted["text"].str.len() > 0)]

# Rejected criteria: score is 1 when the row's target is below 2.
rejected = new_train[["keywords", "rejected_function", "target"]].copy()
rejected.columns = fin_columns
rejected["score"] = (rejected["score"] < 2).astype("int")
rejected = rejected[(rejected["criteria"].str.len() > 0) & (rejected["text"].str.len() > 0)]

final_df = pd.concat([accepted, rejected])
final_df.to_csv("data/key_function_df.csv", index=False)

In [96]:
# Read the product dataset back from disk and show its first two rows.
product_df = pd.read_csv("data/product_df.csv")
product_df.head(n=2)

Unnamed: 0,text,criteria,score
0,amari ireland largest independent metal stockh...,"product, motor vehicles, buses and trucks, mot...",0
1,davies chemists established john glynne davies...,"product, pharmaceutical products medicines dru...",1


In [88]:
# Read the function dataset back from disk and show its first two rows.
function_df = pd.read_csv("data/function_df.csv")
function_df.head(n=2)

Unnamed: 0,text,criteria,score
0,net asset value nav. annual general meeting sh...,"accounting, bookeeping, auditing, finance, HR,...",1
1,sap integrated business planning ibp. sap adva...,"IT consultancy, IT support services, IT manage...",1
