In [1]:
import os
import re

import pandas as pd
import fastText
from fastText import train_supervised

In [2]:
def simplify(string):
    """Return a simplified copy of ``string`` for text classification.

    Steps, in order:
      1. lowercase the text;
      2. turn non-breaking spaces and newlines into plain spaces;
      3. replace every run of non-word characters with a single space
         (``\\W`` keeps letters, digits and the underscore);
      4. replace every run of digits with a single ``0``;
      5. collapse repeated spaces.

    Leading/trailing whitespace is reduced to a single space, not stripped.

    Args:
        string: The text to simplify.

    Returns:
        The simplified string.
    """
    string = string.lower()
    # Raw-string patterns: "\W" / "\d" in a normal string literal are invalid
    # escape sequences and trigger warnings on current Python versions.
    string = re.sub(r"[\xa0\n]", " ", string)  # non-breaking space and newline -> space
    string = re.sub(r"\W+", " ", string)  # runs of non-word characters -> one space
    string = re.sub(r"\d+", "0", string)  # each run of digits -> a single "0"
    string = re.sub(r" +", " ", string)  # collapse multiple spaces
    return string


def create_fasttext_data(path, title_file, body_file, category_file):
    """Read the title/body/category CSV files and build fastText input lines.

    Each CSV is read without a header row; its first column is used as the
    ``id`` index and its second column as the value (title, body or category).

    Args:
        path: Directory prefix. It is concatenated directly with the file
            names, so it must already end with a path separator.
        title_file: File name of the CSV holding ``id, title``.
        body_file: File name of the CSV holding ``id, body``.
        category_file: File name of the CSV holding ``id, category``.

    Returns:
        A pandas Series indexed by ``id`` whose values have the form
        ``"__label__<category> <title> <body>"``. Only ids present in all
        three files are kept (``pd.merge`` defaults to an inner join).
    """
    title = pd.read_csv(path + title_file, header=None, names=['id', 'title'], index_col=0)
    body = pd.read_csv(path + body_file, header=None, names=['id', 'body'], index_col=0)
    category = pd.read_csv(path + category_file, header=None, names=['id', 'category'], index_col=0)

    # Create fastText label format for the whole column at once instead of
    # assigning one cell at a time.
    category['label'] = '__label__' + category['category'].astype(str)

    # Merge dataframes on the shared 'id' index.
    text = pd.merge(title, body, on='id')
    data = pd.merge(category, text, on='id')

    # One line per sample: label, then title, then body, space separated.
    export = data['label'] + ' ' + data['title'] + ' ' + data['body']

    return export

In [3]:
# Read data from csv and create merged pandas dataframes

# Project root = current working directory (the same value `%pwd` returns),
# e.g. "C:\\Git_repo\\ai-for-it-support-management\\aicore\\aicore"
path = os.getcwd()

# create_fasttext_data() concatenates directory + file name, so each dataset
# directory must end with a separator; the trailing "" makes os.path.join add
# one, using the separator of the current platform.
train_path = os.path.join(path, "datasets", "train", "")
validation_path = os.path.join(path, "datasets", "dev", "")
test_path = os.path.join(path, "datasets", "test", "")

train_files = {
    'title_file': 'train_data_title_endava.csv',
    'body_file': 'train_data_body_endava.csv',
    'category_file': 'train_data_category_endava.csv',
}

validation_files = {
    'title_file': 'dev_data_title_endava.csv',
    'body_file': 'dev_data_body_endava.csv',
    'category_file': 'dev_data_category_endava.csv',
}

test_files = {
    'title_file': 'test_data_title_endava.csv',
    'body_file': 'test_data_body_endava.csv',
    'category_file': 'test_data_category_endava.csv',
}

train_data = create_fasttext_data(train_path, **train_files)
validation_data = create_fasttext_data(validation_path, **validation_files)
test_data = create_fasttext_data(test_path, **test_files)

train_data

id
36838    __label__4 oracle issue issue hi guys tasks li...
18212    __label__4 access wednesday hi please order ta...
38970    __label__4 unable to approve after being added...
30489    __label__4 am interns place assignment thursda...
27278    __label__5 to adapter november adapter hi adap...
40942    __label__4 approval flow issue sent request fo...
29699    __label__4 new starter wednesday pm date hello...
39588    __label__4 not generating service now hello co...
7653     __label__4 wants to access status report wants...
42550    __label__4 error sent wednesday re error check...
44856    __label__4 can change password on sent monday ...
40510    __label__6 urgent server is down sent friday j...
27850    __label__4 add user friday october pm add user...
1090     __label__4 list of servers hello these listed ...
23663    __label__4 mick small leaver should not have f...
34563    __label__4 access to july please member gain t...
22608    __label__4 wants to access sales wednesday p

In [4]:
# Export to csv
train_file = "fasttext_train.csv"
validation_file = "fasttext_validation.csv"
test_file = "fasttext_test.csv"

# The exported files are fed straight to fastText (one sample per line), so
# they must contain only the sample lines. header=False is passed explicitly
# because current pandas versions write a header row for a Series by default,
# which would end up as a bogus first line in each file.
train_data.to_csv(train_path + train_file, index=False, header=False)
validation_data.to_csv(validation_path + validation_file, index=False, header=False)
test_data.to_csv(test_path + test_file, index=False, header=False)

In [5]:
# Train the supervised fastText classifier on the exported training file.
# Hyperparameter reference: https://fasttext.cc/docs/en/options.html
classifier = train_supervised(
    input=train_path + train_file,
    epoch=50,
    lr=0.5,
    wordNgrams=5,
    dim=100,
)

In [6]:
# Validate
# classifier.test() yields (number of samples, precision, recall), as labelled
# in the print below.
result = classifier.test(validation_path + validation_file)
n_samples, precision, recall = result

# F1 is the harmonic mean of precision and recall. When both are exactly 0 the
# formula would divide by zero, so F1 is reported as 0.0 in that case.
if precision + recall == 0:
    F1 = 0.0
else:
    F1 = 2 * precision * recall / (precision + recall)
print("Test samples \t Precision \t Recall \t F1 \n", n_samples, precision, recall, F1)

Test samples 	 Precision 	 Recall 	 F1 
 4779 0.8704749947687801 0.8704749947687801 0.8704749947687801


In [7]:
# Save or load model
# `path` does not end with a separator (the dataset directories are appended
# to it with a leading one), so plain concatenation would write
# "<dir>fasttext_model.bin" next to the working directory instead of inside it.
model_path = os.path.join(path, "fasttext_model.bin")
classifier.save_model(model_path)
# classifier = fastText.load_model(model_path)

In [8]:
# Test model
# classifier.test() yields (number of samples, precision, recall), as labelled
# in the print below.
result = classifier.test(test_path + test_file)
n_samples, precision, recall = result

# F1 is the harmonic mean of precision and recall. When both are exactly 0 the
# formula would divide by zero, so F1 is reported as 0.0 in that case.
if precision + recall == 0:
    F1 = 0.0
else:
    F1 = 2 * precision * recall / (precision + recall)
print("Test samples \t Precision \t Recall \t F1 \n", n_samples, precision, recall, F1)

Test samples 	 Precision 	 Recall 	 F1 
 4781 0.8736665969462456 0.8736665969462456 0.8736665969462456
