<h2>Conditional Random Fields (CRF) model for Smartphones offer titles</h2>

In [9]:
import pandas as pd
import time

from sklearn import model_selection
from joblib import dump
from sklearn_crfsuite import metrics
import sklearn_crfsuite

In [10]:
BIO_ENCODED_PRODUCT_TITLES_PATHFILE = "../_data/bio_encoded_product_titles.csv"

def get_bio_encoded_titles(file_path):
    '''
    Returns the features for the product titles (i.e. the words) as a list of 
    lists of strings and the labels for these features as a list of lists of strings.
    '''
    features = []
    labels = []
    classes = []

    bio_titles_df = pd.read_csv(BIO_ENCODED_PRODUCT_TITLES_PATHFILE, encoding='iso-8859-1')
    classes = bio_titles_df["BIOTag"].unique()

    for titleNum in bio_titles_df["TitleNumber"].unique():
        title_features = bio_titles_df.loc[bio_titles_df["TitleNumber"] == titleNum, "Word"].tolist()
        title_labels = bio_titles_df.loc[bio_titles_df["TitleNumber"] == titleNum, "BIOTag"].tolist()

        features.append(title_features)
        labels.append(title_labels)

    return features, labels, classes

In [11]:
start_time = time.time()
bio_encoded_titles = get_bio_encoded_titles(BIO_ENCODED_PRODUCT_TITLES_PATHFILE)
elapsed_time = round(time.time() - start_time, 3)

print("BIO-encoded titles collected. Elapsed time (s): {}".format(elapsed_time))
print("Number of BIO-encoded titles collected: {}\n".format(len(bio_encoded_titles[0])))

BIO-encoded titles collected. Elapsed time (s): 174.471
Number of BIO-encoded titles collected: 57535



In [12]:
# Show and example of a BIO-encoded product title by BIOTagger
i = 3100
print("BIO-encoded title i = {} ".format(i))
print("Features = {}".format(str(bio_encoded_titles[0][i])))
print("Labels = {}".format(str(bio_encoded_titles[1][i])))

BIO-encoded title i = 3100 
Features = ['samsung', 'galaxy', 's7', 'edge', 'smartphone', '55', 'zoll', '139', 'cm', '32gb', 'interner', 'speicher']
Labels = ['B-BRAND', 'B-MODEL', 'I-MODEL', 'I-MODEL', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [13]:
features = bio_encoded_titles[0]
labels = bio_encoded_titles[1]
classes = bio_encoded_titles[2]

X_train, X_test, y_train, y_test = model_selection.train_test_split(features, labels, test_size=0.30, random_state=0)

In [14]:
crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.01,
        max_iterations=200,
        all_possible_transitions=True
    )

start_time = time.time()
crf.fit(X_train, y_train)
elapsed_time = round(time.time() - start_time, 3)

print("CRF model training finished. Elapsed time (s): {}".format(elapsed_time))

y_pred = crf.predict(X_test)

CRF model training finished. Elapsed time (s): 36.581


<h2>Evaluation</h2>

In [16]:
# Remove 'O'-tagged words, as most of the words will be 'O'-tagged
# and the results will look much better than what they actually are
labels = list(crf.classes_)
labels.remove('O')

print(metrics.flat_classification_report(y_test, y_pred, labels = labels))

              precision    recall  f1-score   support

     B-BRAND       0.98      0.95      0.96     17605
     B-MODEL       0.94      0.90      0.92     17261
     B-COLOR       0.83      0.73      0.78     14514
     I-MODEL       0.81      0.67      0.73      9249
       B-RAM       0.00      0.00      0.00       843
       I-RAM       0.00      0.00      0.00       520

   micro avg       0.91      0.82      0.86     59992
   macro avg       0.59      0.54      0.57     59992
weighted avg       0.88      0.82      0.85     59992



In [17]:
# Predict some examples
sample_titles = [
    "APPLE IPHONE XR 4G 64GB BLUE",
    "Huawei Mate 20 Lite",
    "Huawei P20",
    "Samsung Galaxy SII 64GB 4GB RAM",
    "ZTE Blade V9 Smartphone (14,5cm (5,7 Zoll) Display, 32 GB interner Speicher, Android) Schwarz"
]

for title in sample_titles:
    splitted_title = title.split()
    # TODO preprocess title
    labels = crf.predict([splitted_title])
    print(splitted_title)
    print(labels)
    print()

['APPLE', 'IPHONE', 'XR', '4G', '64GB', 'BLUE']
[['B-BRAND', 'B-MODEL', 'I-MODEL', 'O', 'O', 'O']]

['Huawei', 'Mate', '20', 'Lite']
[['B-BRAND', 'B-MODEL', 'B-COLOR', 'O']]

['Huawei', 'P20']
[['B-BRAND', 'B-COLOR']]

['Samsung', 'Galaxy', 'SII', '64GB', '4GB', 'RAM']
[['B-BRAND', 'B-MODEL', 'I-MODEL', 'O', 'O', 'O']]

['ZTE', 'Blade', 'V9', 'Smartphone', '(14,5cm', '(5,7', 'Zoll)', 'Display,', '32', 'GB', 'interner', 'Speicher,', 'Android)', 'Schwarz']
[['B-BRAND', 'B-MODEL', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-COLOR']]



In [26]:
CRF_MODEL_OUTPUT_FILE = "../_models/crf.joblib"

# Dump CRF model to file
dump(crf, CRF_MODEL_OUTPUT_FILE)

['../_models/crf.joblib']