<h2>Conditional Random Fields (CRF) model for Smartphones offer titles</h2>

In [5]:
import pandas as pd
import time

from sklearn import model_selection
from joblib import dump
from sklearn_crfsuite import metrics
import sklearn_crfsuite

In [9]:
BIO_ENCODED_PRODUCT_TITLES_PATHFILE = "../_data/bio_encoded_product_titles.csv"

def get_bio_encoded_titles(file_path):
    '''
    Returns the features for the product titles (i.e. the words) as a list of 
    lists of strings and the labels for these features as a list of lists of strings.
    '''
    features = []
    labels = []
    classes = []

    bio_titles_df = pd.read_csv(BIO_ENCODED_PRODUCT_TITLES_PATHFILE, encoding='iso-8859-1')
    classes = bio_titles_df["BIOTag"].unique()

    for titleNum in bio_titles_df["TitleNumber"].unique():
        title_features = bio_titles_df.loc[bio_titles_df["TitleNumber"] == titleNum, "Word"].tolist()
        title_labels = bio_titles_df.loc[bio_titles_df["TitleNumber"] == titleNum, "BIOTag"].tolist()

        features.append(title_features)
        labels.append(title_labels)

    return features, labels, classes

In [11]:
start_time = time.time()
bio_encoded_titles = get_bio_encoded_titles(BIO_ENCODED_PRODUCT_TITLES_PATHFILE)
elapsed_time = round(time.time() - start_time, 3)

print("BIO-encoded titles collected. Elapsed time (s): {}".format(elapsed_time))
print("Number of BIO-encoded titles collected: {}\n".format(len(bio_encoded_titles[0])))

BIO-encoded titles collected. Elapsed time (s): 150.218
Number of BIO-encoded titles collected: 50000



(['leifheit', 'kleidersack', 'lang', 'farbe', 'schwarz'],
 ['O', 'O', 'O', 'O', 'B-COLOR'],
 'O')

In [19]:
# Show and example of a BIO-encoded product title by BIOTagger
i = 3100
print("BIO-encoded title i = {} ".format(i))
print("Features = {}".format(str(bio_encoded_titles[0][i])))
print("Labels = {}".format(str(bio_encoded_titles[1][i])))

BIO-encoded title i = 3100 
Features = ['apple', 'iphone', '6s', 'smartphone', 'de', '47', '128', 'gb', 'plata']
Labels = ['B-BRAND', 'B-MODEL', 'I-MODEL', 'O', 'O', 'O', 'O', 'O', 'B-COLOR']


In [20]:
features = bio_encoded_titles[0]
labels = bio_encoded_titles[1]
classes = bio_encoded_titles[2]

X_train, X_test, y_train, y_test = model_selection.train_test_split(features, labels, test_size=0.30, random_state=0)

In [21]:
crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.01,
        max_iterations=200,
        all_possible_transitions=True
    )

start_time = time.time()
crf.fit(X_train, y_train)
elapsed_time = round(time.time() - start_time, 3)

print("CRF model training finished. Elapsed time (s): {}".format(elapsed_time))

y_pred = crf.predict(X_test)

print(metrics.flat_classification_report(y_test, y_pred, labels = classes))

CRF model training finished. Elapsed time (s): 34.17
              precision    recall  f1-score   support

           O       0.91      0.94      0.93    114432
     B-COLOR       0.84      0.70      0.77     11909
     B-BRAND       0.90      0.86      0.88     11291
     B-MODEL       0.82      0.78      0.80     10218
     I-MODEL       0.73      0.74      0.74      8161
       B-RAM       0.59      0.31      0.41      1365
       I-RAM       0.60      0.46      0.52       925

   micro avg       0.89      0.89      0.89    158301
   macro avg       0.77      0.69      0.72    158301
weighted avg       0.89      0.89      0.89    158301



In [27]:
# Predict some examples
sample_titles = [
    "APPLE IPHONE XR 4G 64GB BLUE",
    "Huawei Mate 20 Lite",
    "Huawei P20",
    "Samsung Galaxy SII 64GB 4GB RAM",
    "ZTE Blade V9 Smartphone (14,5cm (5,7 Zoll) Display, 32 GB interner Speicher, Android) Schwarz"
]

for title in sample_titles:
    splitted_title = title.split()
    # TODO preprocess title
    labels = crf.predict([splitted_title])
    print(splitted_title)
    print(labels)
    print()

['APPLE', 'IPHONE', 'XR', '4G', '64GB', 'BLUE']
[['B-BRAND', 'B-MODEL', 'I-MODEL', 'I-MODEL', 'I-MODEL', 'I-MODEL']]

['Huawei', 'Mate', '20', 'Lite']
[['B-BRAND', 'B-MODEL', 'I-MODEL', 'I-MODEL']]

['Huawei', 'P20']
[['B-BRAND', 'B-COLOR']]

['Samsung', 'Galaxy', 'SII', '64GB', '4GB', 'RAM']
[['B-BRAND', 'B-MODEL', 'I-MODEL', 'I-MODEL', 'I-MODEL', 'I-MODEL']]

['ZTE', 'Blade', 'V9', 'Smartphone', '(14,5cm', '(5,7', 'Zoll)', 'Display,', '32', 'GB', 'interner', 'Speicher,', 'Android)', 'Schwarz']
[['B-BRAND', 'B-MODEL', 'I-MODEL', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-COLOR']]



In [26]:
CRF_MODEL_OUTPUT_FILE = "../_models/crf.joblib"

# Dump CRF model to file
dump(crf, CRF_MODEL_OUTPUT_FILE)

['../_models/crf.joblib']