<h2>Conditional Random Fields (CRF) model for Smartphones offer titles</h2>

In [1]:
import pandas as pd
import time

from sklearn import model_selection, metrics
from joblib import dump
from sklearn_crfsuite import metrics
import sklearn_crfsuite
import scipy
import sklearn

In [2]:
BIO_ENCODED_PRODUCT_TITLES_PATHFILE = "../_data/bio_encoded_product_titles.csv"

def get_bio_encoded_titles(file_path):
    '''
    Returns the features for the product titles (i.e. the words) as a list of 
    lists of strings and the labels for these features as a list of lists of strings.
    '''
    features = []
    labels = []
    classes = []

    bio_titles_df = pd.read_csv(BIO_ENCODED_PRODUCT_TITLES_PATHFILE, encoding='iso-8859-1')
    classes = bio_titles_df["BIOTag"].unique()

    for titleNum in bio_titles_df["TitleNumber"].unique():
        title_features = bio_titles_df.loc[bio_titles_df["TitleNumber"] == titleNum, "Word"].tolist()
        title_labels = bio_titles_df.loc[bio_titles_df["TitleNumber"] == titleNum, "BIOTag"].tolist()

        features.append(title_features)
        labels.append(title_labels)

    return features, labels, classes

In [3]:
start_time = time.time()
bio_encoded_titles = get_bio_encoded_titles(BIO_ENCODED_PRODUCT_TITLES_PATHFILE)
elapsed_time = round(time.time() - start_time, 3)

print("BIO-encoded titles collected. Elapsed time (s): {}".format(elapsed_time))
print("Number of BIO-encoded titles collected: {}\n".format(len(bio_encoded_titles[0])))

BIO-encoded titles collected. Elapsed time (s): 196.054
Number of BIO-encoded titles collected: 57535



In [4]:
# Show and example of a BIO-encoded product title by BIOTagger
i = 3100
print("BIO-encoded title i = {} ".format(i))
print("Features = {}".format(str(bio_encoded_titles[0][i])))
print("Labels = {}".format(str(bio_encoded_titles[1][i])))

BIO-encoded title i = 3100 
Features = ['samsung', 'galaxy', 's7', 'edge', 'smartphone', '55', 'zoll', '139', 'cm', '32gb', 'interner', 'speicher']
Labels = ['B-BRAND', 'B-MODEL', 'I-MODEL', 'I-MODEL', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [46]:
def word2features(product_title, i):
    '''
    A product title is received as a list of words (i.e. strings).
    '''
    word = product_title[i]

    features = {
        'word.lower()': word.lower(),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.isalpha()': word.isalpha(),
        'word.containsdigit()': any(char.isdigit() for char in word),
        'word.containsNonAlphanumericChars()': not word.isalnum(),
    }
    
    # The word is not the beggining of a product title
    if i > 0:
        preceding_word = product_title[i-1]
        features.update({
            '-1:word.lower()': preceding_word.lower(),
            '-1:word.istitle()': preceding_word.istitle(),
            '-1:word.isupper()': preceding_word.isupper(),
        })
        
    # The word is the beginning of a product title
    else:
        features['BOT'] = True

    # The word is not the end of a product title
    if i < len(product_title) - 1:
        subsequent_word = product_title[i+1]
        features.update({
            '+1:word.lower()': subsequent_word.lower(),
            '+1:word.istitle()': subsequent_word.istitle(),
            '+1:word.isupper()': subsequent_word.isupper(),
        })
        
    # The word is not the end of a product title
    else:
        features['EOT'] = True

    return features


def title2features(product_title):
    return [word2features(product_title, i) for i in range(len(product_title))]

In [47]:
features = [title2features(title) for title in bio_encoded_titles[0]]
labels = bio_encoded_titles[1]
classes = bio_encoded_titles[2]

X_train, X_test, y_train, y_test = model_selection.train_test_split(features, labels, test_size=0.30, random_state=0)

In [48]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.15,
    max_iterations=200,
    all_possible_transitions=True
)

start_time = time.time()
crf.fit(X_train, y_train)
elapsed_time = round(time.time() - start_time, 3)

print("CRF model training finished. Elapsed time (s): {}".format(elapsed_time))

y_pred = crf.predict(X_test)

CRF model training finished. Elapsed time (s): 65.242


<h2>Evaluation</h2>

In [49]:
# Remove 'O'-tagged words, as most of the words will be 'O'-tagged
# and the results will look much better than what they actually are
labels = list(crf.classes_)
labels.remove('O')

print(metrics.flat_classification_report(y_test, y_pred, labels = labels))

              precision    recall  f1-score   support

     B-BRAND       1.00      1.00      1.00     17605
     B-MODEL       1.00      1.00      1.00     17261
     B-COLOR       1.00      1.00      1.00     14514
     I-MODEL       1.00      1.00      1.00      9249
       B-RAM       0.97      0.98      0.98       843
       I-RAM       0.96      0.98      0.97       520

   micro avg       1.00      1.00      1.00     59992
   macro avg       0.99      0.99      0.99     59992
weighted avg       1.00      1.00      1.00     59992



In [50]:
# Predict some examples
sample_titles = [
    "Apple iPhone 4s 4G 64GB BLUE",
    "Huawei Mate 20 Lite",
    "Huawei P20",
    "Samsung Galaxy SII 64GB 4GB RAM",
    "ZTE Blade V9 Smartphone (14,5cm (5,7 Zoll) Display, 32 GB interner Speicher, Android) Schwarz",
    "Xiaomi Redmi Note 5 64Gb Negro",
    "Huawei P Smart 2019 Aurora blau 6,21\" 64GB 3GB RAM Dual-SIM",
    "iPhone 7 32GB - Gold",
    "Apple iPhone 7 Plus 128 GB Silber",
    "Xiaomi Redmi Go - Smartphone (1 GB de RAM, 8 GB de ROM), Color Negro",
    "Smmartphone Xiaomi Redmi Go 5.0\" 1GB 8GB Dual SIM Azul",
    "HUAWEI P30 Pro 6GB + 128GB, Aurora",
    "Gigaset GS100 Smartphone lemon green",
]

for title in sample_titles:
    splitted_title = title.split()
    title_featured = title2features(splitted_title)
    # TODO preprocess title
    labels = crf.predict_single(title_featured)
    #prob = crf.predict_marginals_single(title_featured)
    print(splitted_title)
    print(labels)
    #print(prob)

['Apple', 'iPhone', '4s', '4G', '64GB', 'BLUE']
['B-BRAND', 'B-MODEL', 'O', 'O', 'O', 'O']
['Huawei', 'Mate', '20', 'Lite']
['B-BRAND', 'B-MODEL', 'I-MODEL', 'I-MODEL']
['Huawei', 'P20']
['B-BRAND', 'B-MODEL']
['Samsung', 'Galaxy', 'SII', '64GB', '4GB', 'RAM']
['B-BRAND', 'B-MODEL', 'I-MODEL', 'O', 'B-RAM', 'O']
['ZTE', 'Blade', 'V9', 'Smartphone', '(14,5cm', '(5,7', 'Zoll)', 'Display,', '32', 'GB', 'interner', 'Speicher,', 'Android)', 'Schwarz']
['B-BRAND', 'O', 'B-MODEL', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-COLOR']
['Xiaomi', 'Redmi', 'Note', '5', '64Gb', 'Negro']
['B-BRAND', 'B-MODEL', 'I-MODEL', 'O', 'O', 'B-COLOR']
['Huawei', 'P', 'Smart', '2019', 'Aurora', 'blau', '6,21"', '64GB', '3GB', 'RAM', 'Dual-SIM']
['B-BRAND', 'B-MODEL', 'O', 'O', 'O', 'B-COLOR', 'O', 'O', 'B-RAM', 'O', 'O']
['iPhone', '7', '32GB', '-', 'Gold']
['B-MODEL', 'I-MODEL', 'O', 'O', 'B-COLOR']
['Apple', 'iPhone', '7', 'Plus', '128', 'GB', 'Silber']
['B-BRAND', 'B-MODEL', 'I-MODEL', 'I-MODEL', '

In [51]:
CRF_MODEL_OUTPUT_FILE = "../_models/crf.joblib"

# Dump CRF model to file
dump(crf, CRF_MODEL_OUTPUT_FILE)

['../_models/crf.joblib']

<h2>Observations</h2>