<h2>Conditional Random Fields (CRF) to extract features from Smartphones product offers</h2>

<h3>Data Preparation</h3>

In [3]:
from sklearn import model_selection, metrics
from joblib import dump
from sklearn_crfsuite import metrics
import sklearn_crfsuite
import scipy
import sklearn
import pandas as pd
import time

In [4]:
BIO_ENCODED_PRODUCT_TITLES_PATHFILE = "../_data/BIO_ENCODED_TITLES.csv"

def get_bio_encoded_titles(file_path):
    '''
    Returns the features for the product titles (i.e. the words) as a list of 
    lists of strings and the labels for these features as a list of lists of strings.
    '''
    features = []
    labels = []
    classes = []

    bio_titles_df = pd.read_csv(BIO_ENCODED_PRODUCT_TITLES_PATHFILE, encoding='iso-8859-1')
    classes = bio_titles_df["BIOTag"].unique()

    for titleNum in bio_titles_df["TitleNumber"].unique():
        title_features = bio_titles_df.loc[bio_titles_df["TitleNumber"] == titleNum, "Word"].tolist()
        title_labels = bio_titles_df.loc[bio_titles_df["TitleNumber"] == titleNum, "BIOTag"].tolist()

        features.append(title_features)
        labels.append(title_labels)

    return features, labels, classes

In [5]:
start_time = time.time()
bio_encoded_titles = get_bio_encoded_titles(BIO_ENCODED_PRODUCT_TITLES_PATHFILE)
elapsed_time = round(time.time() - start_time, 3)

print("BIO-encoded titles collected. Elapsed time (s): {}".format(elapsed_time))
print("Number of BIO-encoded titles collected: {}\n".format(len(bio_encoded_titles[0])))

BIO-encoded titles collected. Elapsed time (s): 401.141
Number of BIO-encoded titles collected: 87711



In [9]:
# Show an example of a BIO-encoded product title by BIOTagger
i = 3000
print("BIO-encoded title i = {} ".format(i))
print("Features = {}".format(str(bio_encoded_titles[0][i])))
print("Labels = {}".format(str(bio_encoded_titles[1][i])))

BIO-encoded title i = 3000 
Features = ['samsung', 'galaxy', 's7', 'edge', 'negro']
Labels = ['B-BRAND', 'B-MODEL', 'I-MODEL', 'I-MODEL', 'B-COLOR']


In [10]:
def word2features(product_title, i):
    '''
    A product title is received as a list of words (i.e. strings).
    '''
    word = product_title[i]

    features = {
        'word.lower()': word.lower(),
        'word.isdigit()': word.isdigit(),                                    # Digits only
        'word.isalpha()': word.isalpha(),                                    # Alphabetic characters only
        'word.isAlphaNumeric()': word.isalnum(),                             # Alphabetic or digits only
        'word.containsdigit()': any(char.isdigit() for char in word),
        #'word.containsNonAlphanumericChars()': not word.isalnum(),
        'word.posFromBeginning()': i,
        'word.posFromEnd()': len(product_title) - (i + 1),
    }
    
    # The word is not the beggining of a product title
    if i > 0:
        preceding_word = product_title[i-1]
        features.update({
            '-1:word.lower()': preceding_word.lower(),
            '-1:word.isalpha()': preceding_word.isalpha(),
            '-1:word.isdigit()': preceding_word.isdigit(),
            '-1:word.isAlphaNumeric()': preceding_word.isalnum(),
        })
        
    # The word is the beginning of a product title
    else:
        features['BOT'] = True

    # The word is not the end of a product title
    if i < len(product_title) - 1:
        subsequent_word = product_title[i+1]
        features.update({
            '+1:word.lower()': subsequent_word.lower(),
            '+1:word.isalpha()': subsequent_word.isalpha(),
            '+1:word.isdigit()': subsequent_word.isdigit(),
            '+1:word.isAlphaNumeric()': subsequent_word.isalnum(),
        })
        
    # The word is not the end of a product title
    else:
        features['EOT'] = True

    return features


def title2features(product_title):
    return [word2features(product_title, i) for i in range(len(product_title))]

In [11]:
features = [title2features(title) for title in bio_encoded_titles[0]]
labels = bio_encoded_titles[1]
classes = bio_encoded_titles[2]

X_train, X_test, y_train, y_test = model_selection.train_test_split(features, labels, test_size=0.30, random_state=0)

print("X_train length: {}".format(len(X_train)))
print("y_train length: {}".format(len(y_train)))
print("X_test length: {}".format(len(X_test)))
print("y_test length: {}".format(len(y_test)))

X_train length: 61397
y_train length: 61397
X_test length: 26314
y_test length: 26314


<h3>Training</h3>

In [12]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.15,
    max_iterations=200,
    all_possible_transitions=True
)

start_time = time.time()
crf.fit(X_train, y_train)
elapsed_time = round(time.time() - start_time, 3)

print("CRF model training finished. Elapsed time (s): {}".format(elapsed_time))

y_pred = crf.predict(X_test)

CRF model training finished. Elapsed time (s): 117.876


<h3>Evaluation</h3>

In [13]:
# Remove 'O'-tagged words, as most of the words will be 'O'-tagged
# and the results will look much better than what they actually are
labels = list(crf.classes_)
labels.remove('O')

print(metrics.flat_classification_report(y_test, y_pred, labels = labels))

              precision    recall  f1-score   support

     B-BRAND       1.00      1.00      1.00     26239
     B-MODEL       1.00      0.99      1.00     25905
     I-MODEL       1.00      1.00      1.00     25554
     B-COLOR       1.00      1.00      1.00     20889
       B-RAM       0.97      0.96      0.96      1980
       I-RAM       0.99      1.00      0.99      1133

   micro avg       1.00      1.00      1.00    101700
   macro avg       0.99      0.99      0.99    101700
weighted avg       1.00      1.00      1.00    101700



In [14]:
import sys

sys.path.append('../common')
import utils

# Predict some examples
sample_titles = [
    "Apple iPhone 4s 4G 64GB BLUE",
    "Huawei Mate 20 Lite",
    "Huawei P20",
    "Samsung Galaxy SII 64GB 4GB RAM",
    "ZTE blade V9 Smartphone (14,5cm (5,7 Zoll) Display, 32 GB interner Speicher, Android) Schwarz",
    "Xiaomi Redmi Note 5 64Gb Negro",
    "Huawei P Smart 2019 Aurora blau 6,21\" 64GB 3GB RAM Dual-SIM",
    "iPhone 7 32GB - Gold",
    "Apple iPhone 7 Plus 128 GB Silber",
    "Xiaomi Redmi Go - Smartphone (1 GB de RAM, 8 GB de ROM), Color Negro",
    "Smmartphone Xiaomi Redmi Go 5.0\" 1GB 8GB Dual SIM Azul",
    "HUAWEI P30 Pro 6GB + 128GB, Aurora",
    "Gigaset gs100 Smartphone lemon green",
    "Apple iPhone 4 8GB SIM-Free - Black",
    "apple iphone 4s plus Smartphone with some 4 GB RAM - blue",
]

for title in sample_titles:
    splitted_title = title.split()
    cleaned_title = utils.preprocess(splitted_title, with_rows_removal=False)
    title_featured = title2features(cleaned_title)
    # TODO preprocess title
    labels = crf.predict_single(title_featured)
    print(cleaned_title)
    print(labels)

['apple', 'iphone', '4s', '4g', '64gb', 'blue']
['B-BRAND', 'B-MODEL', 'I-MODEL', 'O', 'O', 'O']
['huawei', 'mate', '20', 'lite']
['B-BRAND', 'B-MODEL', 'I-MODEL', 'I-MODEL']
['huawei', 'p20']
['B-BRAND', 'B-MODEL']
['samsung', 'galaxy', 'sii', '64gb', '4gb', 'ram']
['B-BRAND', 'B-MODEL', 'I-MODEL', 'O', 'B-RAM', 'O']
['zte', 'blade', 'v9', 'smartphone', '145cm', '57', 'zoll', 'display', '32', 'gb', 'interner', 'speicher', 'android', 'schwarz']
['B-BRAND', 'B-MODEL', 'I-MODEL', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-COLOR']
['xiaomi', 'redmi', 'note', '5', '64gb', 'negro']
['B-BRAND', 'B-MODEL', 'I-MODEL', 'O', 'O', 'B-COLOR']
['huawei', 'p', 'smart', '2019', 'aurora', 'blau', '621"', '64gb', '3gb', 'ram', 'dualsim']
['B-BRAND', 'B-MODEL', 'I-MODEL', 'O', 'O', 'B-COLOR', 'O', 'O', 'B-RAM', 'O', 'O']
['iphone', '7', '32gb', '', 'gold']
['B-MODEL', 'I-MODEL', 'O', 'O', 'B-COLOR']
['apple', 'iphone', '7', 'plus', '128', 'gb', 'silber']
['B-BRAND', 'B-MODEL', 'I-MODEL', 'I-MO

In [15]:
CRF_MODEL_OUTPUT_FILE = "../_models/crf_smartphones.joblib"

# Dump CRF model to file
dump(crf, CRF_MODEL_OUTPUT_FILE)

['../_models/crf_smartphones.joblib']

<h2>Observations</h2>

In [None]:
What the model learnt. Comentar por encima...