# Selecting best book title from multiple candidates

In [183]:
import pandas as pd
import sklearn

## Sample data

In [184]:
titles = pd.read_csv('./sample_multivendor_responses.csv').drop_duplicates()
titles.head()

Unnamed: 0,vendor,isbn,class,title
0,nielsen,9780194420983,2,Practical English Usage: Paperback
1,worldcat,9780194420983,0,"Practical English usage : [easier, faster refe..."
2,open-library,9780194420983,1,Practical English Usage
3,outpan,9780194420983,1,Practical English Usage
4,nielsen,9781589016811,2,"Just War: Authority, Tradition, and Practice"


## Manually classified data

In [185]:
training_data = titles[titles['class'].isin([0, 1, 2])]
training_data = training_data[['class', 'title']].drop_duplicates()
training_data.head()

Unnamed: 0,class,title
0,2,Practical English Usage: Paperback
1,0,"Practical English usage : [easier, faster refe..."
2,1,Practical English Usage
4,2,"Just War: Authority, Tradition, and Practice"
5,0,"Just war authority, tradition, and practice"


## Unclassified data for testing

In [186]:
test_data = titles[titles['class'].isnull()][['isbn', 'title']].drop_duplicates()
test_data.head()

Unnamed: 0,isbn,title
310,9780273750758,Research Methods for Business Students
311,9780273750758,Research methods for business students
314,9781447143994,Quad Rotorcraft Control
315,9781447143994,Quad rotorcraft control vision-based hovering ...
316,9780754631323,Pop Music: Technology and Creativity - Trevor ...


## Feature extraction

In [187]:
import numpy as np
from collections import Counter
from string import punctuation, whitespace
import re
from functools import reduce

def feature_extract(title):
    features = []

    # Title length
    features.append(len(title))

    # Does it start with uppercase?
    features.append(int(title[0:1].isupper()))

    # How many punctuation characters?
    features.append(
        sum([v for k, v in Counter(title).items() if k in punctuation])
    )

    # How many less common punctuation characters?
    features.append(
        sum([v for k, v in Counter(title).items() if (k in punctuation) and (k not in '.,:\'')])
    )

    # Does it end with punctuation?
    features.append(
        int(len(title) and title[-1] in punctuation)
    )

    # How many whitespace characters?
    features.append(
        sum([v for k, v in Counter(title).items() if k in whitespace])
    )

    # How many whitespace separated words?
    features.append(len(title.split()))

    # What's an average length of uppercase words?
    upper_lengths = list(map(len, filter(lambda t: t[:1].isalpha() and t[:1].isupper(), title.split())))
    if len(upper_lengths):
        features.append(np.average(upper_lengths))
    else:
        features.append(0.0)

    # What's an average length of lowercase words?
    lower_lengths = list(map(len, filter(lambda t: t[:1].isalpha() and t[:1].islower(), title.split())))
    if len(lower_lengths):
        features.append(np.average(lower_lengths))
    else:
        features.append(0.0)

    # Does it split into title / subtitle?
    title_subtitle = list(filter(lambda e: e, title.split(': ')))

    features.append(
        len(title_subtitle)
    )

    # Assuming it splits into title/subtitle, is subtitle starts with uppercase?
    features.append(
        len(title_subtitle) > 1 and title_subtitle[1][0:1].isupper()
    )

    # Is title/subtitel split by " : "?
    features.append(
        int(" : " in title)
    )

    # Does it have any kind of text in parenthesis?
    features.append(
        len(re.findall("\[[^\]]*\]|\([^)]*\)|<[^>]*>|\{[^\}]*\}", title))
    )

    # Uppercase letter 'share'
    uppercase = reduce(lambda l, e: l + 1, filter(lambda l: l.isalpha() and l.isupper(), title), 0)
    if uppercase > 0:
        features.append(float(len(title)) / float(uppercase))
    else:
        features.append(0.0)

    # Lowercase letter 'share'
    lowercase = reduce(lambda l, e: l+1, filter(lambda l: l.isalpha() and l.islower(), title), 0)
    if lowercase > 0:
        features.append(float(len(title)) / float(lowercase))
    else:
        features.append(0.0)

    return features


## Training

In [188]:
from sklearn.ensemble import RandomForestClassifier

clf = sklearn.ensemble.RandomForestClassifier(min_samples_split=4)

In [189]:
classes = [int(c) for (c, title) in training_data.as_matrix()]
features = [feature_extract(title) for (c, title) in training_data.as_matrix()]
clf.fit(features, classes)

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=4, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)

## Example predictions

In [190]:
clf.predict_proba(feature_extract("Something Something Title"))

array([[ 0.,  1.,  0.]])

In [191]:
clf.predict_proba(feature_extract("Something: A Guide to Nothing"))

array([[ 0.03333333,  0.08333333,  0.88333333]])

In [192]:
clf.predict_proba(feature_extract("probably not good at all"))

array([[ 0.6,  0.4,  0. ]])

In [193]:
clf.predict_proba(feature_extract("Too Much Information, 2005, Publisher, New York"))

array([[ 0.56666667,  0.36666667,  0.06666667]])

In [194]:
clf.predict_proba(feature_extract(""))

array([[ 0.,  1.,  0.]])

## Picking best from multiple candidates

In [195]:
def pick_one(titles):
    probs = [(title, clf.predict_proba(feature_extract(title))[0]) for title in titles]
    #probs = sorted(probs, key=lambda p: (-max(p[1][1], p[1][2]), p[1][0]))
    probs = sorted(
        probs,
        key=lambda p: (-max(p[1][1], p[1][2]), -min(p[1][1], p[1][2]), p[1][0])
    )
    return probs[0][0]

In [196]:
pick_one(["Something Something Title", "Something: A Guide to Nothing", "probably not good at all"])

'Something Something Title'

## Running on our test set

Group by ISBN and select ones that have a choice of at least 2 options

In [197]:
from itertools import groupby

grouped = groupby(
    [(r[1], r[2]) for r in test_data.to_records()],
    lambda v: v[0]
)

grouped = dict([(k,[t[1] for t in list(v)]) for (k,v) in grouped])
grouped = dict([(k,v) for (k,v) in grouped.items() if len(v) > 1])

In [198]:
for (isbn, titles) in grouped.items():
    print("\n\nCandidates:")
    print("\n".join(titles))
    print("Winner:")
    print(pick_one(titles))
    



Candidates:
Cases and Materials on Constitutional and Administrative Law
Cases and materials on constitutional and administrative law
Winner:
Cases and Materials on Constitutional and Administrative Law


Candidates:
Assessment for learning and teaching in primary schools
Assessment for Learning and Teaching in Primary Schools
Winner:
Assessment for Learning and Teaching in Primary Schools


Candidates:
Echoes of History, Shadowed Identities: Rewriting Alterity in J. M. Coetzee's Foe and Marina Warner's Indigo
Echoes of History, Shadowed Identities: Rewriting Alterity in J. M. Coetzee's Foe and Maria Warner's Indigo
Echoes of history, shadowed identities
Winner:
Echoes of History, Shadowed Identities: Rewriting Alterity in J. M. Coetzee's Foe and Marina Warner's Indigo


Candidates:
Housework and Housewives in American Advertising: Married to the Mop
Housework and Housewives in Modern American Advertising
Housework and housewives in American advertising
Winner:
Housework and Housewiv