In [331]:
import numpy as np
import pandas as pd
import re
from collections import OrderedDict
from collections import Counter
from itertools import combinations
from math import log
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# DATA

In [332]:
product_descriptions = pd.read_csv('DATA/product_descriptions.csv', header=0, encoding = "ISO-8859-1")
attributes = pd.read_csv('DATA/attributes.csv', header=0, encoding = "ISO-8859-1")
train = pd.read_csv('DATA/train.csv', header=0,  encoding = "ISO-8859-1")
test = pd.read_csv('DATA/test.csv', header=0,  encoding = "ISO-8859-1")

# EVALUATOR SCORES (NORMALIZED)
* Evaluator scores are are continuous averages on the interval [1, 3]; the classifier seems unable to handles this
* relevance_rounded consists of values: 1, 2, 3

In [333]:
# Since relevance is an aggregate, values are continuous rather than discrete
# It appears the classifier can only handle a limited number of values
# So we round up
train['relevance_rounded'] = train['relevance'].map(lambda x: round(x))

* Note imbalance of 1's, 2's, and 3's

In [334]:
Counter(train['relevance_rounded'])

Counter({1.0: 5115, 2.0: 34614, 3.0: 34338})

* relevance_rounded_quarters consists of values 1, 1.25, 1.5...3

In [335]:
def round_custom(x):
    '''Rounds relevance to quarters -- for greater granularity'''
    whole, fraction = int(x), 100 * float(x - int(x))

    if fraction < 12.5:   fraction =  0
    elif fraction < 37.5: fraction = 25
    elif fraction < 62.5: fraction = 50
    elif fraction < 87.5: fraction = 75
    else: fraction = 100

    return whole + fraction / 100

In [336]:
train['relevance_rounded_quarters'] = train['relevance'].map(lambda x: round_custom(x))

# FEATURES

### FEATURE: EXACT MATCH
* between search term (as phrase) and product title, allowing only differences in case

In [337]:
# Produces a boolean

train['exact'] = pd.Series([True if st.lower() in pt.lower() else False 
                      for st, pt in zip(train['search_term'], train['product_title'])])

### FEATURE: OVERLAPPING WORDS
* allowing for differences in case

In [338]:
# Produces a count >= 0

train['overlapping_words'] = pd.Series([len(set(st.lower().split()) & set(pt.lower().split()))
                      for st, pt in zip(train['search_term'], train['product_title'])])

### FEATURE: PERCENTAGE OVERLAPPING WORDS
* allowing for differences in case

In [339]:
train['percentage_overlapping_words'] = pd.Series([len(set(st.lower().split()) & 
                                                       set(pt.lower().split()))/float(len(st.split()))
                                                       for st, pt in zip(train['search_term'], train['product_title'])])

## TF-IDF

In [340]:
def calc(doc, query, N=1, D={}):
    '''Calculates TF-IDF score for doc, query pair, given a document count (N) and dictionary of word counts (D)'''
    doc = [w.lower() for w in doc.split()]
    counts = Counter(doc)
    query = query.lower().split()
    return sum([counts[word] * log(N / (1+D[word])) for word in set(query) & set(doc)]) # 1 to avoid div by 0

### TF-IDF: product descriptions (all) + product titles

In [341]:
# Number of documents
N = len(set(product_descriptions['product_uid'] + train['product_uid']))

In [342]:
# Dictionary of word-count pairs
words_product_descriptions = ''.join(product_descriptions['product_description']).split()
words_train = ''.join(train['product_title']).split()
D = Counter(words_product_descriptions + words_train) # Not lowercased

In [343]:
# Test
# frequent = sorted(d.items(), key=lambda x: x[1])[:50:-1]
# frequent

In [344]:
scores = OrderedDict()

for row in zip(train['id'], train['product_uid'], train['product_title'], train['search_term']):
    _id, pid, pt, st = row
    TF = Counter(pt.split())
    score = sum([calc(pt, word, N, D) for word in st.lower().split()])
    scores[_id] = score

In [345]:
# Vector
tf_idf = list(scores.values())
train['tf_idf'] = tf_idf
train.ix[0]

id                                                              2
product_uid                                                100001
product_title                   Simpson Strong-Tie 12-Gauge Angle
search_term                                         angle bracket
relevance                                                       3
relevance_rounded                                               3
relevance_rounded_quarters                                      3
exact                                                       False
overlapping_words                                               1
percentage_overlapping_words                                  0.5
tf_idf                                                    4.16932
Name: 0, dtype: object

## TF-IDF: product titles, descriptions, and attributes

In [346]:
# Dictionary of word-count pairs
words_product_descriptions = ''.join(product_descriptions['product_description']).split()
words_train = ''.join(train['product_title']).split()
words_attributes = ''.join(str(at) for at in attributes['value']).split()
d_titles_descriptions_attr = Counter(words_product_descriptions + words_train + words_attributes) # Not lowercased

In [347]:
d_titles_descriptions_attr['the']

436110

In [348]:
scores_titles_descriptions_attr = OrderedDict()

for row in zip(train['id'], train['product_uid'], train['product_title'], train['search_term'], attributes['value']):
    _id, pid, pt, st, attr = row
    TF = Counter(pt.split())
    score = sum([calc(pt, word, N, d_titles_descriptions_attr) for word in st.lower().split()])
    scores_titles_descriptions_attr[_id] = score

In [349]:
tf_idf_titles_descriptions_attr = list(scores_titles_descriptions_attr.values())
train['tf_titles_descriptions_attr'] = scores_titles_descriptions_attr
train.ix[0]

id                                                              2
product_uid                                                100001
product_title                   Simpson Strong-Tie 12-Gauge Angle
search_term                                         angle bracket
relevance                                                       3
relevance_rounded                                               3
relevance_rounded_quarters                                      3
exact                                                       False
overlapping_words                                               1
percentage_overlapping_words                                  0.5
tf_idf                                                    4.16932
tf_titles_descriptions_attr                                     2
Name: 0, dtype: object

## TF-IDF: product titles alone (no descriptions)

## TF-IDF: product descriptions (not all -- only those with product titles) + product titles

## FEATURE: JACCARD DISTANCE

In [350]:
def jaccard_dist(phrase1, phrase2):
    '''Returns the Jaccard distance for two phrases, eg, search query and product title'''
    lst1, lst2 = set(phrase1), set(phrase2)
    return float(len(lst1 & lst2)) / len(lst1 | lst2)

def words_shared(st, pt):
    st_list = re.split('\s', st.lower())
    pt_list = re.split('\s', pt.lower())
    dist = jaccard_dist(st_list, pt_list)
    return dist

train['jaccard'] = pd.Series([words_shared(st, pt) 
                      for st, pt in zip(train['search_term'], train['product_title'])])

### BAG OF WORDS: product descriptions, attributes, titles -- product by product

## DATA CAPPED
* Where the number of 1's, 2's, and 3's is equal

In [351]:
ones = train[train.relevance_rounded == 1].sample(n=len(train[train.relevance_rounded == 1]))
twos = train[train.relevance_rounded == 2].sample(n=len(train[train.relevance_rounded == 1]))
threes = train[train.relevance_rounded == 3].sample(n=len(train[train.relevance_rounded == 1]))
capped = pd.concat([ones, twos, threes])

# THE MODEL

## Train, test split

In [352]:
# Prepare to divide the dataset into test, train (lifted from sklearn Iris example)
train['is_train'] = np.random.uniform(0, 1, len(train)) <= .75

# Split data into training and test
train_data, test_data = train[train['is_train']==True], train[train['is_train']==False]

In [353]:
from collections import Counter
Counter(list(train_data['relevance_rounded']))

Counter({1.0: 3841, 2.0: 26000, 3.0: 25623})

In [354]:
# from copy import deepcopy
# ones, twos, three = 0, 0, 0
# train_data_capped = deepcopy(train_data)
# test = deepcopy(train_data_capped)[0:2]

# for i in test:
#     if test['relevance_rounded'] == 2:

In [355]:
# Prepare to divide the dataset into test, train (lifted from sklearn Iris example)
capped['is_train'] = np.random.uniform(0, 1, len(capped)) <= .75

# Split data into training and test
train_data_capped, test_data_capped = capped[capped['is_train']==True], capped[capped['is_train']==False]

## F1 Score

In [356]:
def f1(ct, preds, data):
    '''Returns the F1 score given the confusion matrix generated with a set of predictions'''
    index = list(pd.crosstab(data['relevance_rounded'], preds))
    total = 0
    for i in list(ct):
        total += sum(list(ct[i]))

    true_positives, false_positives, true_negatives, false_negatives = 0, 0, 0, 0

    for i in zip(list(ct), range(1, len(ct)+1)): 
        row = list(ct.ix[i[1]])
        column = list(ct[i[0]])
    
        tp = row[i[1]-1]
        true_positives  += tp
        false_negatives += sum(row)-tp
        false_positives += sum(column) - tp
        true_negatives = total - tp
    
    return float(2*true_positives) / (2 * true_positives + false_positives + false_negatives)

In [357]:
train.columns

Index(['id', 'product_uid', 'product_title', 'search_term', 'relevance',
       'relevance_rounded', 'relevance_rounded_quarters', 'exact',
       'overlapping_words', 'percentage_overlapping_words', 'tf_idf',
       'tf_titles_descriptions_attr', 'jaccard', 'is_train'],
      dtype='object')

## RANDOM FOREST

In [358]:
def random_forest(features, train_data, test_data):
    clf = RandomForestClassifier(n_jobs=2, class_weight = 'balanced', oob_score = False, criterion = 'entropy')

    y, _ = pd.factorize(train_data['relevance_rounded']) # 0, 1, 2

    clf.fit(train_data[features], y)

    # Predicting on those features will output predictions that match y
    preds = clf.predict(test_data[features])

    # target_names = test['relevance_rounded']
    target_names = ['1', '2', '3']
    out = [target_names[pred] for pred in preds]

    ct = pd.crosstab(test_data['relevance_rounded'], np.asarray(out), rownames=['actual'], colnames=['preds'])
    
    return (ct, preds)

In [359]:
# Features
train.columns[7:]

Index(['exact', 'overlapping_words', 'percentage_overlapping_words', 'tf_idf',
       'tf_titles_descriptions_attr', 'jaccard', 'is_train'],
      dtype='object')

In [360]:
train.columns[8]

'overlapping_words'

In [362]:
for n in range(1, 5):
    for comb in combinations([7, 8, 9, 10, 11, 12], n):
#         print(comb)
        features = train.columns[list(comb)]
        ct, preds = random_forest(features, train_data, test_data)
        print(comb, [train.columns[f] for f in comb], f1(ct, preds, test_data))
        print(ct)
        print()


(7,) ['exact'] 0.572391281457
preds      2     3
actual            
1         27  1247
2        487  8127
3       1070  7645

(8,) ['overlapping_words'] 0.294253615008
preds      1     2     3
actual                  
1        396   394   484
2       2928  3967  1719
3       2673  4931  1111

(9,) ['percentage_overlapping_words'] 0.289469440413
preds      1     2     3
actual                  
1        240   255   779
2       2237  2831  3546
3       1844  4557  2314

(10,) ['tf_idf'] 0.248615814654
preds      1     2     3
actual                  
1        299   167   808
2       3132  2490  2992
3       2315  4564  1836

(11,) ['tf_titles_descriptions_attr'] 0.257270332742
preds      1     2    3
actual                 
1        656   514  104
2       4413  3654  547
3       4177  4062  476

(12,) ['jaccard'] 0.328979196904
preds      1     2     3
actual                  
1        238   325   711
2       1809  3556  3249
3       1636  4753  2326

(7, 8) ['exact', 'overlapping_words'

## RANDOM FOREST WITH CAPPED DATA

In [363]:
test_data[features].head()

Unnamed: 0,percentage_overlapping_words,tf_idf,tf_titles_descriptions_attr,jaccard
0,0.5,4.169321,2,0.2
6,0.666667,7.352333,20,0.125
11,0.8,30.591369,35,0.222222
17,0.0,0.0,69,0.0
20,0.0,0.0,85,0.0


In [364]:
features = train.columns[[7, 8, 9, 10]]
test_data_capped[test_data_capped.is_train == False].head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance,relevance_rounded,relevance_rounded_quarters,exact,overlapping_words,percentage_overlapping_words,tf_idf,tf_titles_descriptions_attr,jaccard,is_train
66814,200791,189611,Ekena Millwork 3/8 in. x 173 in. x 8-5/8 in. P...,model 173k,1.33,1,1.25,False,0,0.0,0.0,200791,0.0,False
63026,189985,181546,Access Lighting Nauticus 1-Light Black Outdoor...,outdoor black deck,1.33,1,1.25,False,2,0.666667,4.960553,189985,0.166667,False
31821,97351,130320,Ideal Pet 6.25 in. x 6.25 in. Small Cat Flap P...,tru frame windows,1.33,1,1.25,False,1,0.333333,2.690626,97351,0.045455,False
59128,178574,173558,BLACK+DECKER 120 MPH 120 CFM 20-Volt Lithium-I...,black and decker cordless lantern,1.33,1,1.25,False,1,0.2,4.198324,178574,0.058824,False
59121,178555,173547,Home Decorators Collection Albright 31 in. Van...,stone are mb11,1.0,1,1.0,False,1,0.333333,3.443702,178555,0.058824,False


In [367]:
train_data.ix[1]

id                                                              3
product_uid                                                100001
product_title                   Simpson Strong-Tie 12-Gauge Angle
search_term                                             l bracket
relevance                                                     2.5
relevance_rounded                                               2
relevance_rounded_quarters                                    2.5
exact                                                       False
overlapping_words                                               0
percentage_overlapping_words                                    0
tf_idf                                                          0
tf_titles_descriptions_attr                                     3
jaccard                                                         0
is_train                                                     True
Name: 1, dtype: object

In [368]:
train_data_capped.ix[60448]

id                                                                         182458
product_uid                                                                176233
product_title                   Woodgrain Millwork WM 327 - 11/16 in. x 2-1/4 ...
search_term                                                          2 wooden leg
relevance                                                                    1.33
relevance_rounded                                                               1
relevance_rounded_quarters                                                   1.25
exact                                                                       False
overlapping_words                                                               0
percentage_overlapping_words                                                    0
tf_idf                                                                          0
tf_titles_descriptions_attr                                                182458
jaccard         

In [369]:
train_data_capped.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance,relevance_rounded,relevance_rounded_quarters,exact,overlapping_words,percentage_overlapping_words,tf_idf,tf_titles_descriptions_attr,jaccard,is_train
25332,77768,122472,Blue Bidet Non-Electric Hot and Cold Dual Nozz...,electric roof system,1.0,1,1.0,False,1,0.333333,2.218892,77768,0.058824,True
71846,215092,201196,Bell&#39;O Fixed Low Profile Wall Mount for 12...,flat screen fireplace,1.33,1,1.25,False,2,0.666667,6.921108,215092,0.111111,True
40097,122325,141678,Glacier Bay Single-Handle Replacement Filtrati...,ac replacement pullout handle,1.33,1,1.25,False,1,0.25,3.340533,122325,0.083333,True
72972,218304,203904,Brady 8 in. x 15 in. Glow-in-the-Dark Self-Sti...,purple glow stick,1.0,1,1.0,False,0,0.0,0.0,218304,0.0,True
56320,170467,168263,5/8 in. x 5-1/2 in. x 8 ft. Pressure-Treated P...,pet treated carpet,1.0,1,1.0,False,0,0.0,0.0,170467,0.0,True


In [371]:
clf = RandomForestClassifier(n_jobs=2, class_weight = 'balanced', oob_score = False, criterion = 'entropy')

y, _ = pd.factorize(train_data_capped['relevance_rounded']) # 0, 1, 2
print (len(y))
clf.fit(train_data_capped[features], y)

# Predicting on those features will output predictions that match y
preds = clf.predict(test_data_capped[features])

# target_names = test['relevance_rounded']
target_names = ['1', '2', '3']
out = [target_names[pred] for pred in preds]

ct = pd.crosstab(test_data_capped['relevance_rounded'], np.asarray(out), rownames=['actual'], colnames=['preds'])

print(ct)
print(preds)

11591
preds     1    2    3
actual               
1       807  240  157
2       551  377  344
3       350  314  614
[0 2 1 ..., 0 0 0]


In [374]:
features

Index(['exact', 'overlapping_words', 'percentage_overlapping_words', 'tf_idf'], dtype='object')

In [372]:
f1(ct, preds, test_data_capped)

0.47895578050079912

In [373]:
ct

preds,1,2,3
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,807,240,157
2,551,377,344
3,350,314,614


# NAIVE BAYES

In [375]:
train.columns

Index(['id', 'product_uid', 'product_title', 'search_term', 'relevance',
       'relevance_rounded', 'relevance_rounded_quarters', 'exact',
       'overlapping_words', 'percentage_overlapping_words', 'tf_idf',
       'tf_titles_descriptions_attr', 'jaccard', 'is_train'],
      dtype='object')

In [376]:
def naive_bayes(features, train_data, test_data):
    clf = GaussianNB()
    y, _ = pd.factorize(train_data['relevance_rounded']) # 0, 1, 2

    clf.fit(train_data[features], y)

    # Predicting on those features will output predictions that match y
    preds = clf.predict(test_data[features])

    # target_names = test['relevance_rounded']
    target_names = ['1', '2', '3']
    out = [target_names[pred] for pred in preds]

    ct = pd.crosstab(test_data['relevance_rounded'], np.asarray(out), rownames=['actual'], colnames=['preds'])
    
    return(ct, preds)

In [392]:
len(train_data)

55464

In [393]:
len(train)

74067

In [394]:
len(preds)

3754

In [377]:
# features = train.columns[['exact']] # Whether it's an exact match or not

# features = ['exact']
features = [7, 8, 9, 10]
ct, preds = naive_bayes(features, train_data, test_data)
ct

preds,1,2
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1150,124
2,6604,2010
3,5252,3463


In [379]:
f1(ct, preds, test_data)

0.22182443578673966

In [380]:
Counter(preds)

Counter({0: 13006, 1: 5597})

In [381]:
print(len(preds), len(train))

18603 74067


In [388]:
ct_capped, preds_capped = naive_bayes(features, train_data_capped, test_data_capped)

In [384]:
len(preds)

18603

In [385]:
len(test_data_capped)

3754

In [389]:
f1(ct_capped, preds_capped, test_data_capped)

0.43127330847096429

In [390]:
len(train)

74067

In [391]:
len(preds_capped)

3754

# THINGS TO TRY / CONTEMPLATE

In [72]:
# What do 3's have in common?
# What do 1's have in common?
# QUERIES CONTAINING WORDS NOT IN DICTIONARY

## TF-IDF sans stopwords (does it make any difference?)

In [208]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')