# HOME DEPOT KAGGLE COMPETITION

In [1]:
import numpy as np
import pandas as pd
import re
from collections import Counter
from collections import OrderedDict
from itertools import combinations
from math import log
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

## LOAD DATA

In [2]:
product_descriptions = pd.read_csv('DATA/product_descriptions.csv', header=0, encoding = "ISO-8859-1")
attributes = pd.read_csv('DATA/attributes.csv', header=0, encoding = "ISO-8859-1")
train = pd.read_csv('DATA/train.csv', header=0,  encoding = "ISO-8859-1")
test = pd.read_csv('DATA/test.csv', header=0,  encoding = "ISO-8859-1")

## NORMALIZE EVALUATOR SCORES
* Evaluator scores are are continuous averages on the interval [1, 3]; the classifier seems unable to handles this
* relevance_rounded consists of values: 1, 2, 3

In [3]:
# Since relevance is an aggregate, values are continuous rather than discrete
# It appears the classifier can only handle a limited number of values
# So we round up
train['relevance_rounded'] = train['relevance'].map(lambda x: round(x))

* Note imbalance of 1's, 2's, and 3's:

In [4]:
Counter(train['relevance_rounded'])

Counter({1.0: 5115, 2.0: 34614, 3.0: 34338})

## CREATE FEATURES

### FEATURE: EXACT MATCH (_boolean_)
* between search term (as phrase) and product title, allowing only differences in case

In [5]:
train['exact'] = pd.Series([True if st.lower() in pt.lower() else False 
                      for st, pt in zip(train['search_term'], train['product_title'])])

### FEATURE: OVERLAPPING WORDS (_count >= 0_)
* between search term (as phrase) and product title, allowing only differences in case

In [6]:
train['overlapping_words'] = pd.Series([len(set(st.lower().split()) & set(pt.lower().split()))
                      for st, pt in zip(train['search_term'], train['product_title'])])

### FEATURE: OVERLAPPING WORDS (_percentage_)
* allowing for differences in case

In [7]:
train['percentage_overlapping_words'] = pd.Series([len(set(st.lower().split()) & 
                                                       set(pt.lower().split()))/float(len(st.split()))
                                                       for st, pt in zip(train['search_term'], train['product_title'])])

## TF-IDF

In [8]:
def calc(doc, query, N=1, D={}):
    '''Calculates TF-IDF score for doc, query pair, given a document count (N) and dictionary of word counts (D)'''
    doc = [w.lower() for w in doc.split()]
    counts = Counter(doc)
    query = query.lower().split()
    return sum([counts[word] * log(N / (1+D[word])) for word in set(query) & set(doc)]) # 1 to avoid div by 0

### FEATURE: TF-IDF: product descriptions (all) + product titles

In [9]:
# Number of documents
N = len(set(product_descriptions['product_uid'] + train['product_uid']))

# Dictionary of word-count pairs
words_product_descriptions = ''.join(product_descriptions['product_description']).split()
words_train = ''.join(train['product_title']).split()
D = Counter(words_product_descriptions + words_train) # Not lowercased

scores = OrderedDict()
for row in zip(train['id'], train['product_uid'], train['product_title'], train['search_term']):
    _id, pid, pt, st = row
    TF = Counter(pt.split())
    score = sum([calc(pt, word, N, D) for word in st.lower().split()])
    scores[_id] = score

# Vector
train['tf_idf'] = list(scores.values())

### FEATURE: TF-IDF: product titles, descriptions, and attributes

In [10]:
# Dictionary of word-count pairs
words_product_descriptions = ''.join(product_descriptions['product_description']).split()
words_train = ''.join(train['product_title']).split()
words_attributes = ''.join(str(at) for at in attributes['value']).split()
d_titles_descriptions_attr = Counter(words_product_descriptions + words_train + words_attributes) # Not lowercased

scores_titles_descriptions_attr = OrderedDict()

for row in zip(train['id'], train['product_uid'], train['product_title'], train['search_term'], attributes['value']):
    _id, pid, pt, st, attr = row
    TF = Counter(pt.split())
    score = sum([calc(pt, word, N, d_titles_descriptions_attr) for word in st.lower().split()])
    scores_titles_descriptions_attr[_id] = score

train['tf_titles_descriptions_attr'] = list(scores_titles_descriptions_attr.values())


### FEATURE: JACCARD DISTANCE

In [11]:
def jaccard_dist(phrase1, phrase2):
    '''Returns the Jaccard distance for two phrases, eg, search query and product title'''
    lst1, lst2 = set(phrase1), set(phrase2)
    return float(len(lst1 & lst2)) / len(lst1 | lst2)

def words_shared(st, pt):
    st_list = re.split('\s', st.lower())
    pt_list = re.split('\s', pt.lower())
    dist = jaccard_dist(st_list, pt_list)
    return dist

train['jaccard'] = pd.Series([words_shared(st, pt) 
                      for st, pt in zip(train['search_term'], train['product_title'])])

## DATA CAPPED
* Where the number of 1's, 2's, and 3's is equal
* Capping at this stage so that all the feature vectors are captured

In [12]:
ones = train[train.relevance_rounded == 1].sample(n=len(train[train.relevance_rounded == 1]))
twos = train[train.relevance_rounded == 2].sample(n=len(train[train.relevance_rounded == 1]))
threes = train[train.relevance_rounded == 3].sample(n=len(train[train.relevance_rounded == 1]))
capped = pd.concat([ones, twos, threes])

# SPLIT THE DATA, DEVISE A METRIC, and CREATE MODELS

## Split the data into train & test, whole (uncapped) set and _capped_ (subset)

### Uncapped (_uneven_ number of 1's, 2's, and 3's)

In [13]:
# Prepare to divide the dataset into test, train (lifted from sklearn Iris example)
train['is_train'] = np.random.uniform(0, 1, len(train)) <= .75

# Split data into training and test
train_data, test_data = train[train['is_train']==True], train[train['is_train']==False]

In [14]:
Counter(list(train_data['relevance_rounded']))

Counter({1.0: 3837, 2.0: 25929, 3.0: 25766})

### Capped (_even_ number of 1's, 2's and 3's)

In [15]:
# Prepare to divide the dataset into test, train (lifted from sklearn Iris example)
capped['is_train'] = np.random.uniform(0, 1, len(capped)) <= .75

# Split data into training and test
train_data_capped, test_data_capped = capped[capped['is_train']==True], capped[capped['is_train']==False]

In [16]:
Counter(list(train_data_capped['relevance_rounded']))

Counter({1.0: 3899, 2.0: 3816, 3.0: 3790})

## METRIC: F1 Score

In [17]:
def f1(ct, preds, data):
    '''Returns the F1 score given the confusion matrix generated with a set of predictions'''
    index = list(pd.crosstab(data['relevance_rounded'], preds))
    total = 0
    for i in list(ct):
        total += sum(list(ct[i]))

    true_positives, false_positives, true_negatives, false_negatives = 0, 0, 0, 0

    for i in zip(list(ct), range(1, len(ct)+1)): 
        row = list(ct.ix[i[1]])
        column = list(ct[i[0]])
    
        tp = row[i[1]-1]
        true_positives  += tp
        false_negatives += sum(row)-tp
        false_positives += sum(column) - tp
        true_negatives = total - tp
    
    return float(2*true_positives) / (2 * true_positives + false_positives + false_negatives)

## MODEL 1: RANDOM FOREST

In [18]:
def random_forest(features, training_data, testing_data, y_):
    clf = RandomForestClassifier(n_jobs=2, class_weight = 'balanced', oob_score = False, criterion = 'entropy')

    clf.fit(training_data[features], y_)

    preds = clf.predict(testing_data[features])

    target_names = ['1', '2', '3']
    out = [target_names[pred] for pred in preds]

    ct = pd.crosstab(testing_data['relevance_rounded'], np.asarray(out), rownames=['actual'], colnames=['preds'])
    
    return (ct, preds)

## MODEL 2: NAIVE BAYES

In [19]:
def naive_bayes(features, training_data, testing_data, y_):
    clf = GaussianNB()

    clf.fit(training_data[features], y_)

    preds = clf.predict(testing_data[features])

    target_names = ['1', '2', '3']
    out = [target_names[pred] for pred in preds]

    ct = pd.crosstab(testing_data['relevance_rounded'], np.asarray(out), rownames=['actual'], colnames=['preds'])
    
    return(ct, preds)


# RESULTS

In [20]:
y_uncapped, _ = pd.factorize(train_data['relevance_rounded']) # 0, 1, 2
y_capped, _ = pd.factorize(train_data_capped['relevance_rounded']) # 0, 1, 2


scores = []
for n in range(1, 6):
    for comb in combinations([6, 7, 8, 9, 10, 11], n):
        features = train.columns[list(comb)]
        
        # Random Forest
        rf_ct, rf_preds = random_forest(features, train_data, test_data, y_uncapped)
        rf_ct_capped, rf_preds_capped = random_forest(features, train_data_capped, test_data_capped, y_capped)

        rf_f1_uncapped = f1(rf_ct, rf_preds, test_data)
        rf_f1_capped = f1(rf_ct_capped, rf_preds_capped, test_data_capped)
        
        # Naive Bayes
        nb_ct, nb_preds = naive_bayes(features, train_data, test_data, y_uncapped)
        nb_ct_capped, nb_preds_capped = naive_bayes(features, train_data_capped, test_data_capped, y_capped)

        nb_f1_uncapped = f1(nb_ct, nb_preds, test_data)
        nb_f1_capped = f1(nb_ct_capped, nb_preds_capped, test_data_capped)
        

        scores.append([list(features), 
                       rf_ct, 
                       rf_preds, 
                       rf_f1_uncapped, 
                       rf_ct_capped, 
                       rf_preds_capped, 
                       rf_f1_capped, 
                       
                       nb_ct, 
                       nb_preds, 
                       nb_f1_uncapped, 
                       nb_ct_capped, 
                       nb_preds_capped, 
                       nb_f1_capped])

In [21]:
scores_df = pd.DataFrame(scores, columns = ['features', 'rf_ct', 'rf_preds', 'rf_f1', 
                                            'rf_ct_capped', 'rf_preds_capped', 'rf_f1_capped', 
                                            
                                            'nb_ct', 'nb_preds', 'nb_f1', 
                                            'nb_ct_capped', 'nb_preds_capped', 'nb_f1_capped'])

## MEAN SCORES

### Random Forest Mean F1 for Whole (Uncapped) Data Set 

In [22]:
rf_mean_f1 = sum(scores_df['rf_f1'])/len(scores_df)
rf_mean_f1

0.28067292160986529

### Naive Bayes Mean F1 (Uncapped) Data Set

In [23]:
nb_mean_f1 = sum(scores_df['nb_f1'])/len(scores)
nb_mean_f1

0.43886827696742492

### Random Forest Mean F1 for Capped Data Set

In [24]:
rf_mean_f1_capped = sum(scores_df['rf_f1_capped'])/len(scores_df)
rf_mean_f1_capped

0.45141146877617322

### Naive Bayes Mean F1 for Capped Data Set

In [25]:
nb_mean_f1_capped = sum(scores_df['nb_f1_capped'])/len(scores_df)
nb_mean_f1_capped

0.4299712874778982

### Summary

In [26]:
summary = pd.DataFrame([[rf_mean_f1, rf_mean_f1_capped], [nb_mean_f1, nb_mean_f1_capped]], 
                   index=['Random Forest', 'Naive Bayes'],
                    columns = ['All Data', 'Capped'])
summary

Unnamed: 0,All Data,Capped
Random Forest,0.280673,0.451411
Naive Bayes,0.438868,0.429971


## BEST SCORES: RF vs NB, on WHOLE & CAPPED DATA

### Random Forest Best Score & Corresponding Features

In [27]:
scores_df['rf_ct_width'] = pd.Series([len(np.array(row)[0]) for row in scores_df.rf_ct])
tmp = scores_df[scores_df['rf_ct_width'] > 2] # Classifying into three features
tmp[tmp['rf_f1'] == max(tmp['rf_f1'])][['rf_f1', 'features', 'rf_ct']]

Unnamed: 0,rf_f1,features,rf_ct
6,0.334179,"[exact, overlapping_words]",preds 1 2 3 actual ...


### Random Forest Best _Capped_ Score & Corresponding Features

In [28]:
scores_df[scores_df['rf_f1_capped'] == max(scores_df['rf_f1_capped'])][['rf_f1_capped', 'features', 'rf_ct_capped']]

Unnamed: 0,rf_f1_capped,features,rf_ct_capped
56,0.483073,"[exact, overlapping_words, percentage_overlapp...",preds 1 2 3 actual 1 ...


### Naive Bayes Best Score & Corresponding Features
* (Where the classifier is classifying into three classes)

In [29]:
scores_df['nb_ct_width'] = pd.Series([len(np.array(row)[0]) for row in scores_df.nb_ct])
tmp = scores_df[scores_df['nb_ct_width'] > 2] # Classifying into three features
tmp[tmp['nb_f1'] == max(tmp['nb_f1'])][['nb_f1', 'features', 'nb_ct']]

Unnamed: 0,nb_f1,features,nb_ct
59,0.327057,"[exact, overlapping_words, tf_idf, tf_titles_d...",preds 1 2 3 actual ...


### Naive Bayes Best _Capped_ Score & Corresponding Features

In [30]:
scores_df[scores_df['nb_f1_capped'] == max(scores_df['nb_f1_capped'])][['nb_f1_capped', 'features', 'nb_ct_capped']]

Unnamed: 0,nb_f1_capped,features,nb_ct_capped
15,0.449479,"[percentage_overlapping_words, tf_idf]",preds 1 2 3 actual 1 ...
