# W207 - Home Depot Product Search
## Saad, Kevin, & Umber

* Link to Kaggle Competition Site: https://www.kaggle.com/c/home-depot-product-search-relevance


In [7]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# other useful libraries
import csv
import re
import pandas as pd
from nltk.stem.snowball import SnowballStemmer

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier

# Data Loading

This section reads in the data from the main two csv files with the proper encoding and converts everything to lowercase.

In [8]:
# open the train data file
with open('train.csv', 'rb') as csvfile:
    data_iter = csv.reader(csvfile, delimiter = ',', quotechar = '"')
    data = [data for data in data_iter]

# convert strings to lower case
for row in data:
    row[2] = row[2].decode('ISO-8859-1').lower()
    row[3] = row[3].decode('ISO-8859-1').lower()

# load the training data
data_array = np.asarray(data)
train_size = 63068
Y, X = data_array[1:train_size, 4:], data_array[1:train_size, :4]
Y_dev, X_dev = data_array[train_size:, 4:], data_array[train_size:, :4]

Y = np.reshape(Y, (Y.shape[0],))
Y_dev = np.reshape(Y_dev, (Y_dev.shape[0],))

train_labels = np.array(Y, dtype=np.float32)
dev_labels = np.array(Y_dev, dtype=np.float32)

# open the test data
with open('test.csv', 'rb') as csvfile:
    data_iter = csv.reader(csvfile, delimiter = ',', quotechar = '"')
    data = [data for data in data_iter]

# convert strings to lower case
for row in data:
    row[2] = row[2].decode('ISO-8859-1').lower()
    row[3] = row[3].decode('ISO-8859-1').lower()

data_array = np.asarray(data)

# load the test data
X_test = data_array[1:]

print X.shape, X_dev.shape, X_test.shape
print Y.shape, Y_dev.shape

(63067, 4) (11000, 4) (166693, 4)
(63067,) (11000,)


# More Data Loading

The Home Depot data set also includes a group of descriptions and attributes about each product, so this section loads these csv files into dictionaries to aid in feature engineering.

In [9]:
# The data set also ships product descriptions and (for some products) attributes.
# This cell builds two lookup dictionaries from them:
#   pd_dict  : product id (int) -> lower-cased description text
#   att_dict : product id (int) -> {lower-cased attribute name: lower-cased attribute value}
with open('product_descriptions.csv', 'rb') as csvfile:
    data_iter = csv.reader(csvfile, delimiter = ',', quotechar = '"')
    data = [row for row in data_iter]

# dump the headers
data = data[1:]

pd_dict = {}
# build a dictionary from the IDs and descriptions
for row in data:
    pd_dict[int(row[0])] = row[1].decode('ISO-8859-1').lower()

# Now open the attributes. Note that there may be more than one attribute per ID
# and there may be zero attributes for some IDs.
with open('attributes.csv', 'rb') as csvfile:
    data_iter = csv.reader(csvfile, delimiter = ',', quotechar = '"')
    data = [row for row in data_iter]

# dump the headers
data = data[1:]

att_dict = {}
# build a dictionary from the attributes
for row in data:
    # Some of the attribute lines are blank. Skip them entirely so that the
    # values parsed from a previous row are never re-used (and so a blank
    # first row cannot trigger a NameError).
    if row[0] == "":
        continue

    pid = int(row[0])
    name = row[1].decode('ISO-8859-1').lower()
    value = row[2].decode('ISO-8859-1').lower()

    # Create the inner dictionary the first time a product id is seen.
    # NOTE: attribute names are stored lower-cased, so lookups must use
    # lower-case keys as well.
    att_dict.setdefault(pid, {})[name] = value

# Feature Engineering

First, we defined some helper functions to stem the words and make it easier to count matches and approximate matches. Then we applied these techniques to the training data.

In [10]:
# now we need to build the features we're interested in using in our model
from difflib import SequenceMatcher

# for each query word keep its best fuzzy-match ratio against the words of string, then add them up
def sum_similar(query, string):
    """Sum, over the words of query, the best SequenceMatcher ratio against any word of string.

    Both arguments are split on whitespace. A query word contributes 0 when
    string has no words (or no word scores above 0). Returns 0 for an empty
    query.
    """
    candidates = string.split()
    total = 0
    for query_word in query.split():
        ratios = [SequenceMatcher(None, query_word, candidate).ratio()
                  for candidate in candidates]
        # the leading 0 is the floor used when there is nothing to compare against
        total += max([0] + ratios)
    return total

def find_query_in_string(query, string):
    """Count the whitespace-separated words of query that occur in string.

    Matching is by substring (a word counts if it appears anywhere inside
    string), and a word repeated in query is counted each time.
    """
    return sum(1 for word in query.split() if word in string)

def count_words(query):
    """Return the number of whitespace-separated words in query."""
    return len(query.split())

def stem_data(string):
    """Stem every whitespace-separated word of string with the global `stemmer`.

    Each stemmed word is followed by a single space, so a non-empty result
    always ends with a trailing space; an empty input yields "".
    """
    return "".join(stemmer.stem(word) + " " for word in string.split())

# start with 4 features:
# 1) number of words in query
# 2) number of query words present in title
# 3) sum of the ratio differences of each query word to the title
# 4) number of query words present in description
# 5) number of query words present in the MFG Brand attribute
# 6) sum of the ratio differences of each query word to the MFG Brand attribute
num_features = 6
num_query = 0
num_title = 1
sum_title = 2
num_desc = 3
num_brand = 4
sum_brand = 5

train_data = np.zeros((X.shape[0],num_features))

# use a snowball stemmer from nltk to remove word endings
stemmer = SnowballStemmer('english')

for x, t in zip(X, train_data):
    pid = x[1]
    title = stem_data(x[2])
    query = stem_data(x[3])
    description = stem_data(pd_dict[int(pid)])

    t[num_query] = count_words(query)
    t[num_title] = find_query_in_string(query, title)
    t[sum_title] = sum_similar(query, title)
    t[num_desc] = find_query_in_string(query, description)
    
    if int(pid) in att_dict:
        inner_dict = att_dict[int(pid)]
        brand_idx = "MFG Brand Name"
        if brand_idx in inner_dict:
            t[num_brand] += find_query_in_string(query, stem_data(inner_dict[brand_idx]))
            t[sum_brand] = sum_similar(query, stem_data(inner_dict[brand_idx]))

for t in range(2):
    print X[t], train_data[t]

print train_data.shape

[u'2' u'100001' u'simpson strong-tie 12-gauge angle' u'angle bracket'] [ 2.    1.    1.25  1.    0.    0.  ]
[u'3' u'100001' u'simpson strong-tie 12-gauge angle' u'l bracket'] [ 2.    1.    0.65  1.    0.    0.  ]
(63067, 6)


# More Feature Engineering

Now we do the same thing for the dev data

In [11]:
# now do the same for the dev data
dev_data = np.zeros((X_dev.shape[0],num_features))

for x, d in zip(X_dev, dev_data):
    pid = x[1]
    title = stem_data(x[2])
    query = stem_data(x[3])
    description = stem_data(pd_dict[int(pid)])
    
    d[num_query] = count_words(query)
    d[num_title] = find_query_in_string(query, title)
    d[sum_title] = sum_similar(query, title)
    d[num_desc] = find_query_in_string(query, description)
    
    if int(pid) in att_dict:
        inner_dict = att_dict[int(pid)]
        brand_idx = "MFG Brand Name"
        if brand_idx in inner_dict:
            d[num_brand] += find_query_in_string(query, stem_data(inner_dict[brand_idx]))
            d[sum_brand] = sum_similar(query, stem_data(inner_dict[brand_idx]))

for t in range(2):
    print X_dev[t], dev_data[t]

print dev_data.shape

[u'190119' u'181636'
 u'suntouch floor warming 48 ft. x 30 in. 240 v radiant floor warming mat'
 u'floor warming matt'] [ 3.          2.          2.85714286  2.          0.          0.        ]
[u'190120' u'181637'
 u'brinks home security 1-13/16 in. (45 mm) laminated steel padlock with 2 in. shackle'
 u'security padlock'] [ 2.  2.  2.  1.  0.  0.]
(11000, 6)


# More Feature Engineering

And finally the test data.

In [12]:
# Now do the same for the test data
test_data = np.zeros((X_test.shape[0],num_features))

for x, t in zip(X_test, test_data):
    pid = x[1]
    title = stem_data(x[2])
    query = stem_data(x[3])
    description = stem_data(pd_dict[int(pid)])

    t[num_query] = count_words(query)
    t[num_title] = find_query_in_string(query, title)
    t[sum_title] = sum_similar(query, title)
    t[num_desc] = find_query_in_string(query, description)
    
    if int(pid) in att_dict:
        inner_dict = att_dict[int(pid)]
        brand_idx = "MFG Brand Name"
        if brand_idx in inner_dict:
            t[num_brand] += find_query_in_string(query, stem_data(inner_dict[brand_idx]))
            t[sum_brand] = sum_similar(query, stem_data(inner_dict[brand_idx]))

for t in range(2):
    print X_test[t], test_data[t]

print test_data.shape

[u'1' u'100001' u'simpson strong-tie 12-gauge angle' u'90 degree bracket'] [ 3.          0.          0.47222222  1.          0.          0.        ]
[u'4' u'100001' u'simpson strong-tie 12-gauge angle' u'metal l brackets'] [ 3.          1.          1.09444444  1.          0.          0.        ]
(166693, 6)


# First Model

We trained a KNN model as a first attempt, mainly to get predictions into a submittable format. Its accuracy on the dev set never rose above 24%, which makes sense given what we learned later about our data set.

In [13]:
#fit a knn classifier
k_values = np.arange(1, 21, 2)
knn_f1_scores = np.zeros(k_values.shape)
for i in range(k_values.shape[0]):
    clf = KNeighborsClassifier(n_neighbors = k_values[i])
    clf.fit(train_data, Y)
    preds = clf.predict(dev_data)
    knn_f1_scores[i] = metrics.f1_score(Y_dev, preds, average='weighted')

k_val = k_values[np.argmax(knn_f1_scores)]
print 'Optimal value of k is: ', k_val


Optimal value of k is:  11


  'precision', 'predicted', average, warn_for)


In [14]:
clf = KNeighborsClassifier(n_neighbors = 11)
clf.fit(train_data, Y)
preds = clf.predict(test_data)

print 'KNN accuracy: %3.2f' %clf.score(dev_data, Y_dev)

print X_test.shape
print preds.shape

KNN accuracy: 0.24
(166693, 4)
(166693,)


# Logistic Regression

Tried LR with a variety of parameters with minimal success. Since we learn later that our data is just not linearly separable, LR had no chance of being relevant.

In [15]:
train_labels_log = Y * 3 - 3
dev_labels_log = Y_dev * 3 - 3

train_labels_log = np.array(train_labels_log, dtype=int)
dev_labels_log = np.array(dev_labels_log, dtype=int)

clf = LogisticRegression(C=0.1)
clf.fit(train_data, train_labels_log)
preds = clf.predict(dev_data)
print metrics.f1_score(dev_labels_log, preds, average='weighted')

preds = clf.predict(test_data)

log_preds = np.array(preds, dtype=np.float32)

log_preds = (log_preds + 3)/3

print log_preds.shape


TypeError: ufunc 'multiply' did not contain a loop with signature matching types dtype('<U147') dtype('<U147') dtype('<U147')

# Decision Trees

Tried various decision trees with various boosting techniques, but was never able to achieve much better than KNN.

In [None]:
dt = DecisionTreeClassifier(criterion="entropy", splitter="best", random_state=0)
dt.fit(train_data, Y)

print 'Accuracy (a decision tree):', dt.score(dev_data, Y_dev)

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(train_data, Y)

print 'Accuracy (a random forest):', rfc.score(dev_data, Y_dev)

abc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=100, learning_rate=0.1)

abc.fit(train_data, Y)
print 'Accuracy (adaboost with decision trees):', abc.score(dev_data, Y_dev)

abc = AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=100), n_estimators=100, learning_rate=0.1)

abc.fit(train_data, Y)
print 'Accuracy (adaboost with random forest):', abc.score(dev_data, Y_dev)

# Bagging Regressor

Started to show some promise with the bagging regressor, but still not able to beat the mean baseline.

In [None]:
from sklearn.ensemble import BaggingRegressor

num_e = np.arange(10,500,5)
score = np.zeros(num_e.shape)

for e, i in zip(num_e, range(score.shape[0])):
    br = BaggingRegressor(base_estimator=None, n_estimators=e)
    br.fit(train_data, Y)
    score[i] = br.score(dev_data, Y_dev)
    
    print 'BaggingRegressor n_estimators: %3d score: %4f' %(e, score[i])
    
print 'Best score is: ', np.amax(score)
print 'Num estimators: ', num_e[np.argmax(score)]

# Random Forest Regressor

Showed some promise, but was never able to beat the mean baseline by itself

In [None]:
from sklearn import pipeline, grid_search
from sklearn.metrics import mean_squared_error, make_scorer

num_e = np.arange(10,500,5)
score = np.zeros(num_e.shape)

for e, i in zip(num_e, range(score.shape[0])):
    rfr = RandomForestRegressor(n_jobs = -1, n_estimators=e)
    rfr.fit(train_data, Y)

    score[i] = rfr.score(dev_data, Y_dev)
    
    print 'RandomForestRegressor n_estimators: %3d score: %4f' %(e, score[i])

print 'Best score is: ', np.amax(score)
print 'Num estimators: ', num_e[np.argmax(score)]


In [None]:
rfr = RandomForestRegressor(n_jobs = -1, n_estimators=135)
rfr.fit(train_data, Y)

preds = rfr.predict(test_data)

for i in range(10):
    print X_test[i], preds[i]

# Bagging Regressor with Random Forest Regressors

We were finally able to beat the mean baseline with a bunch of bagging regressors that use RFs instead of the default decision trees. This ended up being our top performing model, though we tried several others, as you see below.

In [None]:
from sklearn.ensemble import BaggingRegressor

br_est = [100,150,200,250,300,350]

for b in br_est:
    rf = RandomForestRegressor(n_estimators=20, random_state=0)
    clf = BaggingRegressor(base_estimator=rf, n_estimators=b, max_samples=0.1, random_state=10)
    clf.fit(train_data, Y)


    score = clf.score(dev_data, Y_dev)

    print 'BaggingRegressor n_estimators: %3d RF estimators: %3d score: %4f' %(b, 20, score)


# Top Performer

In [None]:
rf_est = 20
br_est = 300

rf = RandomForestRegressor(n_estimators=rf_est, random_state=0)
clf = BaggingRegressor(base_estimator=rf, n_estimators=br_est, max_samples=0.1, random_state=10)
clf.fit(train_data, Y)
preds = clf.predict(test_data)
score = clf.score(dev_data, Y_dev)

print 'BaggingRegressor n_estimators: %3d RF estimators: %3d score: %4f' %(br_est, rf_est, score)

# Neural Networks

Tried a variety of NN techniques, but once again none were able to beat the mean baseline. Perhaps deeper nets would have worked better, but I lacked the computational power to go much further.

In [None]:
import theano 
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

# Report which device and float width theano was configured with.
print theano.config.device # We're using CPUs (for now)
print theano.config.floatX # Should be 64 bit for CPUs

# used to time training and prediction in the cells below
import time

# seed NumPy's global RNG (the network weights below are drawn with np.random.randn)
np.random.seed(0)



In [None]:
def binarizeY(data):
    """One-hot encode a 1-D array of class indices into 13 columns.

    Each value is truncated to an integer (astype(np.int64)) and used as the
    column index that is set to 1 in that value's row. Returns a float array
    of shape (data.size, 13).
    """
    one_hot = np.zeros((data.size, 13))
    columns = data.astype(np.int64)
    # row r gets a 1 in column columns[r]
    one_hot[np.arange(data.size), columns] = 1
    return one_hot

Y_b = Y*6 - 5.95
Y_dev_b = Y_dev*6 - 5.95

Y_b = np.array(Y_b, dtype=int)
Y_dev_b = np.array(Y_dev_b, dtype=int)

train_labels_b = binarizeY(Y_b)
dev_labels_b = binarizeY(Y_dev_b)
numClasses = train_labels_b[1].size
numFeatures = train_data[1].size
print 'Classes = %d' %(numClasses)



In [None]:
## (1) Parms
numHiddenNodes = 1000

def _init_weights(rows, cols):
    """Shared theano weight matrix of shape (rows, cols), drawn from N(0, 1) and scaled by 0.01."""
    return theano.shared(np.asarray(np.random.randn(rows, cols) * .01))

# input -> hidden -> hidden -> hidden -> output (created in this order so the
# random draws happen in the same sequence)
w_1 = _init_weights(numFeatures, numHiddenNodes)
w_2 = _init_weights(numHiddenNodes, numHiddenNodes)
w_3 = _init_weights(numHiddenNodes, numHiddenNodes)
w_4 = _init_weights(numHiddenNodes, numClasses)
params = [w_1, w_2, w_3, w_4]

## (2) Model
Xm = T.matrix()
Ym = T.matrix()
def model(Xm, w_1, w_2, w_3, w_4):
    """Three sigmoid layers followed by a softmax output layer (no bias terms)."""
    activation = Xm
    for weights in (w_1, w_2, w_3):
        activation = T.nnet.sigmoid(T.dot(activation, weights))
    return T.nnet.softmax(T.dot(activation, w_4))
y_hat = model(Xm, w_1, w_2, w_3, w_4)


## (3) Cost: mean categorical cross-entropy between the softmax output and the one-hot targets
cost = T.mean(T.nnet.categorical_crossentropy(y_hat, Ym))

## (4) Minimization: plain gradient descent on every weight matrix
alpha = 0.01
def backprop(cost, w):
    """Build the theano update list: each weight moves against its gradient by the step size alpha."""
    grads = T.grad(cost=cost, wrt=w)
    return [[weight, weight - grad * alpha] for weight, grad in zip(w, grads)]
update = backprop(cost, params)
# train(): one gradient step on a batch, returns the cost
train = theano.function(inputs=[Xm, Ym], outputs=cost, updates=update, allow_input_downcast=True)
# predict(): index of the most probable class for each row
y_pred = T.argmax(y_hat, axis=1)
predict = theano.function(inputs=[Xm], outputs=y_pred, allow_input_downcast=True)

miniBatchSize = 1 
def gradientDescentStochastic(epochs):
    trainTime = 0.0
    predictTime = 0.0
    start_time = time.time()
    for i in range(epochs):
        for start, end in zip(range(0, len(train_data), miniBatchSize), range(miniBatchSize, len(train_data), miniBatchSize)):
            cost = train(train_data[start:end], train_labels_b[start:end])
        trainTime =  trainTime + (time.time() - start_time)
        print '%d) accuracy = %.4f' %(i+1, np.mean(np.argmax(dev_labels_b, axis=1) == predict(dev_data)))
    print 'train time = %.2f' %(trainTime)

gradientDescentStochastic(500)

start_time = time.time()
predict(test_data)   
print 'predict time = %.2f' %(time.time() - start_time)


# GMM and PCA

After a little data exploration on the different classes we had in the labelled data, we finally get a look at how muddled these data are. PCA captures >90% of the variance in 2 components, and you can see from the plot that clustering will not produce very good results.

In [None]:
print min(Y), max(Y)

Y1 = Y[Y==1]
Y125 = Y[Y==1.25]
Y133 = Y[Y==1.33]
Y15 = Y[Y==1.5]
Y167 = Y[Y==1.67]
Y175 = Y[Y==1.75]
Y2 = Y[Y==2]
Y225 = Y[Y==2.25]
Y233 = Y[Y==2.33]
Y25 = Y[Y==2.5]
Y267 = Y[Y==2.67]
Y275 = Y[Y==2.75]
Y3 = Y[Y==3]

#for i in range(100):
#   print Y[i]

plt.hist(Y, normed=1, facecolor='green', alpha=0.75)
plt.show()

print Y.shape
print Y1.shape[0] + Y125.shape[0] + Y133.shape[0] + Y15.shape[0] + Y167.shape[0] + Y175.shape[0] + Y2.shape[0] + Y225.shape[0] + Y233.shape[0] + Y25.shape[0] + Y267.shape[0] + Y275.shape[0] + Y3.shape[0]

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Project the six features onto two principal components. The projection gets
# its own name so the raw training rows in X are not overwritten.
pca = PCA(n_components=2)
train_data_X = pca.fit_transform(train_data)

print(pca.explained_variance_ratio_)

clusters = 5

km = KMeans(n_clusters=clusters)
Yhat = km.fit_predict(train_data_X)

# Plot the training data.
# Points are coloured by their relevance label (not by the KMeans cluster in
# Yhat); the float labels are used because Y holds the labels as strings.
plt.figure(clusters, figsize=(10,10))
plt.scatter(train_data_X[:,0], train_data_X[:,1], c=train_labels)
plt.title('Depot Data Clustered by KMeans n=%2d' %clusters)
plt.xlabel('principal component 1')
plt.ylabel('principal component 2')

plt.show()

# GMM and PCA continued

And indeed, they did not...

In [None]:
from sklearn.mixture import GMM

pca = PCA(n_components=2)
X = pca.fit_transform(train_data)
test_data_X = pca.transform(test_data)

buckets = [1, 1.25, 1.33, 1.5, 1.67, 1.75, 2, 2.25, 2.33, 2.5, 2.67, 1.75, 3]

train_data1 = X[Y==buckets[0]]
train_data2 = X[Y==buckets[1]]
train_data3 = X[Y==buckets[2]]
train_data4 = X[Y==buckets[3]]
train_data5 = X[Y==buckets[4]]
train_data6 = X[Y==buckets[5]]
train_data7 = X[Y==buckets[6]]
train_data8 = X[Y==buckets[7]]
train_data9 = X[Y==buckets[8]]
train_data10 = X[Y==buckets[9]]
train_data11 = X[Y==buckets[10]]
train_data12 = X[Y==buckets[11]]
train_data13 = X[Y==buckets[12]]

num_comps = 2
#fit 13 GMM models, one for each bucket
clf_1 = GMM(n_components=num_comps, covariance_type='full')
clf_1.fit(train_data1)
clf_2 = GMM(n_components=num_comps, covariance_type='full')
clf_2.fit(train_data2)
clf_3 = GMM(n_components=num_comps, covariance_type='full')
clf_3.fit(train_data3)
clf_4 = GMM(n_components=num_comps, covariance_type='full')
clf_4.fit(train_data4)
clf_5 = GMM(n_components=num_comps, covariance_type='full')
clf_5.fit(train_data5)
clf_6 = GMM(n_components=num_comps, covariance_type='full')
clf_6.fit(train_data6)
clf_7 = GMM(n_components=num_comps, covariance_type='full')
clf_7.fit(train_data7)
clf_8 = GMM(n_components=num_comps, covariance_type='full')
clf_8.fit(train_data8)
clf_9 = GMM(n_components=num_comps, covariance_type='full')
clf_9.fit(train_data9)
clf_10 = GMM(n_components=num_comps, covariance_type='full')
clf_10.fit(train_data10)
clf_11 = GMM(n_components=num_comps, covariance_type='full')
clf_11.fit(train_data11)
clf_12 = GMM(n_components=num_comps, covariance_type='full')
clf_12.fit(train_data12)
clf_13 = GMM(n_components=num_comps, covariance_type='full')
clf_13.fit(train_data13)

#calculate scores for both the positive and the negative GMM models for the 2D projected test data
scores1 = clf_1.score(test_data_X)
scores2 = clf_2.score(test_data_X)
scores3 = clf_3.score(test_data_X)
scores4 = clf_4.score(test_data_X)
scores5 = clf_5.score(test_data_X)
scores6 = clf_6.score(test_data_X)
scores7 = clf_7.score(test_data_X)
scores8 = clf_8.score(test_data_X)
scores9 = clf_9.score(test_data_X)
scores10 = clf_10.score(test_data_X)
scores11 = clf_11.score(test_data_X)
scores12 = clf_12.score(test_data_X)
scores13 = clf_13.score(test_data_X)

scores = np.array(zip(scores1, scores2, scores3, scores4, scores5, scores6, scores7, scores8, scores9, scores10, scores11, scores12, scores13))

scores = np.exp(scores)

n_scores = np.zeros(scores.shape)
#normalize the scores:
for i in range(scores.shape[0]):
    total = np.sum(scores[i])
    
    for j in range(scores.shape[1]):
        n_scores[i][j] = scores[i][j] / total

for i in range(10):
    print scores[i], n_scores[i]

preds = np.dot(n_scores, buckets)


In [None]:
# smallest and largest value currently held in preds
print min(preds), max(preds)

# File Output

Running this cell would output the preds array to a csv file in the proper format for submission.

In [None]:
# Write one "id,relevance" line per test row, pairing each prediction in preds
# with the id in column 0 of the matching X_test row. The with-block closes the
# file even if a write fails part-way through.
with open("test_labeled.csv", "w") as test_lab_f: # you will need to edit this directory
    test_lab_f.write("\"id\",\"relevance\"\n")

    for pred, test_row in zip(preds, X_test):
        test_lab_f.write(str(test_row[0]) + "," + str(pred) + "\n")

# Multinomial Naive Bayes w/ TF-IDF Weighting

Different code base :)

In [None]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# other useful libraries
import csv
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
#nltk.download()

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor, BaggingRegressor
from sklearn.metrics import mean_squared_error

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

In [None]:
def _read_csv_rows(path):
    """Return the data rows of a CSV file as a 2-D array of strings.

    The first row (the column headers) is dropped so that every frame built
    below contains data records only.
    """
    with open(path, 'rb') as csvfile:
        fh = csv.reader(csvfile, delimiter = ',', quotechar = '"')
        rows = [row for row in fh]
    return np.asarray(rows[1:])

# training rows, indexed by column 0 (the row id)
data = _read_csv_rows('train.csv')
df1 = pd.DataFrame({
        'pid': data[:,1],
        'score': data[:,4],
        'query': data[:,3],
        'title': data[:,2]
    }, index=data[:,0])

# product descriptions, indexed by column 0 (the product id)
data = _read_csv_rows('product_descriptions.csv')
df2 = pd.DataFrame({
        'desc': data[:,1]
    }, index=data[:,0])

# product attributes (one row per attribute), indexed by column 0 (the product id)
data = _read_csv_rows('attributes.csv')
df3 = pd.DataFrame({
        'attr_name': data[:,1],
        'attr_value': data[:,2]
    }, index=data[:,0])

In [None]:
# Attach each product's description to its training rows. merge() defaults to
# an inner join, so any row of df1 whose pid has no description (including a
# header row, if df1 still carries one) is already dropped here.
df = df1.merge(df2,left_on='pid',right_index=True)
df['text'] = df['title'] + ' ' + df['desc']

# First 10,000 rows are the dev split, the rest is the training split.
# Slicing starts at 0: starting at 1 would silently discard a real sample.
dev_data = df[:10000]
train_data = df[10000:]

In [None]:
# Built once instead of on every cleanDoc call; cleanDoc runs for every query
# and every product text.
_CLEANDOC_STOPSET = set(stopwords.words('english'))
# snowball stemmer from nltk, used to remove word endings
_CLEANDOC_STEMMER = SnowballStemmer('english')

def cleanDoc(doc):
    """Normalise a document into a space-separated string of stemmed tokens.

    Steps: lower-case, tokenise, drop English stopwords and tokens of one or
    two characters, stem what is left, and join with single spaces.

    doc is expected to be text that has already been decoded; the result is a
    unicode string.
    """
    # Tokenise: HTML anchors and tags are kept as single tokens; otherwise a
    # token is a run of word characters, apostrophes, '@' or '#'.
    tokens = re.findall(r"<a.*?/a>|<[^\>]*>|[\w'@#]+", doc.lower() ,flags = re.UNICODE | re.LOCALE)
    # Remove stopwords and tokens shorter than 3 characters
    clean = [token for token in tokens if token not in _CLEANDOC_STOPSET and len(token) > 2]
    # Stemming
    stemmed = [_CLEANDOC_STEMMER.stem(word) for word in clean]
    # Join as unicode: calling str() on each token raises UnicodeEncodeError
    # under Python 2 as soon as a token contains a non-ASCII character.
    final = u' '.join(stemmed)
    return final

def _clean_column(column):
    """Decode every entry of column from ISO-8859-1 and run it through cleanDoc."""
    return [cleanDoc(text.decode('ISO-8859-1')) for text in column]

def _round_scores(column):
    """Convert every entry of column to float and round it to the nearest integer."""
    return [int(round(float(value))) for value in column]

train_query = _clean_column(train_data['query'])
train_text = _clean_column(train_data['text'])
# The relevance scores are rounded to whole numbers so that the classifier is
# trained on a small set of integer class labels.
train_score = _round_scores(train_data['score'])

dev_query = _clean_column(dev_data['query'])
dev_text = _clean_column(dev_data['text'])
dev_score = _round_scores(dev_data['score'])

In [None]:
def match_words(query, string):
    """Return the query words found in string, each repeated once per occurrence.

    Both inputs are lower-cased. Occurrences are counted as (non-overlapping)
    substrings of string, and the result is a single space-separated string;
    it is empty when nothing matches.
    """
    haystack = string.lower()
    repeated = []
    for word in query.lower().split():
        repeated.extend([word] * haystack.count(word))
    return ' '.join(repeated)

# TF-IDF over the "matched words" documents: for every (query, product text)
# pair the document is the list of query words found in the text.
v1 = TfidfVectorizer(min_df=1)

train_docs = list(match_words(pair[0], pair[1]) for pair in zip(train_query, train_text))
train = v1.fit_transform(train_docs)

# the dev documents are encoded with the vocabulary learned on the training documents
dev_docs = list(match_words(pair[0], pair[1]) for pair in zip(dev_query, dev_text))
dev = v1.transform(dev_docs)

In [None]:
v1 = TfidfVectorizer(min_df=1)
train = v1.fit_transform(map(lambda x: match_words(x[0],x[1]),zip(train_query,train_text)))

dev = v1.transform(map(lambda x: match_words(x[0],x[1]),zip(dev_query,dev_text)))

a_vec = [0.0001,0.001,0.01,0.1,0.5,0.8,1.1,2,5,10]
acc_vec = []
rmse_vec = []
for a in a_vec:
    mnb = MultinomialNB(alpha=a)
    mnb.fit(train,train_score)
    acc_vec.append(metrics.accuracy_score(dev_score,mnb.predict(dev)))
    rmse_vec.append(metrics.mean_squared_error(dev_score,mnb.predict(dev))**0.5)
    
plt.figure(figsize=(12,12))
plt.subplot(1,2,1)
plt.plot(a_vec,acc_vec)
plt.subplot(1,2,2)
plt.plot(a_vec,rmse_vec)

print 'Best Model: alpha = ', a_vec[np.argmin(rmse_vec)], ' RMSE: ', np.amin(rmse_vec), ' Accuracy: ', acc_vec[np.argmin(rmse_vec)]