# 50 Shades of Text  - Leveraging Natural Language Processing

## Notebook explained during the talk I gave in June 2018 available at the following link:
## [Video](https://youtu.be/M6U_YrnWIa8?t=224)
## Data used for this notebook: 
## [Dataset](https://www.kaggle.com/c/home-depot-product-search-relevance/data)
## [Wikipedia GloVe Word Embeddings](http://nlp.stanford.edu/data/glove.6B.zip)

In [3]:
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten,Input,InputLayer,Reshape
from keras.layers import Convolution1D, MaxPooling1D
from keras.utils import np_utils
from keras.layers.convolutional import Conv1D,Conv2D
from keras.layers import GRU, LSTM
from keras.layers.convolutional import MaxPooling1D,MaxPooling2D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MinMaxScaler
import gensim
import keras
import warnings
from gensim.models import KeyedVectors
warnings.filterwarnings('ignore',category=FutureWarning)

In [4]:
# Util function to generate ngrams given a sentence

def word2ngrams(text, ngrams = 4):
    return ["".join(j) for j in zip(*[text[i:] for i in range(ngrams)])]

In [5]:
# Read File training and attributes and extracting the category property

np.random.seed(32)
#Reading training set
df = pd.read_csv('data/train.csv' , sep=',' , encoding='latin-1')
df.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67


In [6]:
# df with attributes

df_attr = pd.read_csv('data/attributes.csv' , sep=',' , encoding='latin-1')
df_attr.head()

Unnamed: 0,product_uid,name,value
0,100001.0,Bullet01,Versatile connector for various 90Â° connectio...
1,100001.0,Bullet02,Stronger than angled nailing or screw fastenin...
2,100001.0,Bullet03,Help ensure joints are consistently straight a...
3,100001.0,Bullet04,Dimensions: 3 in. x 3 in. x 1-1/2 in.
4,100001.0,Bullet05,Made from 12-Gauge steel


In [7]:
# join the two datasets

df_with_label = df_attr[df_attr['name'] == 'Tools Product Type']
print('*'*50)
print('Count df with label',len(df_with_label))
df_with_label.head()

**************************************************
Count df with label 6169


Unnamed: 0,product_uid,name,value
846,100037.0,Tools Product Type,Hand Tool
879,100038.0,Tools Product Type,Power Tool
1715,100073.0,Tools Product Type,Tool Storage
2322,100093.0,Tools Product Type,Power Tool
2526,100099.0,Tools Product Type,Power Tool


In [8]:
# Join the 2 dataset to create training data

joined = pd.merge(df, df_with_label, on='product_uid', how='inner')[['search_term','value']]
print('Dataset size: ',len(joined))
len(joined.value.unique())

lb_make = LabelEncoder()
joined["cat"] = lb_make.fit_transform(joined["value"])
joined.head()

Dataset size:  3991


Unnamed: 0,search_term,value,cat
0,husky tool bag,Hand Tool,5
1,impact driver drill battery powered,Power Tool,8
2,impact wrench,Power Tool,8
3,milwaukee right angle,Power Tool,8
4,milwaukee stone hole drill,Power Tool,8


In [9]:
# Creating list of searches, ngrams and labels

num_of_ngrams = []
search_list = []
n_grams = []
labels = []
shuffled_df = shuffle(joined)

for i, row in shuffled_df.iterrows():
    search_term =row['search_term']
    ngrammed = word2ngrams(row['search_term'])
    
    if(i==1):
        print('Example of search query:',search_term)
        print('Example of ngrams: ',ngrammed)
    if (len(search_term) > 0):
        n_grams.append(ngrammed)
        num_of_ngrams.append(len(ngrammed))
        search_list.append(search_term)
        labels.append(row['cat'])


Example of search query: impact driver drill battery powered
Example of ngrams:  ['impa', 'mpac', 'pact', 'act ', 'ct d', 't dr', ' dri', 'driv', 'rive', 'iver', 'ver ', 'er d', 'r dr', ' dri', 'dril', 'rill', 'ill ', 'll b', 'l ba', ' bat', 'batt', 'atte', 'tter', 'tery', 'ery ', 'ry p', 'y po', ' pow', 'powe', 'ower', 'were', 'ered']


In [10]:
# Search list vectorization via tf-idf

vectorizer = TfidfVectorizer(stop_words='english')
# tokenize and build vocab
vectorizer.fit(search_list)
# summarize
#print(vectorizer.vocabulary_)
# encode document
vector = vectorizer.transform([search_list[0]])
# summarize encoded vector
print(search_list[0])
print(vector.shape)
print(vector.toarray())

pipe saver clamp
(1, 1210)
[[0. 0. 0. ... 0. 0. 0.]]


In [11]:
# Training and Testing split

M = vectorizer.transform(search_list).toarray()
train_len = int(len(labels) * 0.8)
X_train_tf = M[:train_len]
search_list_train = search_list[:train_len]
n_grams_train = n_grams[:train_len]
y_train_tf = labels[:train_len]
X_test_tf = M[train_len:]
search_list_test = search_list[train_len:]
n_grams_test = n_grams[train_len:]
y_test_tf = labels[train_len:]
print('Train size:',len(X_train_tf))
# print(len(y_train_tf))
print('Test size:',len(X_test_tf))
# print(len(y_test_tf))
print(X_train_tf[0])
print(y_train_tf[0])

Train size: 3192
Test size: 799
[0. 0. 0. ... 0. 0. 0.]
5


In [12]:
# Code to convert the data in a fasttext input format

fasttext = False
if (fasttext):
    lab = '__label__'
    joined['labs'] = lab + joined['cat'].astype(str)

    # msk = np.random.rand(len(joined)) < 0.9
    train = pd.DataFrame({'search_term':search_list_train,'labs':y_train_tf})
    train['labs'] = lab + joined['cat'].astype(str)
    test = pd.DataFrame({'search_term':search_list_test,'labs':y_test_tf})
    test['labs'] = lab + joined['cat'].astype(str)
    train[['search_term','labs']].to_csv('data/training_ft.tsv', sep='\t',index=False,header=False)
    test[['search_term','labs']].to_csv('data/test_ft.tsv', sep='\t',index=False,header=False)

    # ngrams
    n_grams_train_flattened =[]
    for ns in n_grams_train:
        n_grams_train_flattened.append(' '.join(ns))
    n_grams_test_flattened =[]
    for ns in n_grams_test:
        n_grams_test_flattened.append(' '.join(ns))

    train_ngrams = pd.DataFrame({'ngrams':n_grams_train_flattened,'labs':y_train_tf})
    train_ngrams['labs'] = lab + joined['cat'].astype(str)
    test_ngrams = pd.DataFrame({'ngrams':n_grams_test_flattened,'labs':y_test_tf})
    test_ngrams['labs'] = lab + joined['cat'].astype(str)
    train_ngrams[['ngrams','labs']].to_csv('data/training_ngrams_ft.tsv', sep='\t',index=False,header=False)
    test_ngrams[['ngrams','labs']].to_csv('data/test_ngrams_ft.tsv', sep='\t',index=False,header=False)

    print(len(train))
    print(len(test))

In [13]:
# Fit the data in 3 different models and compare results:
# 1-SVM
# 2-NB
# 3-RF


# print(filtered_labels_train[0])
clf = SGDClassifier().fit(X=X_train_tf,y=y_train_tf)
predicted_sgdc = clf.predict(X_test_tf)
# print(predicted_sgdc)
print('SGDC Accuracy: %.2f %%'%round(np.mean(predicted_sgdc == y_test_tf)*100,2))

nb = MultinomialNB().fit(X_train_tf, y_train_tf)
predicted = nb.predict(X_test_tf)
# print(predicted)
print('NB Accuracy: %.2f %%'%round(np.mean(predicted == y_test_tf)*100,2))

# print(filtered_labels_train[0])
# clf = RandomForestClassifier(n_estimators=500,class_weight='balanced').fit(X=X_train_tf,y=y_train_tf)
clf = RandomForestClassifier(n_estimators=500).fit(X=X_train_tf,y=y_train_tf)
random_pred= clf.predict(X_test_tf)
# print(predicted_sgdc)
print('RF Accuracy: %.2f %%'%round(np.mean(random_pred== y_test_tf)*100,2))

SGDC Accuracy: 85.48 %
NB Accuracy: 82.23 %
RF Accuracy: 84.61 %


In [14]:
# load pretrained GloVe vectors

gloveFile = 'data/glove.6B/glove.6B.50d.txt'
filename = 'data/glove.6B.100d.txt.word2vec'
model_gensim = KeyedVectors.load_word2vec_format(filename, binary=False)

f = open(gloveFile,'r')
model = {}
for line in f:
    splitLine = line.split()
    word = splitLine[0]
    embedding = np.array([float(val) for val in splitLine[1:]])
    model[word] = embedding    

In [15]:
# king - man + woman

result = model_gensim.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)
print('*'*10)

chosen_word = 'radiohead'
print(model_gensim.most_similar(chosen_word)[:1])

[('queen', 0.8523603677749634)]
**********
[('r.e.m.', 0.8340362906455994)]


In [16]:
# average GloVe 

vecs_train = [] 
labs_train = []
for s,lb in zip(search_list[:train_len],labels[:train_len]):
    t = []
    pnt = 0
    for w in s.split():
        try:
            t.append(model[w.lower()])
             
        except KeyError as e:
            continue
    try:
        len(np.average(t,axis=0))
        vecs_train.append(np.average(t,axis=0))
        labs_train.append(lb)        
    except:
        print(s) 

# Training and Testing split

X_train = np.array(vecs_train)
y_train = np.array(labs_train)


vecs_test = [] 
labs_test = []
for s,lb in zip(search_list[train_len:],labels[train_len:]):
    t = []
    pnt = 0
    for w in s.split():
        try:
            t.append(model[w.lower()])
             
        except KeyError as e:
            continue
    try:
        len(np.average(t,axis=0))
        vecs_test.append(np.average(t,axis=0))
        labs_test.append(lb)        
    except:
        print(s) 

X_test = np.array(vecs_test)
y_test = np.array(labs_test)

sodpstone
bateries
ridgid multitool
aspiradora
roybi l18v
roybi l18v
handtools
handtools
rebarbender
sawall
drumel
roybi l18v
rebarbender
come-along
handtools
taladros
roybi l18v
taladros
respine
roybi l18v
ni-2.4v
upholstry
roybi l18v
upholstry
drumel
bernzomatic
carrrs
roybi l18v
hagchet
drils
tji
hagchet
drils
roybi l18v
taladros
sawall
inclinometer
sandpap
roybi l18v
insallation
susbenders
drils
sawall
drils
bernzomatic
handtools
bernzomatic
roybi l18v
handtools
handtools
bernzomatic
bateries
drils
taladros
handtools
bateries
ni-2.4v
tji
handtools
handtools
phillits


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


In [17]:
# Same models with GloVe

clf = SGDClassifier().fit(X=X_train,y=y_train)
predicted_sgdc = clf.predict(X_test)
# print(predicted_sgdc)
print('SGDC Accuracy: %.2f %%'%round(np.mean(predicted_sgdc == y_test)*100,2))

scaler = MinMaxScaler(feature_range=(0,1))
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
scaler.fit(X_test)
X_test_scaled = scaler.transform(X_test)

nb = MultinomialNB().fit(X_train_scaled, y_train)
predicted_nb = nb.predict(X_test_scaled)
print('NB Accuracy: %.2f %%'%round(np.mean(predicted_nb == y_test)*100,2))


# print(filtered_labels_train[0])
# clf = RandomForestClassifier(n_estimators=500,class_weight='balanced').fit(X=X_train,y=y_train)
clf = RandomForestClassifier(n_estimators=500).fit(X=X_train,y=y_train)
random_pred= clf.predict(X_test)
print('RF Accuracy: %.2f %%'%round(np.mean(random_pred== y_test)*100,2))


SGDC Accuracy: 56.01 %
NB Accuracy: 64.71 %
RF Accuracy: 83.76 %


In [18]:
# reshape data in 3 dimensions (len, 10, 50)

vecs_nn = [] 
labs_nn = []
empty = np.zeros(50)
max_len = 10
for s,lb in zip(search_list,labels):
    t = []
    pnt = 0
    for w in s.split():
        try:
            #print(w)
            t.append(model[w.lower()])
            #vec = model[w]
             
        except KeyError as e:
#            print(str(e) + ' not found')
            continue
    for f in range(0,max_len - len(t)):
        t.append(empty)
#    print(t)
    vecs_nn.append(np.array(t))
    labs_nn.append(lb)

X = np.array(vecs_nn)
print(X.shape)
Y = np.array(labs_nn)
print(Y.shape)
num_categories = len(np.unique(labs_nn))
Y_labs = keras.utils.to_categorical(Y,num_classes = num_categories)
print(Y_labs.shape)

(3991, 10, 50)
(3991,)
(3991, 12)


In [19]:
# LSTM + CNN

def create_lstm_conv_model():
    model_conv = Sequential()
    model_conv.add(InputLayer(input_shape=(10,50)))
    model_conv.add(LSTM(100,dropout=0.1))
    print()
    model_conv.add(Reshape((5,4,5,)))
    model_conv.add(Conv2D(4, 2, activation='relu'))
    model_conv.add(MaxPooling2D(pool_size=2))    
#    model_conv.add(Dense(num_categories, activation='relu'))
    model_conv.add(Flatten())
    model_conv.add(Dense(num_categories, activation='softmax'))
    model_conv.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
    return model_conv
                   
model_conv = create_lstm_conv_model()
print(model_conv.summary())
model_conv.fit(X, Y_labs, validation_split=0.2, epochs = 10,shuffle=True)



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10, 50)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               60400     
_________________________________________________________________
reshape_1 (Reshape)          (None, 5, 4, 5)           0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 4, 3, 4)           84        
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 2, 1, 4)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 8)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 12)                108       
Total par

<keras.callbacks.History at 0x1a2b22ae80>

In [20]:
# CNN + LSTM

def create_conv_lstm_model():
    model_conv = Sequential()
    model_conv.add(Dense(500, activation='relu', input_shape=(10,50)))                
    model_conv.add(Dropout(0.2))
    model_conv.add(Conv1D(100, 5, activation='relu'))
    model_conv.add(MaxPooling1D(pool_size=5))
    model_conv.add(LSTM(100))
    model_conv.add(Dense(num_categories, activation='softmax'))
    model_conv.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
    return model_conv
model_conv = create_conv_lstm_model()
print(model_conv.summary())
model_conv.fit(X, Y_labs, validation_split=0.2, epochs = 10,shuffle=True)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 10, 500)           25500     
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 500)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 6, 100)            250100    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1, 100)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_3 (Dense)              (None, 12)                1212      
Total params: 357,212
Trainable params: 357,212
Non-trainable params: 0
_________________________________________________________________
None

<keras.callbacks.History at 0x1a4d575cc0>

In [21]:
# NOT USING PRETRAINED VECTORS LSTM

Y_labs_tf = np.array(y_train_tf)
Y_labs_onehot_tf = keras.utils.np_utils.to_categorical(Y_labs_tf)
input_shape_c = X_train[0].shape

def create_conv_model():
    model_conv = Sequential()
    model_conv.add(InputLayer(input_shape=(1,1210)))
    
    model_conv.add(LSTM(100,dropout=0.1))
    
    model_conv.add(Dense(num_categories, activation='softmax'))
    model_conv.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
    return model_conv
                   
model_conv = create_conv_model()
print(model_conv.summary())
model_conv.fit(X_train_tf.reshape(len(X_train_tf),1,1210),Y_labs_onehot_tf, validation_split=0.2, epochs = 10,shuffle=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 1, 1210)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               524400    
_________________________________________________________________
dense_4 (Dense)              (None, 12)                1212      
Total params: 525,612
Trainable params: 525,612
Non-trainable params: 0
_________________________________________________________________
None
Train on 2553 samples, validate on 639 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a42ca59e8>

In [22]:
# NOT USING PRETRAINED VECTORS CNN-LSTM

Y_labs_tf = np.array(y_train_tf)
Y_labs_onehot_tf = keras.utils.np_utils.to_categorical(Y_labs_tf)
input_shape_c = X_train[0].shape

def create_conv_lstm_model():
    model_conv = Sequential()
    model_conv.add(Dense(500, activation='relu', input_shape=(1,1210)))
    model_conv.add(Reshape((50, 10)))
    model_conv.add(Dropout(0.2))
    model_conv.add(Conv1D(1000, 2, activation='relu'))
    model_conv.add(MaxPooling1D(pool_size=2))
    model_conv.add(LSTM(100))
    model_conv.add(Dense(num_categories, activation='softmax'))
    model_conv.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
    return model_conv
model_conv = create_conv_lstm_model()
print(model_conv.summary())
model_conv.fit(X_train_tf.reshape(len(X_train_tf),1,1210),Y_labs_onehot_tf, validation_split=0.2, epochs = 10,shuffle=True)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 1, 500)            605500    
_________________________________________________________________
reshape_2 (Reshape)          (None, 50, 10)            0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 50, 10)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 49, 1000)          21000     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 24, 1000)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               440400    
_________________________________________________________________
dense_6 (Dense)              (None, 12)                1212      
Total para

<keras.callbacks.History at 0x1a432f5ba8>