# Final Project Group: Amy Edwards, William Chirciu
# CSC 575 - Online 801
# March 17,2019

In [2]:
# import the packages needed
import re, math, string
from collections import Counter
import pandas as pd
import nltk,csv
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize
from nltk import FreqDist
from nltk.corpus import stopwords, wordnet
from gensim.scripts.glove2word2vec import glove2word2vec
read_in_csim = 0    # 1 if want to read in saved cosine similarities w/o rerunning algorithms



In [81]:
# Kaggle training and testing splits: tab-separated, column names come from the first row.
train, test = (
    pd.read_csv(path, sep='\t', header=0)
    for path in ('train_new.csv', 'test_new.csv')
)

### Read in Document Indexes     (DocId -> raw term frequency)

In [230]:
# Per-field document indexes: tab-separated files without a header row,
# one (product_uid, raw_tf) record per line.
title_doc_index = pd.read_csv(
    'title_doc_index_revised.csv',
    sep='\t', header=None, names=['product_uid', 'raw_tf'],
)
description_doc_index = pd.read_csv(
    'description_doc_index_revised.csv',
    sep='\t', header=None, names=['product_uid', 'raw_tf'],
)
attribute_doc_index = pd.read_csv(
    'attribute_doc_index_revised.csv',
    sep='\t', header=None, names=['product_uid', 'raw_tf'],
)

In [231]:
# product_uid -> raw_tf lookup tables, one per text field (title, description, attributes).
title_dl = {rec['product_uid']: rec['raw_tf'] for _, rec in title_doc_index.iterrows()}
desc_dl = {rec['product_uid']: rec['raw_tf'] for _, rec in description_doc_index.iterrows()}
att_dl = {rec['product_uid']: rec['raw_tf'] for _, rec in attribute_doc_index.iterrows()}

### Read in Document tfxidf Vector Lengths       (DocID -> tfxIDF vector lengths)

In [232]:
# Per-field tf x idf vector lengths: tab-separated files without a header row,
# one (product_uid, tfxidf) record per line.
title_tfidf = pd.read_csv(
    'title_tfidf_revised.csv',
    sep='\t', header=None, names=['product_uid', 'tfxidf'],
)
description_tfidf = pd.read_csv(
    'description_tfidf_revised.csv',
    sep='\t', header=None, names=['product_uid', 'tfxidf'],
)
attribute_tfidf = pd.read_csv(
    'attribute_tfidf_revised.csv',
    sep='\t', header=None, names=['product_uid', 'tfxidf'],
)

In [233]:
# product_uid -> tf x idf vector length, one lookup table per text field.
T_tfidf = {rec['product_uid']: rec['tfxidf'] for _, rec in title_tfidf.iterrows()}          # titles
D_tfidf = {rec['product_uid']: rec['tfxidf'] for _, rec in description_tfidf.iterrows()}    # descriptions
A_tfidf = {rec['product_uid']: rec['tfxidf'] for _, rec in attribute_tfidf.iterrows()}      # attributes

### Read in Inverted Indexes

In [234]:
# Inverted indexes: tab-separated files without a header row,
# one (Term, IDF, Postings) record per line.
invindex_title = pd.read_csv(
    'product_title_inverted_index_revised.csv',
    sep='\t', header=None, names=['Term', 'IDF', 'Postings'],
)
invindex_desc = pd.read_csv(
    'product_description_inverted_index_revised.csv',
    sep='\t', header=None, names=['Term', 'IDF', 'Postings'],
)
invindex_att = pd.read_csv(
    'product_attribute_inverted_index_revised.csv',
    sep='\t', header=None, names=['Term', 'IDF', 'Postings'],
)

In [235]:
# Title inverted index: term -> (idf, {product_uid: term frequency}).
# The Postings column holds text of the form "{uid: tf, uid: tf, ...}".
T_invindex = {}
for _, record in invindex_title.iterrows():
    doc_tf = {}
    for entry in record['Postings'].strip('{}').split(','):
        entry = entry.strip()
        if not entry:
            # An empty postings field ("{}") yields one empty entry; skip it
            # instead of failing on int('').
            continue
        fields = entry.split(': ')
        doc_tf[int(fields[0])] = int(fields[1])
    T_invindex[record['Term']] = (record['IDF'], doc_tf)

In [236]:
# Description inverted index: term -> (idf, {product_uid: term frequency}).
# The Postings column holds text of the form "{uid: tf, uid: tf, ...}".
D_invindex = {}
for _, record in invindex_desc.iterrows():
    raw_postings = record['Postings'].strip('{').strip('}').strip()
    doc_tf = {}
    for pair in raw_postings.split(','):
        fields = pair.split(': ')
        # int() tolerates the blank left in front of each uid after splitting on ','.
        doc_tf[int(fields[0])] = int(fields[1])
    D_invindex[record['Term']] = (record['IDF'], doc_tf)

In [237]:
# Attribute inverted index: term -> (idf, {product_uid: term frequency}).
# The Postings column holds text of the form "{uid: tf, uid: tf, ...}".
A_invindex = {}
for _, record in invindex_att.iterrows():
    raw_postings = record['Postings'].strip('{').strip('}').strip()
    doc_tf = {}
    for pair in raw_postings.split(','):
        fields = pair.split(': ')
        # int() tolerates the blank left in front of each uid after splitting on ','.
        doc_tf[int(fields[0])] = int(fields[1])
    A_invindex[record['Term']] = (record['IDF'], doc_tf)

### String Formatting

Credit to: S. Li, “str_stem,” GitHubGist, Oct-2018. [Online]. Available: https://gist.github.com/susanli2016/b83d148de7394821509bd5172d2c96d3

In [204]:
# Porter stemmer shared by every str_stem call.
stemmer = nltk.PorterStemmer()
def str_stem(s): 
    """Normalize a raw text field and return it as a string of stemmed words.

    Unit expressions that follow a number (inches, feet, pounds, ...) are
    rewritten to a canonical abbreviation, a fixed list of substrings is
    replaced, and finally the text is lower-cased, split on whitespace, each
    word has every character outside ``A-Za-z0-9-./`` turned into a space and
    is passed through the module-level Porter ``stemmer``.

    Args:
        s: the text to normalize.

    Returns:
        The normalized words joined by single spaces, or the literal string
        "null" when ``s`` is not a ``str``.
    """
    if isinstance(s, str):
        # Close up blanks around a decimal point between digits ("1 . 5" -> "1.5").
        s = re.sub(r"([0-9])( *)\.( *)([0-9])", r"\1.\4", s)
        # Canonicalize units that follow a number. Each substitution works on the
        # output of the previous one, so the order of these lines matters.
        s = re.sub(r"([0-9]+)( *)(inches|inch|in|')\.?", r"\1in. ", s)
        s = re.sub(r"([0-9]+)( *)(foot|feet|ft|'')\.?", r"\1ft. ", s)
        s = re.sub(r"([0-9]+)( *)(pounds|pound|lbs|lb)\.?", r"\1lb. ", s)
        s = re.sub(r"([0-9]+)( *)(square|sq) ?\.?(feet|foot|ft)\.?", r"\1sq.ft. ", s)
        s = re.sub(r"([0-9]+)( *)(cubic|cu) ?\.?(feet|foot|ft)\.?", r"\1cu.ft. ", s)
        s = re.sub(r"([0-9]+)( *)(gallons|gallon|gal)\.?", r"\1gal. ", s)
        s = re.sub(r"([0-9]+)( *)(ounces|ounce|oz)\.?", r"\1oz. ", s)
        s = re.sub(r"([0-9]+)( *)(centimeters|cm)\.?", r"\1cm. ", s)
        s = re.sub(r"([0-9]+)( *)(milimeters|mm)\.?", r"\1mm. ", s)
        s = re.sub(r"([0-9]+)( *)(°|degrees|degree)\.?", r"\1 deg. ", s)
        # NOTE(review): the alternation tries "v" first, so for "12 volts" only
        # "12 v" is consumed and the result is "12 volt. olts" - confirm intent.
        s = re.sub(r"([0-9]+)( *)(v|volts|volt)\.?", r"\1 volt. ", s)
        s = re.sub(r"([0-9]+)( *)(wattage|watts|watt)\.?", r"\1 watt. ", s)
        s = re.sub(r"([0-9]+)( *)(amperes|ampere|amps|amp)\.?", r"\1 amp. ", s)
        s = re.sub(r"([0-9]+)( *)(qquart|quart)\.?", r"\1 qt. ", s)
        # NOTE(review): the "." in "hrs." is unescaped, so it matches any character.
        s = re.sub(r"([0-9]+)( *)(hours|hour|hrs.)\.?", r"\1 hr ", s)
        # NOTE(review): the gallon rule above has already rewritten "<n> gallons" /
        # "<n> gal" by this point, so these two rate rules look unreachable - verify.
        s = re.sub(r"([0-9]+)( *)(gallons per minute|gallon per minute|gal per minute|gallons/min.|gallons/min)\.?", r"\1 gal. per min. ", s)
        s = re.sub(r"([0-9]+)( *)(gallons per hour|gallon per hour|gal per hour|gallons/hour|gallons/hr)\.?", r"\1 gal. per hr ", s)
        # Deal with special characters (literal substring replacements)
        s = s.replace("$"," ")
        s = s.replace("?"," ")
        s = s.replace("..."," ")
        s = s.replace(".."," ")
        s = s.replace("&nbsp;"," ")
        s = s.replace("&amp;","&")
        s = s.replace("&#39;","'")
        s = s.replace("/>/Agt/>","")
        s = s.replace("</a<gt/","")
        s = s.replace("gt/>","")
        s = s.replace("/>","")
        s = s.replace("<br","")
        # NOTE(review): the next five arguments are written as regular expressions,
        # but str.replace matches its argument literally, so they only fire when the
        # text contains the pattern verbatim. Presumably re.sub was intended; the
        # saved index files may have been built with the current behavior - confirm
        # before changing.
        s = s.replace("<.+?>","")
        s = s.replace("[ &<>)(_,;:!?\+^~@#\$]+"," ")
        s = s.replace("'s\\b","")
        s = s.replace("[']+","")
        s = s.replace("[\"]+","")
        # Hyphens and plus signs become word separators.
        s = s.replace("-"," ")
        s = s.replace("+"," ")
        # Remove text between parentheses/brackets
        # NOTE(review): regex syntax passed to str.replace again (literal match only).
        s = s.replace("[ ]?[[(].+?[])]","")
        # remove sizes
        # NOTE(review): regex syntax passed to str.replace again (literal match only).
        s = s.replace("size: .+$","")
        s = s.replace("size [0-9]+[.]?[0-9]+\\b","")
        
        
        # Lower-case, split on whitespace, blank out every character outside
        # A-Za-z0-9-./ in each word, stem the word, and re-join with spaces.
        return " ".join([stemmer.stem(re.sub('[^A-Za-z0-9-./]', ' ', word)) for word in s.lower().split()])
    else:
        # Non-string input (for example a missing value) maps to a fixed placeholder.
        return "null"

### Function that tokenizes a string

In [205]:
def tokenize(str1):
    """Return the stemmed, stop-word-free tokens of ``str1``.

    The text is normalized with ``str_stem``, split with NLTK's
    ``word_tokenize``, and every token that is an English stop word or a bare
    '.' or '/' is dropped.

    Args:
        str1: the text to tokenize; anything ``str_stem`` accepts.

    Returns:
        A list of token strings in their original order.
    """
    # Load the stop-word list once per call and use a set for the membership
    # test; the previous version re-evaluated stopwords.words() for every token.
    ignored = set(stopwords.words('english')) | {'.', '/'}
    return [w for w in word_tokenize(str_stem(str1)) if w not in ignored]

### Function that returns a set of synonyms,hypernyms, and hyponyms for a given term

In [164]:
def synonyms(str1):
    """Return the words related to ``str1`` according to WordNet.

    For every synset of ``str1`` this collects the synset's lemma names plus
    the names of its direct hypernyms and hyponyms. Multi-word names
    ("power_tool") are split on '_' into single words.

    Args:
        str1: the term to look up.

    Returns:
        A sorted list of unique words; empty when WordNet has no synset for
        ``str1``. Sorting keeps the result identical from run to run, which
        matters to callers that stop at the first matching word.
    """
    vocab = set()
    for syn in wordnet.synsets(str1):
        for lemma in syn.lemmas():
            vocab.update(lemma.name().split('_'))
        for related in syn.hypernyms() + syn.hyponyms():
            # Synset names have the form "power_tool.n.01". Drop the trailing
            # ".pos.nn" so the last word is "tool" rather than "tool.n.01",
            # which could never match an index term.
            head = related.name().rsplit('.', 2)[0]
            vocab.update(head.split('_'))
    return sorted(vocab)

### Product Title Cosine Similarities on Train Data

In [239]:
cs_list_title = []                    # cosine similarity of every train query against its product title
porter = nltk.PorterStemmer()
english_stopwords = set(stopwords.words('english'))   # loaded once instead of once per candidate word

# Cosine similarities between query and product title for the train data
for index, row in train.iterrows():
    query = row['search_term']
    doc = row['product_uid']
    weights = []                        # query-side weight of every matched term
    score = 0.0                         # running dot product of query and title vectors
    for word in query.split():
        # The query word plus its WordNet relatives; every candidate whose stem
        # occurs in this product's title contributes to the score.
        # NOTE(review): candidates that share a stem are counted more than once.
        matched = []
        for candidate in [word] + synonyms(word):
            if candidate in english_stopwords:
                continue
            tokens = tokenize(candidate)
            if tokens and doc in T_invindex.get(tokens[0], (None, {}))[1]:
                matched.append(tokens[0])
        query_tf = query.count(word)    # occurrences of the query word in the query (substring count)
        for term in matched:
            idf, postings = T_invindex[term]
            weight = query_tf * idf     # weight of the term in the query
            weights.append(weight)
            score += weight * postings[doc] * idf   # postings[doc] is the tf of the term in the title

    if not weights:
        cs_list_title.append(0)
        continue
    query_len = math.sqrt(sum(w * w for w in weights))   # length of the query vector
    doc_len = T_tfidf[doc]                               # tf x idf length of the title vector
    if query_len == 0 or doc_len == 0:
        # Avoid dividing by zero when every matched term has idf 0.
        cs_list_title.append(0)
        continue
    cs_list_title.append(score / (query_len * doc_len))

### Product Title Cosine Similarities on Prediction Data

In [None]:
cs_list_title_test = []               # cosine similarity of every test query against its product title
porter = nltk.PorterStemmer()
english_stopwords = set(stopwords.words('english'))   # loaded once instead of once per candidate word

# Cosine similarities between query and product title for the test data
for index, row in test.iterrows():
    query = row['search_term']
    doc = row['product_uid']
    weights = []                        # query-side weight of every matched term
    score = 0.0                         # running dot product of query and title vectors
    for word in query.split():
        # The query word plus its WordNet relatives; every candidate whose stem
        # occurs in this product's title contributes to the score.
        # NOTE(review): candidates that share a stem are counted more than once.
        matched = []
        for candidate in [word] + synonyms(word):
            if candidate in english_stopwords:
                continue
            tokens = tokenize(candidate)
            if tokens and doc in T_invindex.get(tokens[0], (None, {}))[1]:
                matched.append(tokens[0])
        query_tf = query.count(word)    # occurrences of the query word in the query (substring count)
        for term in matched:
            idf, postings = T_invindex[term]
            weight = query_tf * idf     # weight of the term in the query
            weights.append(weight)
            score += weight * postings[doc] * idf   # postings[doc] is the tf of the term in the title

    if not weights:
        cs_list_title_test.append(0)
        continue
    query_len = math.sqrt(sum(w * w for w in weights))   # length of the query vector
    doc_len = T_tfidf[doc]                               # tf x idf length of the title vector
    if query_len == 0 or doc_len == 0:
        # Avoid dividing by zero when every matched term has idf 0.
        cs_list_title_test.append(0)
        continue
    cs_list_title_test.append(score / (query_len * doc_len))

### Product Description Cosine Similarities on Train Data

In [17]:
cs_list_description = []              # cosine similarity of every train query against its product description
porter = nltk.PorterStemmer()
english_stopwords = set(stopwords.words('english'))   # loaded once instead of once per candidate word

# Cosine similarities between query and product description for the train data
for index, row in train.iterrows():
    query = row['search_term']
    doc = row['product_uid']
    weights = []                        # query-side weight of every matched term
    score = 0.0                         # running dot product of query and description vectors
    for word in query.split():
        # Use the first candidate (the query word, then its WordNet relatives)
        # whose stem occurs in this product's description.
        term = None
        for candidate in [word] + synonyms(word):
            if candidate in english_stopwords:
                continue
            tokens = tokenize(candidate)
            if tokens and doc in D_invindex.get(tokens[0], (None, {}))[1]:
                term = tokens[0]
                break
        if term is None:
            continue
        idf, postings = D_invindex[term]
        weight = query.count(word) * idf    # query tf (substring count) times idf
        weights.append(weight)
        score += weight * postings[doc] * idf   # postings[doc] is the tf of the term in the description

    if not weights:
        cs_list_description.append(0)
        continue
    query_len = math.sqrt(sum(w * w for w in weights))   # length of the query vector
    # Normalize by the description vector length; the title lengths (T_tfidf)
    # were used here before. A product missing from D_tfidf is scored 0.
    doc_len = D_tfidf.get(doc, 0)
    if query_len == 0 or doc_len == 0:
        cs_list_description.append(0)
        continue
    cs_list_description.append(score / (query_len * doc_len))

### Product Description Cosine Similarities on Prediction Data

In [18]:
cs_list_description_test = []         # cosine similarity of every test query against its product description
porter = nltk.PorterStemmer()
english_stopwords = set(stopwords.words('english'))   # loaded once instead of once per candidate word

# Cosine similarities between query and product description for the test data
for index, row in test.iterrows():
    query = row['search_term']
    doc = row['product_uid']
    weights = []                        # query-side weight of every matched term
    score = 0.0                         # running dot product of query and description vectors
    for word in query.split():
        # Use the first candidate (the query word, then its WordNet relatives)
        # whose stem occurs in this product's description.
        term = None
        for candidate in [word] + synonyms(word):
            if candidate in english_stopwords:
                continue
            tokens = tokenize(candidate)
            if tokens and doc in D_invindex.get(tokens[0], (None, {}))[1]:
                term = tokens[0]
                break
        if term is None:
            continue
        idf, postings = D_invindex[term]
        weight = query.count(word) * idf    # query tf (substring count) times idf
        weights.append(weight)
        score += weight * postings[doc] * idf   # postings[doc] is the tf of the term in the description

    if not weights:
        cs_list_description_test.append(0)
        continue
    query_len = math.sqrt(sum(w * w for w in weights))   # length of the query vector
    # Normalize by the description vector length; the title lengths (T_tfidf)
    # were used here before. A product missing from D_tfidf is scored 0.
    doc_len = D_tfidf.get(doc, 0)
    if query_len == 0 or doc_len == 0:
        cs_list_description_test.append(0)
        continue
    cs_list_description_test.append(score / (query_len * doc_len))

### Product Attribute Cosine Similarities on Train Data

In [19]:
cs_list_attribute = []                # cosine similarity of every train query against its product attributes
porter = nltk.PorterStemmer()
english_stopwords = set(stopwords.words('english'))   # loaded once instead of once per candidate word

# Cosine similarities between query and product attributes for the train data
for index, row in train.iterrows():
    query = row['search_term']
    doc = row['product_uid']
    weights = []                        # query-side weight of every matched term
    score = 0.0                         # running dot product of query and attribute vectors
    for word in query.split():
        # Use the first candidate (the query word, then its WordNet relatives)
        # whose stem occurs in this product's attributes. Only the attribute
        # index is consulted; the description index was also required before,
        # which hid terms that occur in attributes only.
        term = None
        for candidate in [word] + synonyms(word):
            if candidate in english_stopwords:
                continue
            tokens = tokenize(candidate)
            if tokens and doc in A_invindex.get(tokens[0], (None, {}))[1]:
                term = tokens[0]
                break
        if term is None:
            continue
        idf, postings = A_invindex[term]
        weight = query.count(word) * idf    # query tf (substring count) times idf
        weights.append(weight)
        score += weight * postings[doc] * idf   # postings[doc] is the tf of the term in the attributes

    if not weights:
        cs_list_attribute.append(0)
        continue
    query_len = math.sqrt(sum(w * w for w in weights))   # length of the query vector
    # Normalize by the attribute vector length; the title lengths (T_tfidf)
    # were used here before. A product missing from A_tfidf is scored 0.
    doc_len = A_tfidf.get(doc, 0)
    if query_len == 0 or doc_len == 0:
        cs_list_attribute.append(0)
        continue
    cs_list_attribute.append(score / (query_len * doc_len))

### Product Attribute Cosine Similarities on Prediction Data

In [20]:
cs_list_attribute_test = []           # cosine similarity of every test query against its product attributes
porter = nltk.PorterStemmer()
english_stopwords = set(stopwords.words('english'))   # loaded once instead of once per candidate word

# Cosine similarities between query and product attributes for the test data
for index, row in test.iterrows():
    query = row['search_term']
    doc = row['product_uid']
    weights = []                        # query-side weight of every matched term
    score = 0.0                         # running dot product of query and attribute vectors
    for word in query.split():
        # Use the first candidate (the query word, then its WordNet relatives)
        # whose stem occurs in this product's attributes. Only the attribute
        # index is consulted; the description index was also required before,
        # which hid terms that occur in attributes only.
        term = None
        for candidate in [word] + synonyms(word):
            if candidate in english_stopwords:
                continue
            tokens = tokenize(candidate)
            if tokens and doc in A_invindex.get(tokens[0], (None, {}))[1]:
                term = tokens[0]
                break
        if term is None:
            continue
        idf, postings = A_invindex[term]
        weight = query.count(word) * idf    # query tf (substring count) times idf
        weights.append(weight)
        score += weight * postings[doc] * idf   # postings[doc] is the tf of the term in the attributes

    if not weights:
        cs_list_attribute_test.append(0)
        continue
    query_len = math.sqrt(sum(w * w for w in weights))   # length of the query vector
    # Normalize by the attribute vector length; the title lengths (T_tfidf)
    # were used here before. A product missing from A_tfidf is scored 0.
    doc_len = A_tfidf.get(doc, 0)
    if query_len == 0 or doc_len == 0:
        cs_list_attribute_test.append(0)
        continue
    cs_list_attribute_test.append(score / (query_len * doc_len))

### Write Cosine Similarities to csv

Write the similarities to csv, so that the entire code doesn't have to be run every time. These take a long time to make.

In [None]:
# Save every similarity list to its own file, one value per row.
similarity_outputs = (
    ('cs_list_title.csv', cs_list_title),
    ('cs_list_title_test.csv', cs_list_title_test),
    ('cs_list_description.csv', cs_list_description),
    ('cs_list_description_test.csv', cs_list_description_test),
    ('cs_list_attribute.csv', cs_list_attribute),
    ('cs_list_attribute_test.csv', cs_list_attribute_test),
)
for path, similarities in similarity_outputs:
    # newline='' leaves line endings to the csv module; without it every row
    # is followed by a blank line on Windows.
    with open(path, 'w', newline='') as csv_file:
        csvwriter = csv.writer(csv_file, delimiter='\t')
        csvwriter.writerows([s] for s in similarities)

In [None]:
#### Reload the cosine similarities saved to csv, so they are available without rerunning the scoring algorithms.
if read_in_csim == 1:
    df1, df2, df3, df4, df5, df6 = (
        pd.read_csv(path, delimiter='\t', header=None, names=['sim'])
        for path in (
            'cs_list_title.csv',
            'cs_list_title_test.csv',
            'cs_list_description.csv',
            'cs_list_description_test.csv',
            'cs_list_attribute.csv',
            'cs_list_attribute_test.csv',
        )
    )

    # One similarity per row, kept in file order.
    cs_list_title = [rec['sim'] for _, rec in df1.iterrows()]
    cs_list_title_test = [rec['sim'] for _, rec in df2.iterrows()]
    cs_list_description = [rec['sim'] for _, rec in df3.iterrows()]
    cs_list_description_test = [rec['sim'] for _, rec in df4.iterrows()]
    cs_list_attribute = [rec['sim'] for _, rec in df5.iterrows()]
    cs_list_attribute_test = [rec['sim'] for _, rec in df6.iterrows()]

In [None]:
# Convert each cosine similarity list into a single-column pandas DataFrame.
df1, df2, df3, df4, df5, df6 = map(
    pd.DataFrame,
    (
        cs_list_title,
        cs_list_title_test,
        cs_list_description,
        cs_list_description_test,
        cs_list_attribute,
        cs_list_attribute_test,
    ),
)

In [258]:
# Training data: append the title/description/attribute cosine columns, then name every column.
new_train = pd.concat([train, df1, df3, df5], ignore_index=True, axis=1)
new_train.columns = [
    'id', 'product_uid', 'product_title', 'search_term', 'relevance',
    'cosine_title', 'cosine_description', 'cosine_attribute',
]

# Test data: same three cosine columns; there is no 'relevance' column here.
new_test = pd.concat([test, df2, df4, df6], ignore_index=True, axis=1)
new_test.columns = [
    'id', 'product_uid', 'product_title', 'search_term',
    'cosine_title', 'cosine_description', 'cosine_attribute',
]

### Computing Length Features

In [259]:
# Read in the descriptions file; malformed lines are skipped with a warning.
description_read_options = dict(delimiter='\t', header=0, quoting=csv.QUOTE_NONE)
try:
    # pandas >= 1.3 spelling; 'warn' matches the old error_bad_lines=False behavior.
    descriptions = pd.read_csv('product_descriptions_new.csv', on_bad_lines='warn',
                               **description_read_options)
except TypeError:
    # Older pandas does not know on_bad_lines; newer pandas (2.0+) no longer
    # accepts error_bad_lines, which is why it is only used as the fallback.
    descriptions = pd.read_csv('product_descriptions_new.csv', error_bad_lines=False,
                               **description_read_options)
descriptions.columns = ['product_uid','product_description']

# Merge the two pandas datasets on the product uid.
# Left merge because if there is not a description, we still want to keep the product
new_train = new_train.merge(descriptions, on = 'product_uid', how = 'left')
new_test = new_test.merge(descriptions, on = 'product_uid', how = 'left')

In [46]:
# Count the terms of query, title and description for the train and test data.
def _count_alpha_terms(text):
    """Return how many runs of letters/underscores ``text`` contains (0 for non-strings).

    Products without a description carry a missing value after the left merge;
    those count as zero terms instead of raising inside ``re.findall``.
    """
    if not isinstance(text, str):
        return 0
    return len(re.findall("[a-zA-Z_]+", text))


query_trm_ct = [_count_alpha_terms(text) for text in new_train['search_term']]
query_trm_ct_test = [_count_alpha_terms(text) for text in new_test['search_term']]

title_trm_ct = [_count_alpha_terms(text) for text in new_train['product_title']]
title_trm_ct_test = [_count_alpha_terms(text) for text in new_test['product_title']]

description_trm_ct = [_count_alpha_terms(text) for text in new_train['product_description']]
description_trm_ct_test = [_count_alpha_terms(text) for text in new_test['product_description']]

In [260]:
# Append the query, title and description term counts (in that order) as new trailing columns.
new_train = pd.concat(
    [new_train]
    + [pd.DataFrame(counts) for counts in (query_trm_ct, title_trm_ct, description_trm_ct)],
    ignore_index=True, axis=1,
)
new_test = pd.concat(
    [new_test]
    + [pd.DataFrame(counts) for counts in (query_trm_ct_test, title_trm_ct_test, description_trm_ct_test)],
    ignore_index=True, axis=1,
)

# Restore readable column names (ignore_index replaced them with integers).
new_train.columns = [
    'id', 'product_uid', 'product_title', 'search_term', 'relevance',
    'cosine_title', 'cosine_description', 'cosine_attribute',
    'product_description', 'query_ct', 'title_ct', 'desc_ct',
]
new_test.columns = [
    'id', 'product_uid', 'product_title', 'search_term',
    'cosine_title', 'cosine_description', 'cosine_attribute',
    'product_description', 'query_ct', 'title_ct', 'desc_ct',
]


#### Creates list of binary values. 1 if entire query appears exactly as is in either Description or Title, 0 otherwise

In [47]:
def contains_word(s, w):
    """Return True when ``w`` occurs in ``s`` delimited by spaces or the ends of ``s``.

    Both arguments are formatted to text and padded with one blank on each
    side, so ``w`` only matches whole space-separated words or phrases.
    """
    padded_needle = " {} ".format(w)
    padded_text = " {} ".format(s)
    return padded_needle in padded_text

# 1 when the whole query occurs as a phrase in the description or the title, else 0.
true_list = [
    int(contains_word(description, query) or contains_word(title, query))
    for query, description, title in zip(
        new_train['search_term'], new_train['product_description'], new_train['product_title']
    )
]

true_list_test = [
    int(contains_word(description, query) or contains_word(title, query))
    for query, description, title in zip(
        new_test['search_term'], new_test['product_description'], new_test['product_title']
    )
]

### Computing Ratio Features

In [214]:
# Ratio of title token count to query token count; 0 when the query has no tokens.
title_query_ratio_train = []
for title, query in zip(new_train['product_title'], new_train['search_term']):
    n_title = len(tokenize(title))
    n_query = len(tokenize(query))
    title_query_ratio_train.append(n_title / n_query if n_query != 0 else 0)

title_query_ratio_test = []
for title, query in zip(new_test['product_title'], new_test['search_term']):
    n_title = len(tokenize(title))
    n_query = len(tokenize(query))
    title_query_ratio_test.append(n_title / n_query if n_query != 0 else 0)

In [216]:
# Ratio of description token count to query token count; 0 when the query has no tokens.
desc_query_ratio_train = []
for desc, query in zip(new_train['product_description'], new_train['search_term']):
    n_desc = len(tokenize(desc))
    n_query = len(tokenize(query))
    desc_query_ratio_train.append(n_desc / n_query if n_query != 0 else 0)

desc_query_ratio_test = []
for desc, query in zip(new_test['product_description'], new_test['search_term']):
    n_desc = len(tokenize(desc))
    n_query = len(tokenize(query))
    desc_query_ratio_test.append(n_desc / n_query if n_query != 0 else 0)

In [217]:
# Fraction of query tokens that also occur in the title; 0 when the query has no tokens.
tq_common_ratio_train = []
for title, query in zip(new_train['product_title'], new_train['search_term']):
    title_tokens = set(tokenize(title))
    query_tokens = tokenize(query)
    if query_tokens:
        shared = sum(1 for tok in query_tokens if tok in title_tokens)
        tq_common_ratio_train.append(shared / len(query_tokens))
    else:
        tq_common_ratio_train.append(0)

tq_common_ratio_test = []
for title, query in zip(new_test['product_title'], new_test['search_term']):
    title_tokens = set(tokenize(title))
    query_tokens = tokenize(query)
    if query_tokens:
        shared = sum(1 for tok in query_tokens if tok in title_tokens)
        tq_common_ratio_test.append(shared / len(query_tokens))
    else:
        tq_common_ratio_test.append(0)

In [218]:
# Fraction of query tokens that also occur in the description; 0 when the query has no tokens.
dq_common_ratio_train = []
for description, query in zip(new_train['product_description'], new_train['search_term']):
    description_tokens = set(tokenize(description))
    query_tokens = tokenize(query)
    if query_tokens:
        shared = sum(1 for tok in query_tokens if tok in description_tokens)
        dq_common_ratio_train.append(shared / len(query_tokens))
    else:
        dq_common_ratio_train.append(0)

dq_common_ratio_test = []
for description, query in zip(new_test['product_description'], new_test['search_term']):
    description_tokens = set(tokenize(description))
    query_tokens = tokenize(query)
    if query_tokens:
        shared = sum(1 for tok in query_tokens if tok in description_tokens)
        dq_common_ratio_test.append(shared / len(query_tokens))
    else:
        dq_common_ratio_test.append(0)

# Jaccard Similarity 

In [28]:
# definition for jaccard similarity
def get_jaccard_sim(str1, str2):
    """Return the Jaccard similarity of the whitespace-separated words of two strings.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        ``len(A & B) / len(A | B)`` as a float, where A and B are the word
        sets of the two strings; 0.0 when neither string contains a word.
    """
    words_a = set(str1.split())
    words_b = set(str2.split())
    union_size = len(words_a | words_b)
    if union_size == 0:
        # Two empty word sets previously raised ZeroDivisionError.
        return 0.0
    return len(words_a & words_b) / union_size

#similarity for product title and query - train
jac_list_title_train = []
#iterate though train data
for index, row in new_train.iterrows():
    #toeknize the product and query
    query = row['search_term']
    query = tokenize(query)
    query = ' '.join(query)
    
    title = row['product_title']
    title = tokenize(title)
    title = ' '.join(title)
    
    jac = get_jaccard_sim(title, query)
    jac_list_title_train.append(jac)

# Jaccard similarity between product title and query - test
jac_list_title_test = []
for index, row in new_test.iterrows():
    # tokenize each field, then re-join with spaces because
    # get_jaccard_sim splits its arguments on whitespace
    query = ' '.join(tokenize(row['search_term']))
    title = ' '.join(tokenize(row['product_title']))
    jac_list_title_test.append(get_jaccard_sim(title, query))

### Brands
#### Try to match Brand names in query with brand names in either title or description. If brand match : 1, else: 0
#### Tried including as many brand names as possible in 'brands.txt', but still got very few hits when in reality there should have been a lot more.

# Brand-match feature for the train set: 1 when a brand name found in the
# query is also found in the product description or title, else 0.
brands = pd.read_csv('brands.txt', delimiter = ',', header = None)
# lower-case the brand names once instead of once per product row
brand_names = [name.lower() for name in brands[0]]

brand_list_train = []
for index, row in new_train.iterrows():
    query = row['search_term']
    description = row['product_description']
    title = row['product_title']

    # contains_word is defined elsewhere in this file; as in the original
    # code, a return value of 0 is treated as "brand not found"
    brand_match = 0
    for b in brand_names:
        if contains_word(b, query) == 0:
            continue
        if contains_word(b, description) != 0 or contains_word(b, title) != 0:
            brand_match = 1
            break
    # exactly one entry per row, so the list lines up with new_train
    # (rows whose query names no brand get 0)
    brand_list_train.append(brand_match)

# Brand-match feature for the test set: 1 when a brand name found in the
# query is also found in the product description or title, else 0.
# lower-case the brand names once instead of once per product row
brand_names = [name.lower() for name in brands[0]]

brand_list_test = []
for index, row in new_test.iterrows():
    query = row['search_term']
    description = row['product_description']
    title = row['product_title']

    # contains_word is defined elsewhere in this file; as in the original
    # code, a return value of 0 is treated as "brand not found"
    brand_match = 0
    for b in brand_names:
        if contains_word(b, query) == 0:
            continue
        if contains_word(b, description) != 0 or contains_word(b, title) != 0:
            brand_match = 1
            break
    # results go to the *test* list (the original appended to
    # brand_list_train), one entry per row so it lines up with new_test
    brand_list_test.append(brand_match)

## WORD2VEC
#### Loaded in 100-dimension word vectors from glove repository
##### This did not work as there were too many words not in the vocabulary

# Convert the GloVe text file to word2vec text format so gensim can load it.
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

from gensim.models import KeyedVectors # load the Stanford GloVe model
filename = 'glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

# Look up a vector for every whitespace-separated word of each train query.
for index, row in train.iterrows():
    query = row['search_term'].split()
    for word in query:
        # model[word] raises KeyError for out-of-vocabulary words,
        # so skip them instead of aborting the whole loop
        if word not in model:
            continue
        vector = model[word]

### Change relevance to be the target variable and move to index 0

In [220]:
# copy the 'relevance' values into a new first column named 'target'
new_train.insert(loc=0, column='target', value=new_train['relevance'])

In [221]:
# remove the 'relevance' column from new_train in place
new_train.drop(columns=['relevance'], inplace=True)

### remove the categorical features

In [None]:
# delete the same four columns from both frames, train first and then test
for frame in (new_train, new_test):
    for column in ("product_title", "search_term", 'product_description', 'product_uid'):
        del frame[column]

# add Jaccard and search list

In [223]:
# add jaccard
# each list becomes a one-column frame and is concatenated column-wise;
# ignore_index=True replaces all column labels with integers, which is why
# the columns are renamed at the end of this cell
df = pd.DataFrame(jac_list_title_train)
new_train = pd.concat([new_train, df], ignore_index = True, axis = 1)

df2 = pd.DataFrame(jac_list_title_test)
new_test = pd.concat([new_test, df2], ignore_index = True, axis = 1)

# add search list to train
# (df3, not df: the original re-appended the jaccard frame here, which made
# the 'exact' column a copy of 'jac_title')
df3 = pd.DataFrame(true_list)
new_train = pd.concat([new_train, df3], ignore_index = True, axis = 1)

# add search list to test
# (df4, not df2, for the same reason)
df4 = pd.DataFrame(true_list_test)
new_test = pd.concat([new_test, df4], ignore_index = True, axis = 1)

#rename columns
new_train.columns = ['target', 'id', 'cosine_title','cosine_description','cosine_attribute','query_ct','title_ct','desc_ct', 'jac_title', 'exact']
new_test.columns = ['id','cosine_title','cosine_description','cosine_attribute','query_ct','title_ct','desc_ct', 'jac_title','exact']


In [224]:
# add ratios to train and test dataframe
# odd-numbered frames hold train values, even-numbered frames hold test values
df1 = pd.DataFrame(title_query_ratio_train)
df2 = pd.DataFrame(title_query_ratio_test)
df3 = pd.DataFrame(desc_query_ratio_train)
df4 = pd.DataFrame(desc_query_ratio_test)
df5 = pd.DataFrame(tq_common_ratio_train)
df6 = pd.DataFrame(tq_common_ratio_test)
df7 = pd.DataFrame(dq_common_ratio_train)
df8 = pd.DataFrame(dq_common_ratio_test)

# append the ratio frames one at a time, column-wise
for ratio_frame in (df1, df3, df5, df7):
    new_train = pd.concat([new_train, ratio_frame], ignore_index = True, axis = 1)

for ratio_frame in (df2, df4, df6, df8):
    new_test = pd.concat([new_test, ratio_frame], ignore_index = True, axis = 1)

# concat with ignore_index=True replaces the column labels with integers,
# so the names have to be assigned again; train has an extra leading 'target'
feature_columns = ['id', 'cosine_title','cosine_description','cosine_attribute','query_ct','title_ct','desc_ct', 'jac_title', 'exact','title_query_ratio','desc_query_ratio','tq_common_ratio','dq_common_ratio']
new_train.columns = ['target'] + feature_columns
new_test.columns = list(feature_columns)


In [225]:
# write both feature tables to CSV for the regression step (header row, no index column)
for frame, out_path in ((new_train, r'forRegression.csv'), (new_test, r'forRegressionTest.csv')):
    frame.to_csv(out_path, index=None, header = True)