In [2]:
import csv
from collections import OrderedDict
from collections import defaultdict
import pickle

# Locations of the input CSV files, relative to the notebook's working directory.
# Forward slashes are used on purpose: open() accepts them on Windows as well as on
# POSIX systems, whereas a backslash is a path separator on Windows only.
training_path = "../data/train.csv"
descriptions_path = "../data/product_descriptions.csv"
attributes_path = "../data/attributes.csv"

In [4]:
def load_data(filepath:str, table:str):
    """Load one of the three CSV tables into memory.

    The first row of the file is treated as a header and skipped; completely
    empty rows are ignored.

    Arguments:
    filepath -- path of the UTF-8 encoded CSV file to read.
    table -- layout to expect: 'train', 'product_descriptions' or 'attributes'.

    Returns:
    for 'train' -- a list with one dict per row, holding the keys 'id',
        'product_uid', 'product_title', 'search_term' (strings, columns 0-3)
        and 'relevance' (column 4 converted to float).
    for 'product_descriptions' -- a dict mapping column 0 to column 1.
    for 'attributes' -- a defaultdict(list) mapping column 0 to the list of
        (column 1, column 2) tuples found for that key, in file order.

    Raises:
    ValueError -- if table is not one of the three supported names.
    """
    # Pick the container first so that an unsupported table name fails loudly
    # before any file is opened (previously this surfaced as UnboundLocalError).
    if table == 'train':
        result = []
    elif table == 'product_descriptions':
        result = {}
    elif table == 'attributes':
        result = defaultdict(list)
    else:
        raise ValueError("unknown table %r; expected 'train', 'product_descriptions' "
                         "or 'attributes'" % (table,))

    # newline='' is what the csv module asks for: it lets the reader handle line
    # endings itself, so newlines embedded in quoted fields are left untouched.
    with open(filepath, 'r', encoding='utf-8', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader, None)  # skip the header row; an empty file yields an empty result
        for row in reader:
            if not row:
                # csv.reader returns [] for blank lines; there is nothing to store.
                continue
            if table == 'train':
                result.append({'id':row[0],'product_uid':row[1],'product_title':row[2],
                               'search_term':row[3],'relevance':float(row[4])})
            elif table == 'product_descriptions':
                result[row[0]] = row[1]
            else:
                result[row[0]].append((row[1],row[2]))
    return result

def common_subsequence(doc1, doc2):
    """Return the words of doc1 that also occur somewhere in doc2.

    Both strings are split on runs of whitespace, so leading, trailing or
    repeated spaces never produce empty "words". The comparison is exact
    (case-sensitive). The result follows the word order of doc1 and keeps
    its repetitions; the position of a word inside doc2 is not considered.

    Arguments:
    doc1 -- string whose words are filtered.
    doc2 -- string supplying the vocabulary to filter against.

    Returns:
    A list of strings (empty when nothing is shared).
    """
    # A set is all that is needed for the membership test.
    vocabulary = set(doc2.split())
    return [word for word in doc1.split() if word in vocabulary]

def training_example_id(query_doc_pair:dict,feat_dict:dict):
    """Copy the 'id' entry of query_doc_pair into feat_dict under the key 'id'.

    feat_dict is modified in place; nothing is returned.
    """
    feat_dict['id'] = query_doc_pair['id']

#Feature: common subsequence between query and title
def feat_1(query_doc_pair:dict,description:str,attributes:list,feat_dict:dict):
    """Store the query words that also appear in the product title.

    The words are obtained with common_subsequence(query, title), joined with
    single spaces and written to feat_dict['subsequence_query+title'].

    Arguments:
    query_doc_pair -- dictionary containing a row from the train.csv table, where
        the keys are the column names ('search_term' and 'product_title' are read).
    description -- string containing the product description (not used here).
    attributes -- list of (name, value) tuples for the product (not used here).
    feat_dict -- dictionary of feature name-value pairs already extracted for the
        same document-query pair; it is updated in place with the new feature.
    """
    shared_words = common_subsequence(query_doc_pair['search_term'],
                                      query_doc_pair['product_title'])
    feat_dict['subsequence_query+title'] = ' '.join(shared_words)

#Feature: common subsequence between query and description
def feat_2(query_doc_pair:dict,description:str,attributes:list,feat_dict:dict):
    """Store the query words that also appear in the product description.

    The words are obtained with common_subsequence(query, description), joined
    with single spaces and written to feat_dict['subsequence_query+description'].

    Arguments:
    query_doc_pair -- dictionary containing a row from the train.csv table, where
        the keys are the column names (only 'search_term' is read).
    description -- string containing the product description.
    attributes -- list of (name, value) tuples for the product (not used here).
    feat_dict -- dictionary of feature name-value pairs already extracted for the
        same document-query pair; it is updated in place with the new feature.
    """
    shared_words = common_subsequence(query_doc_pair['search_term'], description)
    feat_dict['subsequence_query+description'] = ' '.join(shared_words)
    
#Feature: all product attributes
def feat_3(query_doc_pair:dict,description:str,attributes:list,feat_dict:dict):
    """Add every product attribute to feat_dict as its own feature.

    Each (name, value) tuple becomes feat_dict[name] = value. When a name occurs
    more than once, the last value in the list wins.

    Arguments:
    query_doc_pair -- dictionary containing a row from the train.csv table, where
        the keys are the column names (not used here).
    description -- string containing the product description (not used here).
    attributes -- list containing all (name, value) tuples for the product.
    feat_dict -- dictionary of feature name-value pairs already extracted for the
        same document-query pair; it is updated in place with the new features.
    """
    # Build the full mapping first so feat_dict is only touched once it is complete.
    feat_dict.update({pair[0]: pair[1] for pair in attributes})
    
#Feature 4: number of words in common between the query and the attributes,
#Feature 5: words in common between the query and the attributes
def feat_4_5(query_doc_pair:dict,description:str,attributes:list,feat_dict:dict):
    """Count and flag the query words that occur in the product attributes.

    The query and every attribute name and value are split on runs of
    whitespace, so repeated, leading or trailing spaces never produce an empty
    "word" that would be counted as a match. The comparison is exact
    (case-sensitive).

    Writes to feat_dict:
    'common_word_count' -- number of distinct shared words (0 when there are none).
    'common_word:<word>' -- True, one entry per shared word.

    Arguments:
    query_doc_pair -- dictionary containing a row from the train.csv table, where
        the keys are the column names (only 'search_term' is read).
    description -- string containing the product description (not used here).
    attributes -- list containing all (name, value) tuples for the product.
    feat_dict -- dictionary of feature name-value pairs already extracted for the
        same document-query pair; it is updated in place with the new features.
    """
    query_words = set(query_doc_pair['search_term'].split())

    attribute_words = set()
    for attr in attributes:
        attribute_words.update(attr[0].split())
        attribute_words.update(attr[1].split())

    common_words = query_words & attribute_words

    feat_dict['common_word_count'] = len(common_words)
    for w in common_words:
        feat_dict['common_word:'+w] = True

def process_event(product,description,attributes):
    """Run every feature extractor on one query-product pair.

    feat_1, feat_2, feat_3 and feat_4_5 are called in that order with the same
    arguments and a shared, initially empty dictionary that each of them fills.

    Arguments:
    product -- the row passed to each extractor as query_doc_pair.
    description -- string containing the product description.
    attributes -- list of (name, value) tuples for the product.

    Returns:
    The dictionary holding all features the extractors produced.
    """
    collected = {}
    for extractor in (feat_1, feat_2, feat_3, feat_4_5):
        extractor(product, description, attributes, collected)
    return collected

#Extracts features for all query-document pairs and returns a list of dictionaries, where each dictionary contains
#the features for each doc-query pair.
def feature_extraction(train:list,product_descriptions:dict,attibutes:dict):
    """Build one feature dictionary per training row.

    Arguments:
    train -- list of row dictionaries; each must contain 'product_uid'.
    product_descriptions -- dict mapping product_uid to the description string.
    attibutes -- dict mapping product_uid to a list of (name, value) tuples.
        The spelling of this parameter name is kept as-is so existing keyword
        calls keep working.

    Returns:
    A list of feature dictionaries (as produced by process_event), in the same
    order as train.

    Raises:
    KeyError -- if a row's product_uid has no entry in product_descriptions.
    """
    all_features = []
    for item in train:
        product_uid = item['product_uid']
        description = product_descriptions[product_uid]
        # Use the mapping that was passed in rather than a global variable.
        # .get() gives products without attributes an empty list, works for plain
        # dicts too, and does not insert new keys into a defaultdict.
        attributes_list = attibutes.get(product_uid, [])
        all_features.append(process_event(item,description,attributes_list))
    return all_features

In [14]:
# Load the three input tables; load_data picks the result structure from its second argument.
# train: list with one dict per data row of the training file.
train = load_data(training_path,'train')
# product_descriptions: dict mapping the file's first column to its second column.
product_descriptions = load_data(descriptions_path,'product_descriptions')
# attributes: defaultdict(list) mapping the file's first column to a list of
# (second column, third column) tuples.
attributes = load_data(attributes_path,'attributes')

In [17]:
# Build the list of feature dictionaries, one per entry of train and in the same order.
all_feat = feature_extraction(train, product_descriptions, attributes)