In [1]:
import csv
def read_csv(filename, hasHeader=False, encoding=None):
    """Read a comma-separated file into a list of rows.

    Args:
        filename: Path of the CSV file to read.
        hasHeader: When true, the first row is skipped and not returned.
        encoding: Text encoding passed to ``open``; ``None`` keeps the
            platform default (the previous behaviour).

    Returns:
        A list of rows, each row being a list of strings.
    """
    # newline='' is required by the csv module so that line breaks embedded
    # in quoted fields reach the parser untouched instead of being rewritten
    # by universal-newline translation.
    with open(filename, newline='', encoding=encoding) as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        if hasHeader:
            # The default of None keeps an empty file from raising StopIteration.
            next(reader, None)
        return list(reader)

In [2]:
import csv
def write_csv(filename, header, data, encoding=None):
    """Write rows to a comma-separated file, replacing any existing content.

    Args:
        filename: Path of the CSV file to create or truncate.
        header: Optional sequence written as the first row; skipped when None.
        data: Iterable of rows (each a sequence of values) to write.
        encoding: Text encoding passed to ``open``; ``None`` keeps the
            platform default (the previous behaviour).
    """
    # newline='' stops the text layer from translating the writer's own
    # '\r\n' terminator a second time (which yields '\r\r\n' on Windows).
    # Plain 'w' is enough: the file is never read back through this handle.
    with open(filename, 'w', newline='', encoding=encoding) as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"')
        if header is not None:
            writer.writerow(header)
        writer.writerows(data)

In [3]:
import os
def file_available(filename):
    """Tell whether ``filename`` is an existing regular file that can be read.

    Args:
        filename: Path to check.

    Returns:
        True when the path is a regular file and the current process has
        read permission on it, otherwise False.
    """
    # isfile() already implies the path exists. The permission check must
    # target the same path, not a hard-coded placeholder name.
    return os.path.isfile(filename) and os.access(filename, os.R_OK)

In [4]:
# Positional indices into one item row (a list of strings as returned by
# read_csv for 'data/ItemInfo_train.csv'). The feature functions below use
# these instead of bare numbers when picking a field out of an item.
COL_ID=0
COL_CATEGORY_ID=1
COL_TITLE=2
COL_DESCRIPTION=3
COL_IMAGES=4
COL_JSON=5
COL_PRICE=6
COL_LOCATION=7
COL_METRO=8
COL_LAT=9
COL_LON=10

In [5]:
import imagehash

# Largest difference between two image hashes (the result of subtracting one
# imagehash hash from another) at which two images still count as the same.
# 0 means absolutely equal (int)
IMAGE_DIFFERENCE_THRESHOLD = 0

def load_image_hashes():
    """Collect the image hash lookup from the ten hash shard files.

    Reads 'processed_data/hashes0.csv' through 'processed_data/hashes9.csv'
    with read_csv and maps the first column of every row to its second
    column. When a key occurs more than once, the last row read wins.

    Returns:
        dict mapping first-column value to second-column value (strings).
    """
    hashes_by_image = {}
    for shard in range(10):
        shard_rows = read_csv('processed_data/hashes' + str(shard) + '.csv')
        hashes_by_image.update((row[0], row[1]) for row in shard_rows)
    return hashes_by_image

def number_of_same_images(image_hashes, item1, item2):
    """Count images of item1 that have a matching image in item2.

    The COL_IMAGES field of each item is treated as a comma-separated list;
    entries are stripped, empty ones dropped and duplicates collapsed. Two
    images match when the difference of their hashes (looked up in
    ``image_hashes`` and decoded with imagehash.hex_to_hash) is at most
    IMAGE_DIFFERENCE_THRESHOLD. Each image of item1 is counted at most once.

    Args:
        image_hashes: Mapping from image entry to its hex hash string.
        item1: First item row.
        item2: Second item row.

    Returns:
        Tuple ``(count, ratio)`` where ratio is count divided by the smaller
        of the two image-set sizes, or 0 when either item has no images.
    """
    def _image_ids(item):
        # Comma-separated field -> set of non-empty, stripped entries.
        return {part.strip() for part in item[COL_IMAGES].split(',') if part.strip()}

    first_ids = _image_ids(item1)
    second_ids = _image_ids(item2)

    matched = 0
    for first in first_ids:
        for second in second_ids:
            try:
                distance = (imagehash.hex_to_hash(image_hashes[first])
                            - imagehash.hex_to_hash(image_hashes[second]))
            except KeyError as missing:
                # A hash is unknown for one of the two images: report it and
                # move on to the next candidate pair.
                print(missing)
                continue
            if distance <= IMAGE_DIFFERENCE_THRESHOLD:
                matched += 1
                # One match is enough for this image of item1.
                break

    smaller = min(len(first_ids), len(second_ids))
    if smaller == 0:
        print ('Some items have 0 images', item1[COL_ID], item2[COL_ID])
        return (matched, 0)

    return (matched, matched / smaller)

def load_items_map():
    """Load the training items and index them by their COL_ID field.

    Reads 'data/ItemInfo_train.csv' (header row skipped), prints the number
    of rows and the first row, and returns a dict from each row's COL_ID
    value to the full row. Raises IndexError if the file has no data rows,
    because the first row is printed unconditionally.
    """
    rows = read_csv('data/ItemInfo_train.csv', hasHeader=True)
    print('Train items', len(rows))
    print('Item example', rows[0])
    return {row[COL_ID]: row for row in rows}

def load_item_pairs():
    """Load the training pairs from 'data/ItemPairs_train.csv'.

    The header row is skipped. Prints the number of rows and the first row
    (so an empty file raises IndexError), then returns all rows as a list
    of string lists.
    """
    pair_rows = read_csv('data/ItemPairs_train.csv', hasHeader=True)
    print('Train pairs', len(pair_rows))
    print('Example pair', pair_rows[0])
    return pair_rows

In [18]:
def titles():
    """Load the pair-keyed lookup stored in 'titles_sim.csv'.

    Every row adds one entry: the key is the first column, a '+' and the
    second column; the value is the third column, kept as the string read
    from the file. No header row is skipped.
    """
    return {row[0] + '+' + row[1]: row[2] for row in read_csv('titles_sim.csv')}

def descriptions():
    """Load the pair-keyed lookup stored in 'descriptions_sim.csv'.

    Every row adds one entry: the key is the first column, a '+' and the
    second column; the value is the third column, kept as the string read
    from the file. No header row is skipped.
    """
    return {row[0] + '+' + row[1]: row[2] for row in read_csv('descriptions_sim.csv')}

In [7]:
def get_categories():
    """Load 'data/Category.csv' as a first-column -> second-column lookup.

    Prints the number of rows read. No header row is skipped, so every row
    of the file ends up in the mapping.
    NOTE(review): presumably maps category id to parent category id, judging
    by how same_parent_category uses it — confirm against the file.
    """
    category_rows = read_csv('data/Category.csv')
    print('Categories', len(category_rows))
    return {row[0]: row[1] for row in category_rows}

def get_regions():
    """Load 'data/Location.csv' as a first-column -> second-column lookup.

    The header row is skipped and the number of data rows is printed.
    NOTE(review): presumably maps location id to region id, judging by how
    same_region uses it — confirm against the file.
    """
    location_rows = read_csv('data/Location.csv', hasHeader=True)
    print('Locations', len(location_rows))
    return {row[0]: row[1] for row in location_rows}

In [19]:
# Module-level similarity lookups read from 'titles_sim.csv' and
# 'descriptions_sim.csv'; get_features indexes both with "<id1>+<id2>" keys.
titles_map = titles()
descriptions_map = descriptions()

In [9]:
# Module-level image hash lookup built from the processed_data/hashes*.csv files.
image_hashes = load_image_hashes()

In [10]:
# Module-level lookups read as globals by same_parent_category and same_region.
map_parent_category = get_categories()
map_regions = get_regions()

Categories 52
Locations 3449


In [15]:
def same_category(item1, item2):
    """Return 1 when both items carry the same COL_CATEGORY_ID value, else 0."""
    return int(item1[COL_CATEGORY_ID] == item2[COL_CATEGORY_ID])

def same_parent_category(item1, item2):
    """Return 1 when both items' categories map to the same value, else 0.

    Each item's COL_CATEGORY_ID is looked up in the module-level
    map_parent_category dict; a category missing from it raises KeyError.
    """
    first_category = item1[COL_CATEGORY_ID]
    second_category = item2[COL_CATEGORY_ID]

    first_parent = map_parent_category[first_category]
    second_parent = map_parent_category[second_category]

    if first_parent == second_parent:
        return 1
    return 0

def same_price(item1, item2):
    """Return 1 when both items have an identical COL_PRICE field, else 0.

    The raw field values are compared as-is; no numeric parsing is done.
    """
    return int(item1[COL_PRICE] == item2[COL_PRICE])

def safe_parse(str):
    """Parse a price-like string into a float, falling back to 0.0.

    Commas are removed before conversion, so '1,234.5' parses as 1234.5.

    Args:
        str: The text to parse. The name shadows the builtin but is kept so
            existing callers are unaffected.

    Returns:
        The parsed float, or 0.0 when the value is empty, not numeric, or
        not a string at all (for example None).
    """
    try:
        # Parse the argument itself. The previous body read an unrelated
        # `item1` name and so returned 0.0 for every input.
        return float(str.replace(',', ''))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: not a string; ValueError: not a number.
        return 0.0

def price_diff_percent(item1, item2):
    """Return the absolute price difference relative to the mean price.

    Both COL_PRICE fields are converted with safe_parse. The result is
    |p1 - p2| divided by (p1 + p2) / 2, or 0 when that mean is zero.
    """
    import math

    first_price = safe_parse(item1[COL_PRICE])
    second_price = safe_parse(item2[COL_PRICE])

    mean_price = (first_price + second_price) / 2.0

    # Guard against division by zero when the mean is zero.
    if mean_price == 0:
        return 0
    return math.fabs(first_price - second_price) / mean_price

def same_lat(item1, item2):
    """Return 1 when both items have an identical COL_LAT field, else 0.

    The raw field values are compared as-is, without numeric conversion.
    """
    if item1[COL_LAT] == item2[COL_LAT]:
        return 1
    return 0

def same_lon(item1, item2):
    """Return 1 when both items have an identical COL_LON field, else 0.

    The raw field values are compared as-is, without numeric conversion.
    """
    if item1[COL_LON] == item2[COL_LON]:
        return 1
    return 0

def same_location(item1, item2):
    """Return 1 when both items have an identical COL_LOCATION field, else 0."""
    return int(item1[COL_LOCATION] == item2[COL_LOCATION])

def distance_between_coordinates(item1, item2):
    """Return the distance in miles between the coordinates of two items.

    Args:
        item1: First item row; COL_LAT and COL_LON must hold numeric text.
        item2: Second item row; same requirements.

    Returns:
        The ellipsoidal distance between the two points, in miles.

    Raises:
        ValueError: If a coordinate field cannot be converted to float.
    """
    # `vincenty` was deprecated in geopy 1.13 and removed in 2.0 in favour of
    # `geodesic`; fall back to it only on installs that predate `geodesic`.
    try:
        from geopy.distance import geodesic as geo_distance
    except ImportError:
        from geopy.distance import vincenty as geo_distance

    # geopy expects (latitude, longitude). The previous (lon, lat) order gave
    # wrong distances and is rejected outright for longitudes beyond +/-90.
    point1 = (float(item1[COL_LAT]), float(item1[COL_LON]))
    point2 = (float(item2[COL_LAT]), float(item2[COL_LON]))

    return geo_distance(point1, point2).miles

def same_region(item1, item2):
    """Return 1 when both items' locations map to the same value, else 0.

    Each item's COL_LOCATION is looked up in the module-level map_regions
    dict; a location missing from it raises KeyError.
    """
    first_location = item1[COL_LOCATION]
    second_location = item2[COL_LOCATION]

    first_region = map_regions[first_location]
    second_region = map_regions[second_location]

    if first_region == second_region:
        return 1
    return 0

def same_metro(item1, item2):
    """Return 1 when both items have an identical COL_METRO field, else 0."""
    return int(item1[COL_METRO] == item2[COL_METRO])

def get_features(item1, item2, label):
    """Build the feature list for one pair of items.

    Feature order: same category, same parent category, same price, relative
    price difference, same latitude, same longitude, same location, distance
    between coordinates, same region, same metro, title similarity,
    description similarity, same_images result, and the (count, ratio) tuple
    from number_of_same_images.

    Args:
        item1: First item row.
        item2: Second item row.
        label: Label passed through unchanged as the second tuple element.

    Returns:
        Tuple ``(features, label)`` where features is a list.

    Raises:
        KeyError: If the "<id1>+<id2>" key is absent from titles_map or
            descriptions_map, or a category/location is absent from its
            lookup.
    """
    # Both similarity lookups are keyed by the two item ids joined with '+'.
    pair_key = item1[COL_ID] + '+' + item2[COL_ID]

    fx = [
        same_category(item1, item2),
        same_parent_category(item1, item2),

        same_price(item1, item2),
        price_diff_percent(item1, item2),

        same_lat(item1, item2),
        same_lon(item1, item2),
        same_location(item1, item2),
        distance_between_coordinates(item1, item2),

        same_region(item1, item2),
        same_metro(item1, item2),

        titles_map[pair_key],
        descriptions_map[pair_key],
    ]

    # NOTE(review): same_images is not defined in the visible cells — confirm
    # it exists elsewhere, otherwise this line raises NameError.
    fx.append(same_images(item1, item2))
    # number_of_same_images takes the hash lookup as its first argument;
    # omitting it raised TypeError. The result is a (count, ratio) tuple.
    fx.append(number_of_same_images(image_hashes, item1, item2))

    return (fx, label)

In [None]:
# Module-level training inputs read as globals by get_train_data: items keyed
# by their id column, and the list of pair rows.
items_map = load_items_map()
pairs = load_item_pairs()

In [20]:
def get_train_data():
    """Build one (features, label) sample for every row of the global pairs.

    For each pair row, the first two columns are looked up in the global
    items_map and the third column is converted to an int label; the sample
    is whatever get_features returns for them.

    Returns:
        List of samples in the same order as ``pairs``.
    """
    samples = []
    for row in pairs:
        first, second = items_map[row[0]], items_map[row[1]]
        samples.append(get_features(first, second, int(row[2])))

    return samples

# Build the full training set and show the first (features, label) sample.
train = get_train_data()
print(train[0])

KeyError: '1+4112648'

In [None]:
# Free up memory: drop this notebook's references to the loaded items and
# pairs so they can be reclaimed once nothing else refers to them.
# get_train_data reads both globals, so reload them before calling it again.
items_map = None
pairs = None