In [1]:
import csv
import codecs
def read_csv(filename, hasHeader=False):
    data = []
    with open(filename) as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        if (hasHeader):
            next(reader, None)
            
        for row in reader:
            data.append(row)
    
    return data

In [2]:
items = read_csv('data/ItemInfo_train.csv', hasHeader=True)
print 'Train items', len(items)
print 'Item example', items[0]

Train items 3344613
Item example ['1', '81', '\xd0\x9f\xd1\x80\xd0\xbe\xd0\xb4\xd0\xb0\xd0\xbc \xd0\x9a\xd0\xb0\xd0\xbc\xd0\xb0\xd0\xb7 6520', '\xd0\x9f\xd1\x80\xd0\xbe\xd0\xb4\xd0\xb0\xd0\xbc \xd0\x9a\xd0\xb0\xd0\xbc\xd0\xb0\xd0\xb7 6520 20 \xd1\x82\xd0\xbe\xd0\xbd\xd0\xbd', '1064094, 5252822, 6645873, 6960145, 9230265', '{"\xd0\x92\xd0\xb8\xd0\xb4 \xd1\x82\xd0\xb5\xd1\x85\xd0\xbd\xd0\xb8\xd0\xba\xd0\xb8":"\xd0\x93\xd1\x80\xd1\x83\xd0\xb7\xd0\xbe\xd0\xb2\xd0\xb8\xd0\xba\xd0\xb8"}', '300000.0', '648140', '', '64.686946', '30.815924']


In [3]:
COL_ITEM_ID=0
COL_CATEGORY_ID=1
COL_TITLE=2
COL_DESCRIPTION=3
COL_IMAGES=4
COL_JSON=5
COL_PRICE=6
COL_LOCATION=7
COL_METRO=8
COL_LAT=9
COL_LON=10

In [4]:
map_items = dict()
for item in items:
    map_items[item[COL_ITEM_ID]] = item

In [5]:
pairs = read_csv('data/ItemPairs_train.csv', hasHeader=True)
pairs = pairs[:len(pairs)/30]
print 'Train pairs', len(pairs)
print 'Example pair', pairs[0]

Train pairs 99713
Example pair ['1', '4112648', '1', '1']


In [6]:
categories = read_csv('data/Category.csv')
print 'Categories', len(categories)

Categories 52


In [7]:
map_parent_category = dict()
for category in categories:
    map_parent_category[category[0]] = category[1]

In [8]:
locations = read_csv('data/Location.csv', hasHeader=True)
print 'Locations', len(locations)

Locations 3449


In [9]:
map_regions = dict()
for location in locations:
    map_regions[location[0]] = location[1]

In [10]:
def same_images(item1, item2):
    imgs1 = item1[COL_IMAGES].split(',')
    imgs2 = item2[COL_IMAGES].split(',')
    imgs1 = [x.strip() for x in imgs1]
    imgs2 = [x.strip() for x in imgs2]
    
    return set(imgs1) == set(imgs2)

def number_of_same_images(item1, item2):
    imgs1 = item1[COL_IMAGES].split(',')
    imgs2 = item2[COL_IMAGES].split(',')
    imgs1 = set([x.strip() for x in imgs1])
    imgs2 = set([x.strip() for x in imgs2])
    
    minlen = min(len(imgs1), len(imgs2))
    
    if minlen == 0:
        return 0
    
    return len(imgs1.intersection(imgs2)) / minlen

In [11]:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer

stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

'''remove punctuation, lowercase, stem'''
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

vectorizer = TfidfVectorizer(tokenizer=normalize)

def cosine_sim(text1, text2):
    try:
        tfidf = vectorizer.fit_transform([text1, text2])
        return ((tfidf * tfidf.T).A)[0,1]
    except:
        return 1.0 if text1 == text2 else 0.0

def same_title(item1, item2):
    return cosine_sim(item1[COL_TITLE], item2[COL_TITLE])

def same_description(item1, item2):
    return cosine_sim(item1[COL_DESCRIPTION], item2[COL_DESCRIPTION])

In [15]:
def same_category(item1, item2):
    cat1 = item1[COL_CATEGORY_ID]
    cat2 = item2[COL_CATEGORY_ID]
    
    return 1 if cat1 == cat2 else 0

def same_parent_category(item1, item2):
    cat1 = item1[COL_CATEGORY_ID]
    cat2 = item2[COL_CATEGORY_ID]
    
    par_cat1 = map_parent_category[cat1]
    par_cat2 = map_parent_category[cat2]
    
    return 1 if par_cat1 == par_cat2 else 0

def same_price(item1, item2):
    price1 = item1[COL_PRICE]
    price2 = item2[COL_PRICE]
    
    return 1 if price1 == price2 else 0

def safe_parse(str):
    try:
        return float(item1[COL_PRICE].replace(',', ''))
    except:
        return 0.0

def price_diff_percent(item1, item2):
    price1 = safe_parse(item1[COL_PRICE])
    price2 = safe_parse(item2[COL_PRICE])
    
    import math
    avg_price = (price1 + price2) / 2.0
    
    if avg_price != 0:
        return math.fabs(price1 - price2) / avg_price
    else:
        return 0

def same_lat(item1, item2):
    lat1 = item1[COL_LAT]
    lat2 = item2[COL_LAT]
    
    return 1 if lat1 == lat2 else 0

def same_lon(item1, item2):
    lon1 = item1[COL_LON]
    lon2 = item2[COL_LON]
    
    return 1 if lon1 == lon2 else 0

def same_location(item1, item2):
    location1 = item1[COL_LOCATION]
    location2 = item2[COL_LOCATION]
    
    return 1 if location1 == location2 else 0

def distance_between_coordinates(item1, item2):
    lat1 = float(item1[COL_LAT])
    lat2 = float(item2[COL_LAT])
    lon1 = float(item1[COL_LON])
    lon2 = float(item2[COL_LON])
    
    point1 = (lon1, lat1)
    point2 = (lon2, lat2)
    
    from geopy.distance import vincenty
    return vincenty(point1, point2).miles

def same_region(item1, item2):
    location1 = item1[COL_LOCATION]
    location2 = item2[COL_LOCATION]
    
    region1 = map_regions[location1]
    region2 = map_regions[location2]
    
    return 1 if region1 == region2 else 0

def same_metro(item1, item2):
    metro1 = item1[COL_METRO]
    metro2 = item2[COL_METRO]
    
    return 1 if metro1 == metro2 else 0

def get_features(item1, item2, label):
    fx = []
    fx.append(same_category(item1, item2))
    fx.append(same_parent_category(item1, item2))
    
    fx.append(same_price(item1, item2))
    fx.append(price_diff_percent(item1, item2))
    
    fx.append(same_lat(item1, item2))
    fx.append(same_lon(item1, item2))
    fx.append(same_location(item1, item2))
    fx.append(distance_between_coordinates(item1, item2))
    
    fx.append(same_region(item1, item2))
    fx.append(same_metro(item1, item2))
    
    fx.append(same_title(item1, item2))
    fx.append(same_description(item1, item2))
    
    fx.append(same_images(item1, item2))
    fx.append(number_of_same_images(item1, item2))
    
    return (fx, label)

In [18]:
item1 = map_items[pairs[0][0]]
item2 = map_items[pairs[0][1]]
%time price_diff_percent(item1, item2)
%time distance_between_coordinates(item1, item2)
%time same_title(item1, item2)
%time same_description(item1, item2)

%time get_features(item1, item2, 1)

CPU times: user 19 µs, sys: 1 µs, total: 20 µs
Wall time: 22.9 µs
CPU times: user 91 µs, sys: 8 µs, total: 99 µs
Wall time: 101 µs
CPU times: user 0 ns, sys: 4.01 ms, total: 4.01 ms
Wall time: 4.2 ms
CPU times: user 3.13 ms, sys: 4 µs, total: 3.14 ms
Wall time: 3.32 ms
CPU times: user 6.89 ms, sys: 0 ns, total: 6.89 ms
Wall time: 7.05 ms


([1,
  1,
  1,
  0.0,
  1,
  1,
  1,
  0.0,
  1,
  1,
  1.0000000000000002,
  0.99999999999999989,
  False,
  0],
 1)

In [19]:
def get_train_data():
    data = []
    for pair in pairs:
        item1 = map_items[pair[0]]
        item2 = map_items[pair[1]]
        label = int(pair[2])

        xy = get_features(item1, item2, label)
        data.append(xy)
        
    return data

train = get_train_data()
print train[0]

([1, 1, 1, 0.0, 1, 1, 1, 0.0, 1, 1, 1.0000000000000002, 0.99999999999999989, False, 0], 1)


In [None]:
from threading import Thread
datas = {}
def get_train_data_worker(i, n):
    data = []
    start = len(pairs) / n * i
    end = len(pairs) / n * (i+1)
    end = min(end, len(pairs))
    job = pairs[start:end]
    for pair in job:
        item1 = map_items[pair[0]]
        item2 = map_items[pair[1]]
        label = int(pair[2])

        xy = get_features(item1, item2, label)
        data.append(xy)
    datas[i] = data
    
def get_train_data_async():
    parts = 4
    
    threads = []
    for i in range(parts):
        thread=Thread(target=get_train_data_worker, args=(i+1, parts))
        threads.append(thread)
        thread.start()
    
    for thread in threads:
        thread.join()
    
    data = []
    for key in datas:
        data = data + datas[key]
        
    return data

train = get_train_data_async()
print train[0]

In [20]:
from sklearn.cross_validation import train_test_split
train_data, test_data = train_test_split(train, test_size=0.20, train_size=0.80)

train_x = [x[0] for x in train_data]
train_y = [x[1] for x in train_data]

test_x = [x[0] for x in test_data]
test_y = [x[1] for x in test_data]

from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier()
classifier.fit(train_x, train_y)

print classifier.score(test_x, test_y)
print classifier.feature_importances_

0.676728676729
[ 0.          0.          0.07517363  0.          0.00242139  0.00482038
  0.00659635  0.06950367  0.00408102  0.00346837  0.22377067  0.58964515
  0.01378844  0.00673094]
