In [1]:
import csv
import codecs
def read_csv(filename, hasHeader=False):
    data = []
    with open(filename) as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        if (hasHeader):
            next(reader, None)
            
        for row in reader:
            data.append(row)
    
    return data

In [2]:
items = read_csv('data/ItemInfo_train.csv', hasHeader=True)
print('Train items', len(items))
print('Item example', items[0])

Train items 3344613
Item example ['1', '81', 'Продам Камаз 6520', 'Продам Камаз 6520 20 тонн', '1064094, 5252822, 6645873, 6960145, 9230265', '{"Вид техники":"Грузовики"}', '300000.0', '648140', '', '64.686946', '30.815924']


In [2]:
COL_ITEM_ID=0
COL_CATEGORY_ID=1
COL_TITLE=2
COL_DESCRIPTION=3
COL_IMAGES=4
COL_JSON=5
COL_PRICE=6
COL_LOCATION=7
COL_METRO=8
COL_LAT=9
COL_LON=10

In [4]:
map_items = dict()
for item in items:
    map_items[item[COL_ITEM_ID]] = item

In [5]:
pairs = read_csv('data/ItemPairs_train.csv', hasHeader=True)
pairs = pairs[:20000]
print('Train pairs', len(pairs))
print('Example pair', pairs[0])

Train pairs 20000
Example pair ['1', '4112648', '1', '1']


In [3]:
categories = read_csv('data/Category.csv')
print('Categories', len(categories))

Categories 52


In [4]:
map_parent_category = dict()
for category in categories:
    map_parent_category[category[0]] = category[1]

In [5]:
locations = read_csv('data/Location.csv', hasHeader=True)
print('Locations', len(locations))

Locations 3449


In [6]:
map_regions = dict()
for location in locations:
    map_regions[location[0]] = location[1]

In [7]:
map_hashes = {}

for i in range(10):
    data = read_csv('processed_data/hashes' + str(i) + '.csv')
    for row in data:
        map_hashes[row[0]] = row[1]

In [8]:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer

stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

'''remove punctuation, lowercase, stem'''
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

vectorizer = TfidfVectorizer(tokenizer=normalize)

def cosine_sim(text1, text2):
    try:
        tfidf = vectorizer.fit_transform([text1, text2])
        return ((tfidf * tfidf.T).A)[0,1]
    except:
        return 1.0 if text1 == text2 else 0.0

def same_title(item1, item2):
    return cosine_sim(item1[COL_TITLE], item2[COL_TITLE])

def same_description(item1, item2):
    return cosine_sim(item1[COL_DESCRIPTION], item2[COL_DESCRIPTION])

def number_of_same_words_title(item1, item2):
    title1 = set(normalize(item1[COL_TITLE]))
    title2 = set(normalize(item2[COL_TITLE]))
    
    return len(title1.intersection(title2))

def number_of_same_words_title_rel(item1, item2):
    title1 = set(normalize(item1[COL_TITLE]))
    title2 = set(normalize(item2[COL_TITLE]))
    m = min(len(title1), len(title2))
    if m == 0:
        return 0
    
    return len(title1.intersection(title2)) / m

def number_of_same_words_desc(item1, item2):
    title1 = set(normalize(item1[COL_DESCRIPTION]))
    title2 = set(normalize(item2[COL_DESCRIPTION]))
    
    return len(title1.intersection(title2))

def number_of_same_words_desc_rel(item1, item2):
    title1 = set(normalize(item1[COL_DESCRIPTION]))
    title2 = set(normalize(item2[COL_DESCRIPTION]))
    m = min(len(title1), len(title2))
    if m == 0:
        return 0
    
    return len(title1.intersection(title2)) / m

In [20]:
def number_of_same_images(item1, item2):
    imgs1 = item1[COL_IMAGES].split(',')
    imgs2 = item2[COL_IMAGES].split(',')
    imgs1 = set(filter(bool, [x.strip() for x in imgs1]))
    imgs2 = set(filter(bool, [x.strip() for x in imgs2]))
    
    count = 0
    for img1 in imgs1:
        for img2 in imgs2:
            try:
                if map_hashes[img1+'.jpg'] == map_hashes[img2+'.jpg']:
                    count += 1
                    break
            except KeyError as e:
                print(e)
                
    minlen = min(len(imgs1), len(imgs2))
    
    if minlen == 0:
        return (count, 0)
    
    return (count, count / minlen)

In [10]:
import math

def same_category(item1, item2):
    cat1 = item1[COL_CATEGORY_ID]
    cat2 = item2[COL_CATEGORY_ID]
    
    return 1 if cat1 == cat2 else 0

def same_parent_category(item1, item2):
    cat1 = item1[COL_CATEGORY_ID]
    cat2 = item2[COL_CATEGORY_ID]
    
    par_cat1 = map_parent_category[cat1]
    par_cat2 = map_parent_category[cat2]
    
    return 1 if par_cat1 == par_cat2 else 0

def same_price(item1, item2):
    price1 = item1[COL_PRICE]
    price2 = item2[COL_PRICE]
    
    return 1 if price1 == price2 else 0

def safe_parse(str):
    try:
        return float(item1[COL_PRICE].replace(',', ''))
    except:
        return 0.0
    
def price_diff(item1, item2):
    price1 = safe_parse(item1[COL_PRICE])
    price2 = safe_parse(item2[COL_PRICE])
    
    diff = math.fabs(price1 - price2)
    avg_price = min(price1, price2)
    
    if avg_price == 0:
        return (diff, 0)
    
    return (diff, diff / avg_price)

def same_lat(item1, item2):
    lat1 = item1[COL_LAT]
    lat2 = item2[COL_LAT]
    
    return 1 if lat1 == lat2 else 0

def same_lon(item1, item2):
    lon1 = item1[COL_LON]
    lon2 = item2[COL_LON]
    
    return 1 if lon1 == lon2 else 0

def same_location(item1, item2):
    location1 = item1[COL_LOCATION]
    location2 = item2[COL_LOCATION]
    
    return 1 if location1 == location2 else 0

def distance_between_coordinates(item1, item2):
    lat1 = float(item1[COL_LAT])
    lat2 = float(item2[COL_LAT])
    lon1 = float(item1[COL_LON])
    lon2 = float(item2[COL_LON])
    
    point1 = (lon1, lat1)
    point2 = (lon2, lat2)
    
    from geopy.distance import vincenty
    return vincenty(point1, point2).miles

def same_region(item1, item2):
    location1 = item1[COL_LOCATION]
    location2 = item2[COL_LOCATION]
    
    region1 = map_regions[location1]
    region2 = map_regions[location2]
    
    return 1 if region1 == region2 else 0

def same_metro(item1, item2):
    metro1 = item1[COL_METRO]
    metro2 = item2[COL_METRO]
    
    return 1 if metro1 == metro2 else 0

def get_features(item1, item2, label):
    fx = []
    fx.append(same_category(item1, item2))
    fx.append(same_parent_category(item1, item2))
    
    (price_diff_abs, price_diff_rel) = price_diff(item1, item2)
    fx.append(price_diff_abs)
    fx.append(price_diff_rel)
    
    fx.append(same_lat(item1, item2))
    fx.append(same_lon(item1, item2))
    fx.append(same_location(item1, item2))
    fx.append(distance_between_coordinates(item1, item2))
    
    fx.append(same_region(item1, item2))
    fx.append(same_metro(item1, item2))
    
#     fx.append(same_title(item1, item2))
    fx.append(number_of_same_words_title_rel(item1, item2))

#     fx.append(same_description(item1, item2))
    fx.append(number_of_same_words_desc_rel(item1, item2))
    
    (img_sim_abs, img_sim_rel) = number_of_same_images(item1, item2)
    fx.append(img_sim_abs)
    fx.append(img_sim_rel)
    
    return (fx, label)

In [14]:
item1 = map_items[pairs[0][0]]
item2 = map_items[pairs[0][1]]

%time get_features(item1, item2, 1)

CPU times: user 20.4 ms, sys: 8.03 ms, total: 28.4 ms
Wall time: 325 ms


([1, 1, 0.0, 0.0, 1, 1, 1, 0.0, 1, 1, 1.0, 1.0, 4, 1.0], 1)

In [15]:
def get_train_data(items):
    data = []
    for pair in items:
        item1 = map_items[pair[0]]
        item2 = map_items[pair[1]]
        label = int(pair[2])

        xy = get_features(item1, item2, label)
        data.append(xy)
        
    return data

train = get_train_data(pairs)
print(train[0])

([1, 1, 0.0, 0.0, 1, 1, 1, 0.0, 1, 1, 1.0, 1.0, 4, 1.0], 1)


In [None]:
# from threading import Thread
# datas = {}
# def get_train_data_worker(i, n):
#     data = []
#     start = len(pairs) / n * i
#     end = len(pairs) / n * (i+1)
#     end = min(end, len(pairs))
#     job = pairs[start:end]
#     for pair in job:
#         item1 = map_items[pair[0]]
#         item2 = map_items[pair[1]]
#         label = int(pair[2])

#         xy = get_features(item1, item2, label)
#         data.append(xy)
#     datas[i] = data
    
# def get_train_data_async():
#     parts = 4
    
#     threads = []
#     for i in range(parts):
#         thread=Thread(target=get_train_data_worker, args=(i+1, parts))
#         threads.append(thread)
#         thread.start()
    
#     for thread in threads:
#         thread.join()
    
#     data = []
#     for key in datas:
#         data = data + datas[key]
        
#     return data

# train = get_train_data_async()
# print(train[0])

In [16]:
from sklearn.cross_validation import train_test_split
train_data, test_data = train_test_split(train, test_size=0.20, train_size=0.80)

train_x = [x[0] for x in train_data]
train_y = [x[1] for x in train_data]

test_x = [x[0] for x in test_data]
test_y = [x[1] for x in test_data]

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

classifier = RandomForestClassifier(
    n_estimators=50,
    max_depth=10
)
# classifier = SVC()
classifier.fit(train_x, train_y)

print(classifier.score(test_x, test_y))
# print(classifier.feature_importances_)

0.78925


In [18]:
from sklearn.externals import joblib

joblib.dump(classifier, 'model/rfc.pkl') 

['rfc.pkl',
 'rfc.pkl_01.npy',
 'rfc.pkl_02.npy',
 'rfc.pkl_03.npy',
 'rfc.pkl_04.npy',
 'rfc.pkl_05.npy',
 'rfc.pkl_06.npy',
 'rfc.pkl_07.npy',
 'rfc.pkl_08.npy',
 'rfc.pkl_09.npy',
 'rfc.pkl_10.npy',
 'rfc.pkl_11.npy',
 'rfc.pkl_12.npy',
 'rfc.pkl_13.npy',
 'rfc.pkl_14.npy',
 'rfc.pkl_15.npy',
 'rfc.pkl_16.npy',
 'rfc.pkl_17.npy',
 'rfc.pkl_18.npy',
 'rfc.pkl_19.npy',
 'rfc.pkl_20.npy',
 'rfc.pkl_21.npy',
 'rfc.pkl_22.npy',
 'rfc.pkl_23.npy',
 'rfc.pkl_24.npy',
 'rfc.pkl_25.npy',
 'rfc.pkl_26.npy',
 'rfc.pkl_27.npy',
 'rfc.pkl_28.npy',
 'rfc.pkl_29.npy',
 'rfc.pkl_30.npy',
 'rfc.pkl_31.npy',
 'rfc.pkl_32.npy',
 'rfc.pkl_33.npy',
 'rfc.pkl_34.npy',
 'rfc.pkl_35.npy',
 'rfc.pkl_36.npy',
 'rfc.pkl_37.npy',
 'rfc.pkl_38.npy',
 'rfc.pkl_39.npy',
 'rfc.pkl_40.npy',
 'rfc.pkl_41.npy',
 'rfc.pkl_42.npy',
 'rfc.pkl_43.npy',
 'rfc.pkl_44.npy',
 'rfc.pkl_45.npy',
 'rfc.pkl_46.npy',
 'rfc.pkl_47.npy',
 'rfc.pkl_48.npy',
 'rfc.pkl_49.npy',
 'rfc.pkl_50.npy',
 'rfc.pkl_51.npy',
 'rfc.pkl_52.npy',


In [11]:
from sklearn.externals import joblib

classifier = joblib.load('model/rfc.pkl') 

# TEST data

In [12]:
test_pairs = read_csv('data/ItemPairs_test.csv', hasHeader=True)
print('Train pairs', len(test_pairs))
print('Example pair', test_pairs[0])

Train pairs 1044196
Example pair ['0', '5', '4670875']


In [18]:
test_items = read_csv('data/ItemInfo_test.csv', hasHeader=True)
print('Test items', len(test_items))
print('Item example', test_items[0])

map_test_items = dict()
for item in test_items:
    map_test_items[item[COL_ITEM_ID]] = item

Test items 1315205
Item example ['5', '115', 'Сотрудничество салонам кухонной мебели', 'Сотрудничество салонам кухонной мебели.\nТребуются заказы на ремонт и отделку помещений кухни для установки ВАШЕЙ кухонной мебели.\nДополнительные вопросы по телефону.', '', '{"Вид услуги":"Другое"}', '', '637640', '500292.0', '55.760211', '37.577211']


In [21]:
def get_test_data(items):
    data = []
    for pair in items:
        item1 = map_test_items[pair[1]]
        item2 = map_test_items[pair[2]]

        xy = get_features(item1, item2, 0)
        data.append(xy)
        
    return data

test = get_test_data(test_pairs)
print(test[0])

test_f = [x[0] for x in test]

test_l = classifier.predict_proba(test_f)


'13717141.jpg'
'12961761.jpg'
'12961761.jpg'
'12961761.jpg'
'12961761.jpg'
'9943584.jpg'
'12953041.jpg'
'4515613.jpg'
'13717141.jpg'
([1, 1, 0.0, 0, 1, 1, 1, 0.0, 1, 1, 0.25, 0.2, 0, 0], 0)


In [22]:
print(test_l[0])

[ 0.93811403  0.06188597]


In [24]:
print(classifier.classes_)

[0 1]


In [27]:
result = [(x[0], y[1]) for x,y in zip(test_pairs, test_l)]

In [28]:
print(result[0])

('0', 0.061885970146551825)


In [29]:
import csv
def write_csv(filename, header, data):
    with open(filename, 'w+') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"')
        if header is not None:
            writer.writerow(header)
            
        for row in data:
            writer.writerow(row)

In [30]:
write_csv('submission.csv', ['id', 'probability'], result)