In [6]:
import csv
def read_csv(filename, hasHeader=False, encoding='utf-8'):
    """Read a comma-separated file into a list of rows.

    Args:
        filename: Path of the CSV file to read.
        hasHeader: When true, the first row is consumed and not returned.
        encoding: Text encoding used to decode the file. Stated explicitly
            so decoding does not depend on the platform's locale default.

    Returns:
        A list containing one list of string fields per remaining row.
    """
    # The csv module expects files opened with newline='' so that line breaks
    # embedded in quoted fields reach the parser unmodified.
    with open(filename, newline='', encoding=encoding) as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        if hasHeader:
            # The None default keeps an empty file from raising StopIteration.
            next(reader, None)
        return list(reader)

def write_csv(filename, header, data, encoding='utf-8'):
    """Write rows to a comma-separated file, replacing any existing content.

    Args:
        filename: Path of the file to create or overwrite.
        header: Sequence of column names written as the first row, or None
            to write no header row.
        data: Iterable of rows; each row is a sequence of field values.
        encoding: Text encoding used for the output file.
    """
    # newline='' lets the csv writer control line endings itself; without it
    # platforms that translate '\n' end up with '\r\r\n' row terminators.
    # Plain 'w' is enough because the file is never read back here.
    with open(filename, 'w', newline='', encoding=encoding) as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"')
        if header is not None:
            writer.writerow(header)
        writer.writerows(data)

In [7]:
# Zero-based field positions used to index a single item row.
(
    COL_ITEM_ID,
    COL_CATEGORY_ID,
    COL_TITLE,
    COL_DESCRIPTION,
    COL_IMAGES,
    COL_JSON,
    COL_PRICE,
    COL_LOCATION,
    COL_METRO,
    COL_LAT,
    COL_LON,
) = range(11)

def load_items_map(filename='data/ItemInfo_train.csv'):
    """Load item rows and index them by item id.

    Args:
        filename: CSV file to read; its first row is treated as a header
            and skipped.

    Returns:
        A dict mapping each row's COL_ITEM_ID field (a string) to the full
        row. If an id occurs more than once, the last row wins.
    """
    items = read_csv(filename, hasHeader=True)
    print('Train items', len(items))
    # Only show a sample when there is one; an empty file used to raise
    # IndexError here.
    if items:
        print('Item example', items[0])
    return {item[COL_ITEM_ID]: item for item in items}

In [8]:
def get_pairs(filename='data/ItemPairs_train.csv'):
    """Load the item-pair rows.

    Args:
        filename: CSV file to read; its first row is treated as a header
            and skipped.

    Returns:
        A list of rows, each a list of string fields.
    """
    pairs = read_csv(filename, hasHeader=True)
    print('Train pairs', len(pairs))
    # Only show a sample when there is one; an empty file used to raise
    # IndexError here.
    if pairs:
        print('Example pair', pairs[0])
    return pairs

In [10]:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer


# Single Porter stemmer instance shared by stem_tokens().
stemmer = nltk.stem.porter.PorterStemmer()
# Translation table for str.translate(): each character of string.punctuation
# is keyed by its code point and mapped to None, i.e. deleted.
remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))

def stem_tokens(tokens):
    """Return a list with the stemmed form of each token, in input order."""
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed

def normalize(text):
    """Lowercase ``text``, delete punctuation, tokenize, and stem.

    Returns the list of stemmed tokens.
    """
    cleaned = text.lower().translate(remove_punctuation_map)
    tokens = nltk.word_tokenize(cleaned)
    return stem_tokens(tokens)

# Shared vectorizer; cosine_sim() refits it on every pair of texts.
vectorizer = TfidfVectorizer(tokenizer=normalize)

def cosine_sim(text1, text2):
    """Return the TF-IDF cosine similarity of two strings.

    The vectorizer is fitted on just these two texts. When it cannot build
    a vocabulary from them (for example both are empty or contain only
    punctuation), the result falls back to 1.0 for identical strings and
    0.0 otherwise.
    """
    try:
        tfidf = vectorizer.fit_transform([text1, text2])
    except ValueError:
        # Raised by the vectorizer when no usable terms are found. Catching
        # only this keeps Ctrl-C and genuine failures (missing tokenizer
        # data, API changes) visible instead of turning them into 0/1 scores.
        return 1.0 if text1 == text2 else 0.0
    # TfidfVectorizer L2-normalizes rows by default, so the off-diagonal entry
    # of the Gram matrix is the cosine. `@` is matrix multiplication for both
    # sparse matrices and sparse arrays, and toarray() replaces the `.A`
    # shorthand that newer SciPy releases no longer provide.
    return (tfidf @ tfidf.T).toarray()[0, 1]

In [12]:
# Build the item-id -> item-row lookup and read the list of training pairs.
# Both loaders print a row count and a sample row as they run.
items_map = load_items_map()
pairs = get_pairs()

Train items 3344613
Item example ['1', '81', 'Продам Камаз 6520', 'Продам Камаз 6520 20 тонн', '1064094, 5252822, 6645873, 6960145, 9230265', '{"Вид техники":"Грузовики"}', '300000.0', '648140', '', '64.686946', '30.815924']
Train pairs 2991396
Example pair ['1', '4112648', '1', '1']


In [14]:
# Score every pair by the TF-IDF cosine similarity of the two item titles and
# write (first item id, second item id, similarity) rows to titles_sim.csv
# with no header row.
result = []
for pair in pairs:
    # The first two fields of a pair row are looked up as keys of items_map.
    item1 = items_map[pair[0]]
    item2 = items_map[pair[1]]
    
    sim = cosine_sim(item1[COL_TITLE], item2[COL_TITLE])
    
    result.append((item1[COL_ITEM_ID], item2[COL_ITEM_ID], sim))
write_csv('titles_sim.csv', None, result)

In [15]:
# Score every pair by the TF-IDF cosine similarity of the two item
# descriptions and write (first item id, second item id, similarity) rows to
# description_sim.csv with no header row.
result = []
for pair in pairs:
    item1 = items_map[pair[0]]
    # The second item must be bound here on every iteration; previously this
    # line overwrote `item1`, so the comparison used whatever `item2` an
    # earlier cell had left behind in the kernel.
    item2 = items_map[pair[1]]

    sim = cosine_sim(item1[COL_DESCRIPTION], item2[COL_DESCRIPTION])

    result.append((item1[COL_ITEM_ID], item2[COL_ITEM_ID], sim))
write_csv('description_sim.csv', None, result)