In [5]:
import pandas as pd
import  random
from tqdm.notebook import tqdm
from functools import reduce
from gensim.models import Word2Vec
from scipy import spatial

In [2]:
class DataFrameDesc:
    """Plain settings holder describing how a delimited file is read in chunks.

    ``filepath_or_buffer``, ``sep``, ``usecols``, ``names`` and ``chunksize``
    are forwarded under the same names to ``pandas.read_csv`` by
    ``DataFrameIterable``; ``total`` is printed there and divided by
    ``chunksize`` to size the progress bar.
    """

    def __init__(self, filepath_or_buffer, sep, usecols, names, chunksize, total):
        # Source location and delimiter.
        self.filepath_or_buffer, self.sep = filepath_or_buffer, sep
        # Column selection and the names given to the selected columns.
        self.usecols, self.names = usecols, names
        # Chunk size for reading and the total used for progress reporting.
        self.chunksize, self.total = chunksize, total


class DataFrameIterable:
    """Restartable iterable over the rows of a delimited file read in chunks.

    Every call to ``__iter__`` starts a new ``pandas.read_csv`` over the
    described source, so one instance can be iterated several times when the
    source is a path.
    """

    def __init__(self, descriptor):
        """Keep the descriptor; nothing is read until iteration starts.

        Args:
            descriptor: object exposing ``filepath_or_buffer``, ``sep``,
                ``usecols``, ``names``, ``chunksize`` and ``total``
                (for example a ``DataFrameDesc``).
        """
        self.descriptor = descriptor

    def __iter__(self):
        """Yield one tuple per row, with values ordered as in ``descriptor.names``.

        Prints a short header, then reads the source chunk by chunk behind a
        tqdm progress bar. Missing values are replaced with ``''`` before the
        rows are yielded.

        Yields:
            tuple: the row's values, one per entry of ``descriptor.names``.
        """
        desc = self.descriptor
        print(f'Reading "{desc.filepath_or_buffer}"')
        print(f'Total: {desc.total}')
        print(f'Chunk size: {desc.chunksize}')

        # Round up so a trailing partial chunk is counted by the progress bar
        # (floor division reported one chunk too few in that case).
        expected_chunks = -(-desc.total // desc.chunksize)

        chunks = pd.read_csv(filepath_or_buffer=desc.filepath_or_buffer,
                             sep=desc.sep,
                             usecols=desc.usecols,
                             names=desc.names,
                             chunksize=desc.chunksize)
        for df in tqdm(iterable=chunks, total=expected_chunks):
            df = df.fillna('')
            # itertuples avoids building a Series per row (as iterrows does) and
            # keeps each column's own dtype; selecting by `names` fixes the order
            # of the values in every yielded tuple.
            yield from df[list(desc.names)].itertuples(index=False, name=None)


class CompoundDataFrameIterable:
    """Chains several iterables into one, in the order they were given.

    Each call to ``__iter__`` walks the stored iterables again from the first
    one, so the result is as restartable as the iterables it wraps.
    """

    def __init__(self, iterables):
        self.iterables = iterables

    def __iter__(self):
        for source in self.iterables:
            # Exhaust the current source completely before moving on to the next.
            for record in source:
                yield record



class CorpusIterable:
    """Turns records of strings into flat token lists, one list per record.

    For every record produced by ``string_list_iter`` the first element is
    skipped; each remaining element is split on whitespace and the resulting
    tokens are concatenated in order.
    """

    def __init__(self, string_list_iter):
        self.string_list_iter = string_list_iter

    def __iter__(self):
        for record in self.string_list_iter:
            # record[0] is not tokenised; only the fields after it contribute tokens.
            yield [token for field in record[1:] for token in field.split()]

In [3]:
# Number of rows per chunk, shared by both descriptors.
chunksize = 100_000

# Both sources are tab-separated; columns 0-2 are read and named '_', 'q', 'r'.
train_df_desc = DataFrameDesc(
    filepath_or_buffer='clicks_10M',
    sep='\t',
    usecols=[0, 1, 2],
    names=['_', 'q', 'r'],
    chunksize=chunksize,
    total=10_000_000,
)
test_df_desc = DataFrameDesc(
    filepath_or_buffer='clicks_test_',
    sep='\t',
    usecols=[0, 1, 2],
    names=['_', 'q', 'r'],
    chunksize=chunksize,
    total=500_000,
)

In [4]:
# One restartable corpus: train rows followed by test rows, tokenised per record.
# Every pass over it re-reads both files from the beginning.
corpus = CorpusIterable(
    CompoundDataFrameIterable([
        DataFrameIterable(train_df_desc),
        DataFrameIterable(test_df_desc),
    ])
)

model = Word2Vec(min_count=10, sample=1e-3)
# Vocabulary pass over the corpus.
model.build_vocab(corpus_iterable=corpus)
# Training passes over the same corpus; as the cell's last expression,
# the value returned by train() is what the notebook displays.
model.train(corpus_iterable=corpus, total_examples=model.corpus_count, epochs=3)

Reading "clicks_10M"
Total: 10000000
Chunk size: 100000
Reading "clicks_test_"
Total: 500000
Chunk size: 100000
Reading "clicks_10M"
Total: 10000000
Chunk size: 100000
Reading "clicks_test_"
Total: 500000
Chunk size: 100000
Reading "clicks_10M"
Total: 10000000
Chunk size: 100000
Reading "clicks_test_"
Total: 500000
Chunk size: 100000
Reading "clicks_10M"
Total: 10000000
Chunk size: 100000
Reading "clicks_test_"
Total: 500000
Chunk size: 100000


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

(390192994, 417966930)

In [6]:
# Score every test row: average the word vectors of the known words in the
# query (field 1) and the title (field 2), then map their cosine distance d
# to 1 - d / 2. Rows where either side has no known word get a random score.
scores = []
known_words = model.wv.key_to_index
for record in DataFrameIterable(test_df_desc):
    query_words = [word for word in record[1].split() if word in known_words]
    title_words = [word for word in record[2].split() if word in known_words]
    if query_words and title_words:
        query_vec = sum(model.wv[word] for word in query_words) / len(query_words)
        title_vec = sum(model.wv[word] for word in title_words) / len(title_words)
        scores.append(1 - spatial.distance.cosine(query_vec, title_vec) / 2.0)
    else:
        # Nothing to compare on at least one side: fall back to a uniform draw from [0, 1].
        scores.append(random.uniform(0, 1))

Reading "clicks_test_"
Total: 500000
Chunk size: 100000


  0%|          | 0/5 [00:00<?, ?it/s]

In [7]:
# Write the results file: a header line, then one "<position>,<score>" line
# per entry of `scores`, with the score formatted to six decimal places.
with open('res_1.csv', 'w') as out:
    out.write('Id,Predicted\n')
    out.writelines(f'{row_id},{score:0.6f}\n' for row_id, score in enumerate(scores))