In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
%matplotlib inline

In [2]:
config = {
    "database": "postgres",
    "user": "postgres",
    "password": "test-task",
    "host": "52.15.186.119",
    "port": 5431
}

In [3]:
import psycopg2

con = psycopg2.connect(
  database="postgres", 
  user="postgres", 
  password="test-task", 
  host="ec2-52-15-186-119.us-east-2.compute.amazonaws.com", 
  port="8213"
)

In [4]:
cursor = con.cursor()

In [5]:
cursor.execute("""SELECT table_name FROM information_schema.tables
       WHERE table_schema = 'public'""")
for table in cursor.fetchall():
    print(table)

('offers',)
('pairs',)


In [6]:
import pandas as pd
import pandas.io.sql as sqlio

In [7]:
sql = 'select * from offers'
df_offers = sqlio.read_sql_query(sql, con)

In [8]:
import spacy
nlp = spacy.load('../spacy-ru/ru2/', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

In [9]:
def cleaning(doc):
    txt = [token.lemma_ for token in doc if not token.is_stop]
    if len(txt) > 0:
        return ' '.join(txt)

In [10]:
brief_cleaning = [re.sub("[^A-Za-zА-Яа-я']+", ' ', str(row)) for row in df_offers['description']]

In [11]:
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=4)]

In [12]:
len(txt)

19266

In [13]:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(15062, 1)

In [14]:
import multiprocessing

from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [15]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [16]:
from gensim.models.phrases import Phrases, Phraser
sent = [row.split() for row in df_clean['clean']]
phrases = Phrases(sent, min_count=30, progress_per=10000)
bigram = Phraser(phrases)
sentences = bigram[sent]

In [17]:
from collections import defaultdict
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

34801

In [18]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['дом',
 'квартира',
 'комната',
 'метр',
 'кв_метр',
 'два',
 'этаж',
 'кухня',
 'новый',
 'рядом']

In [19]:
w2v_model = Word2Vec(min_count=10,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [20]:
from time import time
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.09 mins


In [21]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 2.02 mins


In [22]:
w2v_model.init_sims(replace=True)

In [23]:
w2v_model.wv.most_similar(positive=["площадь"])

[('общий_площадь', 0.5675261616706848),
 ('составлять', 0.5435560941696167),
 ('кв_метр', 0.5107544660568237),
 ('этажей', 0.5035864114761353),
 ('площадь_кв', 0.46922940015792847),
 ('санузлов', 0.46279478073120117),
 ('параметры', 0.45745810866355896),
 ('общ_пл', 0.45715653896331787),
 ('квадратный', 0.4557541608810425),
 ('пл', 0.42672351002693176)]

In [25]:
w2v_model.wv.save_word2vec_format('../models/wv300cian.model')