In [1]:
import emoji
import pandas as pd
import re
from tqdm import tqdm_notebook as tqdm
import fastText
import numpy as np
import joblib

In [2]:
# Load the raw training set; the cells below read its `title` and
# `description` columns.
train = pd.read_csv('../data/unzipped/train.csv')

In [3]:
def remove_emoji(src_str):
    """Return ``src_str`` with every character found in ``emoji.UNICODE_EMOJI`` dropped.

    The check is done character by character, so only single-character
    members of ``emoji.UNICODE_EMOJI`` are filtered out.
    """
    emoji_table = emoji.UNICODE_EMOJI
    kept_chars = [char for char in src_str if char not in emoji_table]
    return ''.join(kept_chars)

In [4]:
def clean_text(text):
    """Normalize raw ad text before it is fed to the embedding model.

    The substitutions are applied in order, each one on the result of the
    previous step:

    1. lenticular brackets ``【】`` become a space;
    2. double quotes and commas are deleted;
    3. round brackets (half- and full-width) become a space;
    4. square brackets (half- and full-width) become a space;
    5. ``@``/``＠`` mentions (the sign plus the following word characters)
       are deleted;
    6. ``http://`` / ``https://`` URLs are deleted together with the single
       space/CR/LF that terminates them, or up to the end of the text when
       nothing follows;
    7. ideographic spaces (U+3000) and tabs become a plain space;
    8. CR and LF become a plain space;
    9. every ASCII digit is replaced by ``0``.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The normalized text.
    """
    substitutions = (
        (r'[【】]', ' '),
        (r'[",]', ''),
        (r'[（）()]', ' '),
        (r'[［］\[\]]', ' '),
        # One or more word characters after the at sign.
        (r'[@＠]\w+', ''),
        # `|$` lets a URL that ends the text match as well.
        (r'https?://.*?(?:[\r\n ]|$)', ''),
        # Either character on its own, not "ideographic space then tab".
        (r'[　\t]', ' '),
        (r'[\r\n]', ' '),
        (r'[0-9]', '0'),
    )
    replaced_text = text
    for pattern, replacement in substitutions:
        replaced_text = re.sub(pattern, replacement, replaced_text)
    return replaced_text

# train description

In [5]:
# Fill missing titles/descriptions with a placeholder word, then normalize the
# text and strip emoji. The columns are assigned back explicitly instead of
# calling `fillna(..., inplace=True)` on a column selection, because that
# form does not update the frame when pandas Copy-on-Write is active.
for column in ('title', 'description'):
    train[column] = (
        train[column]
        .fillna('неизвестный')
        .apply(clean_text)
        .apply(remove_emoji)
    )

In [6]:
# Load the pretrained fastText binary model used for every word- and
# sentence-vector lookup below (presumably Russian Wikipedia vectors, judging
# by the file name; confirm against the data source).
model = fastText.load_model('../data/pretrained_model/wiki.ru.bin')

In [7]:
# Build one matrix of fastText word vectors (one row per whitespace-separated
# token) for every training description.
embedding_dim = model.get_dimension()
description_vec = []
for description in tqdm(train.description, desc='train description'):
    word_vectors = [model.get_word_vector(word) for word in description.split()]
    if word_vectors:
        # Stack once per text, after all tokens have been looked up.
        vector = np.array(word_vectors)
    else:
        # A text with no tokens gets an empty (0, dim) matrix instead of
        # silently re-using the previous row's vectors.
        # NOTE(review): float32 assumed to match fastText's vectors; confirm.
        vector = np.zeros((0, embedding_dim), dtype=np.float32)
    description_vec.append(vector)




In [8]:
# Persist the per-description word-vector arrays of the training set.
# The call is the cell's last expression, so its return value (the list of
# written file names) is echoed as the cell output.
joblib.dump(description_vec, '../data/features/train_description_vec.gz', compress=3)

['../data/features/train_description_vec.gz']

# test description

In [9]:
# Load the test set and give it the same treatment as the training text:
# placeholder for missing values, text normalization, emoji removal.
# The columns are assigned back explicitly instead of calling
# `fillna(..., inplace=True)` on a column selection, because that form does
# not update the frame when pandas Copy-on-Write is active.
test = pd.read_csv('../data/unzipped/test.csv')
for column in ('title', 'description'):
    test[column] = (
        test[column]
        .fillna('неизвестный')
        .apply(clean_text)
        .apply(remove_emoji)
    )

In [10]:
# Build one matrix of fastText word vectors (one row per whitespace-separated
# token) for every test description.
embedding_dim = model.get_dimension()
test_description_vec = []
for description in tqdm(test.description, desc='test description'):
    word_vectors = [model.get_word_vector(word) for word in description.split()]
    if word_vectors:
        # Stack once per text, after all tokens have been looked up.
        vector = np.array(word_vectors)
    else:
        # A text with no tokens gets an empty (0, dim) matrix instead of
        # silently re-using the previous row's vectors.
        # NOTE(review): float32 assumed to match fastText's vectors; confirm.
        vector = np.zeros((0, embedding_dim), dtype=np.float32)
    test_description_vec.append(vector)




In [11]:
# Persist the per-description word-vector arrays of the test set.
# The call is the cell's last expression, so its return value (the list of
# written file names) is echoed as the cell output.
joblib.dump(test_description_vec, '../data/features/test_description_vec.gz', compress=3)

['../data/features/test_description_vec.gz']

# title feature

In [12]:
# Build one matrix of fastText word vectors (one row per whitespace-separated
# token) for every test title.
embedding_dim = model.get_dimension()
test_title_vec = []
for title in tqdm(test.title, desc='test title'):
    word_vectors = [model.get_word_vector(word) for word in title.split()]
    if word_vectors:
        # Stack once per text, after all tokens have been looked up.
        vector = np.array(word_vectors)
    else:
        # A text with no tokens gets an empty (0, dim) matrix instead of
        # silently re-using the previous row's vectors.
        # NOTE(review): float32 assumed to match fastText's vectors; confirm.
        vector = np.zeros((0, embedding_dim), dtype=np.float32)
    test_title_vec.append(vector)




In [13]:
# Build one matrix of fastText word vectors (one row per whitespace-separated
# token) for every training title.
embedding_dim = model.get_dimension()
train_title_vec = []
for title in tqdm(train.title, desc='train title'):
    word_vectors = [model.get_word_vector(word) for word in title.split()]
    if word_vectors:
        # Stack once per text, after all tokens have been looked up.
        vector = np.array(word_vectors)
    else:
        # A text with no tokens gets an empty (0, dim) matrix instead of
        # silently re-using the previous row's vectors.
        # NOTE(review): float32 assumed to match fastText's vectors; confirm.
        vector = np.zeros((0, embedding_dim), dtype=np.float32)
    train_title_vec.append(vector)




In [14]:
# Persist the per-title word-vector arrays for the test and training sets.
# Only the last call's return value (the list of written file names) is
# echoed as the cell output.
joblib.dump(test_title_vec, '../data/features/test_title_vec.gz', compress=3)
joblib.dump(train_title_vec, '../data/features/train_title_vec.gz', compress=3)

['../data/features/train_title_vec.gz']

In [15]:
# Preview the first rows of the training frame after text cleaning
# (digits in `title`/`description` now appear as 0).
train.head()

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,activation_date,user_type,image,image_top_1,deal_probability
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,Кокоби кокон для сна,Кокон для сна малышапользовались меньше месяца...,400.0,2,2017-03-28,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,Стойка для Одежды,Стойка для одежды под вешалки. С бутика.,3000.0,19,2017-03-26,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,Philips bluray,В хорошем состоянии домашний кинотеатр с blu r...,4000.0,9,2017-03-20,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Автокресло,Продам кресло от0-00кг,2200.0,286,2017-03-25,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110.0,ВАЗ 0000 0000,Все вопросы по телефону.,40000.0,3,2017-03-16,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797


# Take the average vector of each text (文章の平均ベクトルを取る)

In [16]:
# One fastText sentence vector per training description.
# Iterating the Series directly (instead of the removed `iteritems`) also
# gives tqdm a length, so the progress bar shows a total.
description_vec = [
    model.get_sentence_vector(description)
    for description in tqdm(train.description, desc='train description (sentence)')
]




In [17]:
# One fastText sentence vector per training title.
# Iterating the Series directly (instead of the removed `iteritems`) also
# gives tqdm a length, so the progress bar shows a total.
title_vec = [
    model.get_sentence_vector(title)
    for title in tqdm(train.title, desc='train title (sentence)')
]




In [18]:
# Persist the sentence vectors computed from the training titles and
# descriptions. Only the last call's return value (the list of written file
# names) is echoed as the cell output.
joblib.dump(title_vec, '../data/features/train_title_sentence_vec.gz', compress=3)
joblib.dump(description_vec, '../data/features/train_description_sentence_vec.gz', compress=3)

['../data/features/train_description_sentence_vec.gz']

In [19]:
# Reload the test set and clean its own text columns.
# The cleaning must read from `test`, not `train`: assigning a train column
# would align on the index and replace the test text with training text.
# Titles get the same cleaning as the training titles so that the train and
# test sentence vectors come from identically prepared text.
test = pd.read_csv('../data/unzipped/test.csv')
for column in ('title', 'description'):
    test[column] = (
        test[column]
        .fillna('неизвестный')
        .apply(clean_text)
        .apply(remove_emoji)
    )

In [20]:
# One fastText sentence vector per test description.
# Iterating the Series directly (instead of the removed `iteritems`) also
# gives tqdm a length, so the progress bar shows a total.
description_vec = [
    model.get_sentence_vector(description)
    for description in tqdm(test.description, desc='test description (sentence)')
]




In [21]:
# One fastText sentence vector per test title.
# Iterating the Series directly (instead of the removed `iteritems`) also
# gives tqdm a length, so the progress bar shows a total.
title_vec = [
    model.get_sentence_vector(title)
    for title in tqdm(test.title, desc='test title (sentence)')
]




In [22]:
# Persist the sentence vectors computed from the test titles and
# descriptions. Only the last call's return value (the list of written file
# names) is echoed as the cell output.
joblib.dump(title_vec, '../data/features/test_title_sentence_vec.gz', compress=3)
joblib.dump(description_vec, '../data/features/test_description_sentence_vec.gz', compress=3)

['../data/features/test_description_sentence_vec.gz']