In [1]:
from __future__ import division, print_function
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

from gensim.models import word2vec

In [2]:
class mean_vectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = len(next(iter(w2v.values())))
        
    def fit(self, X):
        return self
    
    def transform(self, X):
        return np.array([
            np.mean([self.word2vec.get(w, np.zeros(self.dim)) for w in words], axis=0)
            for words in X
        ])

In [3]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input
from keras.preprocessing.text import Tokenizer
from keras import regularizers

Using TensorFlow backend.


In [6]:
train_df = pd.read_csv('rating/train_content.csv', parse_dates=['date'])
test_df = pd.read_csv('rating/test_content.csv', parse_dates=['date'])

In [7]:
from sklearn.metrics import mean_squared_error
import re
import pymorphy2
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/yulits/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
morph = pymorphy2.MorphAnalyzer()
stops = set(stopwords.words('english')) | set(stopwords.words('russian'))

def review_to_wordlist(review):
    review_text = re.sub("^A-Za-zА-Яа-я", " ", review)
    words = review_text.lower().split()
    words = [w for w in words if not w in stops]
    words = [morph.parse(w)[0].normal_form for w in words]
    return words

In [9]:
train_df['year'] = train_df['date'].apply(lambda x: x.year)
train_df['month'] = train_df['date'].apply(lambda x: x.month)

In [10]:
X_train = train_df[train_df['year'] == 2015]
X_val = train_df[(train_df['year'] == 2016) & (train_df['month'] <= 4)]
y_train = X_train['favs_lognorm']
y_val = X_val['favs_lognorm']
X_train.shape, X_val.shape, y_train.mean(), y_val.mean()

((23425, 15), (7556, 15), 3.4046228249071526, 3.304679829935242)

In [11]:
data = pd.concat([X_train, X_val], axis=0,  ignore_index=True)
data['content_clear'] = data['content'].apply(str)

In [242]:
%%time
data['content_clear'] = data['content_clear'].apply(review_to_wordlist)

CPU times: user 45min 27s, sys: 1.55 s, total: 45min 29s
Wall time: 45min 29s


In [243]:
import pickle
with open('content_clear.pickle', 'wb') as f:
    pickle.dump(data, f)

In [12]:
import pickle
with open('content_clear.pickle', 'rb') as f:
    data = 

data['content'][0]
pickle.load(f)

In [16]:
data['content_clear'][0]

['<img',
 'src="https://habrastorage.org/files/50e/211/9a0/50e2119a0508439ba77d8eb2dc2234ce.jpg"',
 'alt="pricing"',
 'align="left">большинство',
 'предприниматель',
 'сосредотачиваться',
 'создание',
 'инновационный',
 'продукта,',
 'забывают,',
 'элегантный',
 'решение',
 'автоматически',
 'превращаться',
 'успешный',
 'бизнес.',
 'компания',
 'требоваться',
 'столь',
 'элегантный',
 'бизнес-модель,',
 'правильный',
 'ценовый',
 'политикой,',
 'канал',
 'сбыт',
 'взаимоотношение',
 'клиентами.<br>',
 '<br>',
 'поиск',
 'правильный',
 'бизнес-модель',
 'требовать',
 'такой',
 'усердия,',
 'проектирование',
 'правильный',
 'продукта,',
 'подход',
 'требовать',
 'навык',
 'отличаются.',
 'поэтому',
 'инвестор',
 'признают,',
 'соучредитель',
 'лучше,',
 'один.',
 'время',
 'сосредоточиться',
 'разработка',
 'продукта,',
 'два',
 'сделать',
 'упор',
 'поиск',
 'разработка',
 'бизнес-модели.',
 'оба',
 'аспект',
 'должный',
 'выполняться',
 'параллельно.<br>',
 '<br>',
 'управленческий',


In [39]:
dim_size = 300
word2vec_model = word2vec.Word2Vec(data['content_clear'], size=dim_size, window=10, workers=8)

w2v = dict(zip(word2vec_model.wv.index2word, word2vec_model.wv.syn0))

In [40]:
word2vec_model.wv.most_similar(positive=['open', 'data', 'science', 'best'])

[('computer', 0.8404650092124939),
 ('engineering', 0.8337506651878357),
 ('analysis', 0.8262729048728943),
 ('knowledge', 0.8219612240791321),
 ('learning', 0.8161517977714539),
 ('intelligence', 0.8151358962059021),
 ('artificial', 0.806818425655365),
 ('computational', 0.8048728108406067),
 ('science,', 0.8027878999710083),
 ('solutions', 0.8009922504425049)]

In [132]:
data_mean = mean_vectorizer(w2v).fit(data['content_clear']).transform(data['content_clear'])
data_mean.shape

(30981, 300)

In [23]:
def split(train, y, ratio):
    idx = ratio
    return train[:idx, :], train[idx:, :], y[:idx], y[idx:]

y = data['favs_lognorm']
X_train, X_val, y_train, y_val = split(data_mean, y, 23425)
X_train.shape, X_val.shape, np.mean(y_train), np.mean(y_val)

((23425, 300), (7556, 300), 3.4046228249071526, 3.304679829935242)

In [133]:
from sklearn.linear_model import Ridge

model = Ridge(alpha=1, random_state=17)
model.fit(X_train, y_train)
preds_train = model.predict(X_train)
preds_val = model.predict(X_val)
y_med = np.ones(len(preds_val)) * y_train.median()
print('Train error: ', mean_squared_error(y_train, preds_train))
print('Validation error: ', mean_squared_error(y_val, preds_val))
print('Median prediction validation error: ', mean_squared_error(y_val, y_med))

Train error:  0.9713214136821583
Validation error:  0.8587660188788371
Median prediction validation error:  1.4460163851213332


In [134]:
def baseline_model():
    model = Sequential()
    model.add(Dense(128, input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, kernel_initializer='normal'))
    optimizer = optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    #optimizers.RMSprop(lr=0.0001, rho=0.9, epsilon=None, decay=0.0)
#     optimizer = optimizers.SGD(lr=0.0001, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='mean_squared_error',
              optimizer=optimizer)
    return model

estimator = KerasRegressor(build_fn=baseline_model, 
                           epochs=28, nb_epoch=7, batch_size=64,
                           validation_data=(X_val, y_val),
                           verbose=2)
estimator.fit(X_train, y_train)

Train on 23425 samples, validate on 7556 samples
Epoch 1/28
 - 1s - loss: 3.3646 - val_loss: 1.4830
Epoch 2/28
 - 1s - loss: 2.1676 - val_loss: 1.3921
Epoch 3/28
 - 1s - loss: 2.0442 - val_loss: 1.3426
Epoch 4/28
 - 1s - loss: 1.9593 - val_loss: 1.3141
Epoch 5/28
 - 1s - loss: 1.8797 - val_loss: 1.2595
Epoch 6/28
 - 1s - loss: 1.8077 - val_loss: 1.2041
Epoch 7/28
 - 1s - loss: 1.7365 - val_loss: 1.1633
Epoch 8/28
 - 1s - loss: 1.7086 - val_loss: 1.1278
Epoch 9/28
 - 1s - loss: 1.6342 - val_loss: 1.1158
Epoch 10/28
 - 1s - loss: 1.5896 - val_loss: 1.0635
Epoch 11/28
 - 1s - loss: 1.5650 - val_loss: 1.0217
Epoch 12/28
 - 1s - loss: 1.5044 - val_loss: 1.0025
Epoch 13/28
 - 1s - loss: 1.4723 - val_loss: 0.9855
Epoch 14/28
 - 1s - loss: 1.4663 - val_loss: 0.9530
Epoch 15/28
 - 1s - loss: 1.4040 - val_loss: 0.9382
Epoch 16/28
 - 1s - loss: 1.3730 - val_loss: 0.9183
Epoch 17/28
 - 1s - loss: 1.3501 - val_loss: 0.8987
Epoch 18/28
 - 1s - loss: 1.3050 - val_loss: 0.8773
Epoch 19/28
 - 1s - loss

<keras.callbacks.History at 0x7fd6cb2f8400>

In [102]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

class tfidf_vectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(next(iter(self.word2vec.values())))
        
    def fit(self, X):
        tfidf = TfidfVectorizer(analyzer=lambda x:x)
        tfidf.fit(X)
        max_idf = max(tfidf.idf_)
#         print('max_idf: ', max_idf)
        self.word2weight = dict(zip(vectorizer.get_feature_names(), tfidf.idf_))
        return self
    
    def transform(self, X):
        return np.array([
            np.mean([self.word2vec.get(w, np.zeros(self.dim)) * self.word2weight.get(w, np.zeros(self.dim)) 
                    for w in words], axis=0)
            for words in X
        ])
        

In [108]:
data_mean_tfidf = tfidf_vectorizer(w2v).fit(data['content_clear']).transform(data['content_clear'])

max_idf:  10.648014489106362


In [109]:
type(data_mean_tfidf)

numpy.ndarray

In [110]:
def split(train, y, ratio):
    idx = ratio
    return train[:idx, :], train[idx:, :], y[:idx], y[idx:]

y = data['favs_lognorm']
X_train, X_val, y_train, y_val = split(data_mean_tfidf, y, 23425)
X_train.shape, X_val.shape, np.mean(y_train), np.mean(y_val)

((23425, 300), (7556, 300), 3.4046228249071526, 3.304679829935242)

In [130]:
model = Ridge(alpha=1, random_state=17)
model.fit(X_train, y_train)
preds_train = model.predict(X_train)
preds_val = model.predict(X_val)
y_med = np.ones(len(preds_val)) * y_train.median()
print('Train error: ', mean_squared_error(y_train, preds_train))
print('Validation error: ', mean_squared_error(y_val, preds_val))
print('Median prediction validation error: ', mean_squared_error(y_val, y_med))

Train error:  0.9713214136821583
Validation error:  0.8587660188788371
Median prediction validation error:  1.4460163851213332


In [119]:
from keras.wrappers.scikit_learn import KerasRegressor
from keras import optimizers

In [131]:
def baseline_model():
    model = Sequential()
    model.add(Dense(128, input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, kernel_initializer='normal'))
    optimizer = optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    #optimizers.RMSprop(lr=0.0001, rho=0.9, epsilon=None, decay=0.0)
#     optimizer = optimizers.SGD(lr=0.0001, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='mean_squared_error',
              optimizer=optimizer)
    return model

estimator = KerasRegressor(build_fn=baseline_model, 
                           epochs=28, nb_epoch=7, batch_size=64,
                           validation_data=(X_val, y_val),
                           verbose=2)
estimator.fit(X_train, y_train)

Train on 23425 samples, validate on 7556 samples
Epoch 1/28
 - 1s - loss: 3.6244 - val_loss: 1.4975
Epoch 2/28
 - 1s - loss: 2.2653 - val_loss: 1.3919
Epoch 3/28
 - 1s - loss: 2.1148 - val_loss: 1.3471
Epoch 4/28
 - 1s - loss: 2.0057 - val_loss: 1.3201
Epoch 5/28
 - 1s - loss: 1.9548 - val_loss: 1.2585
Epoch 6/28
 - 1s - loss: 1.9048 - val_loss: 1.2525
Epoch 7/28
 - 1s - loss: 1.8718 - val_loss: 1.1932
Epoch 8/28
 - 1s - loss: 1.7992 - val_loss: 1.1587
Epoch 9/28
 - 1s - loss: 1.7700 - val_loss: 1.1469
Epoch 10/28
 - 1s - loss: 1.7196 - val_loss: 1.0822
Epoch 11/28
 - 1s - loss: 1.6603 - val_loss: 1.0545
Epoch 12/28
 - 1s - loss: 1.6332 - val_loss: 1.0481
Epoch 13/28
 - 1s - loss: 1.5757 - val_loss: 0.9993
Epoch 14/28
 - 1s - loss: 1.5453 - val_loss: 0.9699
Epoch 15/28
 - 1s - loss: 1.5050 - val_loss: 0.9520
Epoch 16/28
 - 1s - loss: 1.4557 - val_loss: 0.9278
Epoch 17/28
 - 1s - loss: 1.4428 - val_loss: 0.9109
Epoch 18/28
 - 1s - loss: 1.4038 - val_loss: 0.9219
Epoch 19/28
 - 1s - loss

<keras.callbacks.History at 0x7fd6cbd2a518>

In [114]:
import xgboost as xgb

In [115]:
d_train = xgb.DMatrix(X_train, label=y_train, missing=np.nan)
d_val = xgb.DMatrix(X_val, label=y_val, missing=np.nan)
watchlist = [(d_train, 'train'), (d_val, 'test')]
history = dict()

In [116]:
params = {
    'max_depth': 26,
    'eta': 0.025,
    'nthread': 8,
    'gamma' : 1,
    'lambda' : 1,
    'subsample': 0.85,
    'eval_metric': ['rmse'],
    'objective': 'reg:linear',
    'colsample_bytree': 0.9,
    'min_child_weight': 100,
    'scale_pos_weight':(1)/y.mean(),
    'seed':7,
    'silent': 1
}

In [117]:
model_new = xgb.train(params,
                     d_train,
                     num_boost_round=200,
                     evals=watchlist,
                     evals_result=history,
                     verbose_eval=20,
                     )

[0]	train-rmse:3.19147	test-rmse:3.05002
[20]	train-rmse:3.17675	test-rmse:3.0357
[40]	train-rmse:3.16235	test-rmse:3.02194
[60]	train-rmse:3.15544	test-rmse:3.01584
[80]	train-rmse:3.14742	test-rmse:3.0083
[100]	train-rmse:3.14385	test-rmse:3.00517
[120]	train-rmse:3.13858	test-rmse:3.00036
[140]	train-rmse:3.13741	test-rmse:2.99952
[160]	train-rmse:3.13322	test-rmse:2.99545
[180]	train-rmse:3.1317	test-rmse:2.99403
[199]	train-rmse:3.13188	test-rmse:2.99431
