In [1]:
import datetime
import numpy as np
import pandas as pd
import re
#import preprocessing_tools as pr
import snowballstemmer
import itertools
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models import Word2Vec
import lightgbm as lgb
from sklearn.metrics import r2_score, mean_absolute_error
import codecs
import seaborn as sns
import  matplotlib.pyplot as plt

%matplotlib inline


In [2]:
with open('clear_dataset.csv') as f:
    print(f)

<_io.TextIOWrapper name='clear_dataset.csv' mode='r' encoding='cp1251'>


In [3]:
raw_data = pd.read_csv('clear_dataset.csv', encoding = 'UTF-8')
raw_data.info()
raw_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212699 entries, 0 to 212698
Data columns (total 4 columns):
date      212699 non-null object
tittle    212699 non-null object
text      212699 non-null object
rate      212699 non-null int64
dtypes: int64(1), object(3)
memory usage: 6.5+ MB


Unnamed: 0,date,tittle,text,rate
0,2006-10-24 20:49:00,В США и России размещено больше всего сайтов с...,Организация Internet Watch Foundation в ходе ...,0
1,2006-11-30 12:07:00,«Цифровой раздел» между поколениями увеличивается,"21% американских родителей уверены в том, что...",-2
2,2006-09-05 08:32:00,Результаты Yahoo Answers вошли в основную выда...,Флагман социального поиска в интернете Yahoo ...,0
3,2006-07-01 21:45:00,«Артон консалтинг» исследовала контекстную рек...,Компания «Артон консалтинг» опубликовала иссл...,0
4,2007-01-19 11:14:00,Внутренний голос в RSS,У меня вот какая бяка постоянно вылезает. Есл...,0


In [4]:
# выкидываем пропущенные значения (в данном случае их нет)
print(raw_data.shape)
print(raw_data.dropna().shape)
columns = ['date', 'title', 'text', 'rate']
raw_data.columns = columns

(212699, 4)
(212699, 4)


In [5]:
raw_data.head()

Unnamed: 0,date,title,text,rate
0,2006-10-24 20:49:00,В США и России размещено больше всего сайтов с...,Организация Internet Watch Foundation в ходе ...,0
1,2006-11-30 12:07:00,«Цифровой раздел» между поколениями увеличивается,"21% американских родителей уверены в том, что...",-2
2,2006-09-05 08:32:00,Результаты Yahoo Answers вошли в основную выда...,Флагман социального поиска в интернете Yahoo ...,0
3,2006-07-01 21:45:00,«Артон консалтинг» исследовала контекстную рек...,Компания «Артон консалтинг» опубликовала иссл...,0
4,2007-01-19 11:14:00,Внутренний голос в RSS,У меня вот какая бяка постоянно вылезает. Есл...,0


# Feature engeneering

In [6]:
raw_data['date'] = pd.to_datetime(raw_data['date'])
raw_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212699 entries, 0 to 212698
Data columns (total 4 columns):
date     212699 non-null datetime64[ns]
title    212699 non-null object
text     212699 non-null object
rate     212699 non-null int64
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 6.5+ MB


In [7]:
#функции для получения временных данных
def get_year(x):
    year = x.year
    return year

def get_month(x):
    month = x.month
    return month

def get_day(x):
    day = x.day
    return day

def get_time_in_mins(x):
    time = x.time().hour * 60 + x.time().minute
    return time

def get_weekday(x):
    weekday = x.dayofweek
    return weekday

def lowercase(x):
    text = x
    return text.lower()

# вычисление числа ссылок
def links_count(text):
    number = len(re.findall(r'.ru|.com|.org|.net|.info|.biz|.io|.su|.рф|.me',text))
    return number

In [8]:
# токенизация и стемминг
stemmer = snowballstemmer.RussianStemmer()
rus_chars = set([chr(i) for i in range(1072, 1104)])

def mean_p(x):
    if not len(x):
        return -9999
    return np.mean(x)

def std_p(x):
    if not len(x):
        return -9999
    return np.std(x)

def tokenize_sent(sent):
    sent = sent.lower()
    sent = "".join([i if i in rus_chars else " " for i in sent])
    words = stemmer.stemWords(sent.split())
    return words

def clean_text(text):
    '''
    очистка текста
        
    на выходе - очищенный текст:
    удаляются цифры, знаки препинания, арифметические знаки
    
    '''
    
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip().strip('\n').strip('\r').strip('\t')

    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|[\"]", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())

    return text

In [12]:
print(raw_data.shape)
raw_data.head()

(212699, 10)


Unnamed: 0,date,title,text,rate,year,month,day,time,weekday,links_count
0,2006-10-24 20:49:00,в сша и россии размещено больше всего сайтов с...,организация internet watch foundation в ходе ...,0,2006,10,24,1249,1,2
1,2006-11-30 12:07:00,«цифровой раздел» между поколениями увеличивается,"21% американских родителей уверены в том, что...",-2,2006,11,30,727,3,1
2,2006-09-05 08:32:00,результаты yahoo answers вошли в основную выда...,флагман социального поиска в интернете yahoo ...,0,2006,9,5,512,1,0
3,2006-07-01 21:45:00,«артон консалтинг» исследовала контекстную рек...,компания «артон консалтинг» опубликовала иссл...,0,2006,7,1,1305,5,0
4,2007-01-19 11:14:00,внутренний голос в rss,у меня вот какая бяка постоянно вылезает. есл...,0,2007,1,19,674,4,1


## Fitting of titles 

In [6]:
%%time
raw_data['title'] = raw_data.title.apply(clean_text)




Wall time: 2.47 s


In [7]:
%%time
raw_data['title_tokenized'] = raw_data.title.apply(lambda x: [tokenize_sent(i) for i in x])

Wall time: 5min 21s


In [8]:
y = raw_data.rate
text_column = raw_data.text
title_column = raw_data.title
#title_tokenized = raw_data.title_tokenized

In [9]:
title_column_train, title_column_test, y_train, y_test = train_test_split(title_column, y, test_size = 0.33)

In [10]:
tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1, 3), tokenizer = tokenize_sent)

In [11]:
%%time
title_column_tfidf_train = tfidf.fit_transform(title_column_train)

Wall time: 56 s


In [12]:
%%time
title_column_tfidf_test = tfidf.transform(title_column_test)

Wall time: 24.2 s


In [13]:
print(np.shape(title_column_tfidf_train))
print(y_train.shape)
print(np.shape(title_column_tfidf_test))
print(y_test.shape)

(142508, 50000)
(142508,)
(70191, 50000)
(70191,)


In [39]:
ridge = Ridge(random_state=17, alpha = 4, fit_intercept=True)

In [40]:
%%time
ridge.fit(title_column_tfidf_train, y_train);

Wall time: 1.28 s


Ridge(alpha=4, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=17, solver='auto', tol=0.001)

In [41]:
y_pred = ridge.predict(title_column_tfidf_test)

In [42]:
mean_absolute_error(y_test, y_pred)

20.63898375558921

### Cleared and tokenized titles

In [20]:
title_tokenized_train, title_tokenized_test, y_train, y_test = train_test_split(title_tokenized, y, test_size = 0.33)

In [21]:
tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1, 3))

In [22]:
%%time
title_tokenized_tfidf_train = tfidf.fit_transform(title_tokenized_train)

AttributeError: 'list' object has no attribute 'lower'

## Fitting on texts

In [52]:
y = raw_data.rate
text_column = raw_data.text
title_column = raw_data.title

In [53]:
raw_data.columns

Index(['date', 'title', 'text', 'rate'], dtype='object')

In [54]:
%%time
text_column  = raw_data.title.apply(clean_text)

Wall time: 1.87 s


In [55]:
text_column_train, text_column_test, y_train, y_test = train_test_split(text_column, y, test_size = 0.33)

In [77]:
tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1, 3), tokenizer = tokenize_sent)

In [78]:
%%time
text_column_tfidf_train = tfidf.fit_transform(text_column_train)

Wall time: 50.7 s


In [79]:
%%time
text_column_tfidf_test = tfidf.transform(text_column_test)

Wall time: 23.5 s


In [80]:
print(np.shape(text_column_tfidf_train))
print(y_train.shape)
print(np.shape(text_column_tfidf_test))
print(y_test.shape)

(142508, 50000)
(142508,)
(70191, 50000)
(70191,)


In [81]:
ridge = Ridge(random_state=17)

In [82]:
%%time
ridge.fit(text_column_tfidf_train, y_train);

Wall time: 1.48 s


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=17, solver='auto', tol=0.001)

In [83]:
y_pred = ridge.predict(text_column_tfidf_test)

In [84]:
mean_absolute_error(y_test, y_pred)

21.28120392544988

## Fitting on titles. LightGBM

### Hashing Vectorizer

In [7]:
y = raw_data.rate
text_column = raw_data.text
title_column = raw_data.title
#title_tokenized = raw_data.title_tokenized

In [8]:
title_column_train, title_column_test, y_train, y_test = train_test_split(title_column, y, test_size = 0.33)

In [11]:
hv = HashingVectorizer(dtype=np.float32,
            strip_accents='unicode', analyzer='word',
            ngram_range=(1, 4),n_features=2**12)

In [12]:
%%time
title_column_hashed_train = hv.fit_transform(title_column_train)

Wall time: 7.37 s


In [13]:
%%time
title_column_hashed_test = hv.transform(title_column_test)

Wall time: 1.89 s


In [15]:
print(np.shape(title_column_hashed_train))
print(y_train.shape)
print(np.shape(title_column_hashed_test))
print(y_test.shape)

(142508, 4096)
(142508,)
(70191, 4096)
(70191,)


In [16]:
param = {'num_trees': 100000, 'application':'regression',
         'learning_rate': 0.01, 'num_threads': 7, 'max_depth': 10,
         'lambda_l2': 1e-3}
param['metric'] = 'mae'

In [20]:
train_data = lgb.Dataset(title_column_hashed_train, label=y_train)
test_data = train_data.create_valid(title_column_hashed_test, label=y_test)

In [21]:
%%time 
bst = lgb.train(param, train_data, param['num_trees'], valid_sets=[test_data], early_stopping_rounds=50)



[1]	valid_0's l1: 21.6425
Training until validation scores don't improve for 50 rounds
[2]	valid_0's l1: 21.6406
[3]	valid_0's l1: 21.6387
[4]	valid_0's l1: 21.6368
[5]	valid_0's l1: 21.6351
[6]	valid_0's l1: 21.6334
[7]	valid_0's l1: 21.6318
[8]	valid_0's l1: 21.6302
[9]	valid_0's l1: 21.6287
[10]	valid_0's l1: 21.627
[11]	valid_0's l1: 21.6256
[12]	valid_0's l1: 21.6242
[13]	valid_0's l1: 21.6228
[14]	valid_0's l1: 21.6215
[15]	valid_0's l1: 21.6201
[16]	valid_0's l1: 21.6189
[17]	valid_0's l1: 21.6177
[18]	valid_0's l1: 21.6164
[19]	valid_0's l1: 21.6152
[20]	valid_0's l1: 21.614
[21]	valid_0's l1: 21.6128
[22]	valid_0's l1: 21.6116
[23]	valid_0's l1: 21.6105
[24]	valid_0's l1: 21.6093
[25]	valid_0's l1: 21.6082
[26]	valid_0's l1: 21.6071
[27]	valid_0's l1: 21.606
[28]	valid_0's l1: 21.6048
[29]	valid_0's l1: 21.6036
[30]	valid_0's l1: 21.6027
[31]	valid_0's l1: 21.6018
[32]	valid_0's l1: 21.6009
[33]	valid_0's l1: 21.5999
[34]	valid_0's l1: 21.5989
[35]	valid_0's l1: 21.5981
[36]	v

[297]	valid_0's l1: 21.4903
[298]	valid_0's l1: 21.4901
[299]	valid_0's l1: 21.4898
[300]	valid_0's l1: 21.4895
[301]	valid_0's l1: 21.4893
[302]	valid_0's l1: 21.4891
[303]	valid_0's l1: 21.4888
[304]	valid_0's l1: 21.4886
[305]	valid_0's l1: 21.4884
[306]	valid_0's l1: 21.4881
[307]	valid_0's l1: 21.488
[308]	valid_0's l1: 21.4877
[309]	valid_0's l1: 21.4875
[310]	valid_0's l1: 21.4874
[311]	valid_0's l1: 21.4872
[312]	valid_0's l1: 21.487
[313]	valid_0's l1: 21.4869
[314]	valid_0's l1: 21.4866
[315]	valid_0's l1: 21.4864
[316]	valid_0's l1: 21.4861
[317]	valid_0's l1: 21.486
[318]	valid_0's l1: 21.4858
[319]	valid_0's l1: 21.4856
[320]	valid_0's l1: 21.4853
[321]	valid_0's l1: 21.4851
[322]	valid_0's l1: 21.485
[323]	valid_0's l1: 21.4848
[324]	valid_0's l1: 21.4844
[325]	valid_0's l1: 21.4842
[326]	valid_0's l1: 21.484
[327]	valid_0's l1: 21.4836
[328]	valid_0's l1: 21.4835
[329]	valid_0's l1: 21.4834
[330]	valid_0's l1: 21.4833
[331]	valid_0's l1: 21.483
[332]	valid_0's l1: 21.482

[591]	valid_0's l1: 21.4431
[592]	valid_0's l1: 21.443
[593]	valid_0's l1: 21.4429
[594]	valid_0's l1: 21.4427
[595]	valid_0's l1: 21.4427
[596]	valid_0's l1: 21.4425
[597]	valid_0's l1: 21.4424
[598]	valid_0's l1: 21.4423
[599]	valid_0's l1: 21.4421
[600]	valid_0's l1: 21.442
[601]	valid_0's l1: 21.4418
[602]	valid_0's l1: 21.4418
[603]	valid_0's l1: 21.4417
[604]	valid_0's l1: 21.4417
[605]	valid_0's l1: 21.4414
[606]	valid_0's l1: 21.4413
[607]	valid_0's l1: 21.4411
[608]	valid_0's l1: 21.4409
[609]	valid_0's l1: 21.4407
[610]	valid_0's l1: 21.4406
[611]	valid_0's l1: 21.4404
[612]	valid_0's l1: 21.4403
[613]	valid_0's l1: 21.4402
[614]	valid_0's l1: 21.4401
[615]	valid_0's l1: 21.4399
[616]	valid_0's l1: 21.4398
[617]	valid_0's l1: 21.4396
[618]	valid_0's l1: 21.4395
[619]	valid_0's l1: 21.4392
[620]	valid_0's l1: 21.4391
[621]	valid_0's l1: 21.4391
[622]	valid_0's l1: 21.4389
[623]	valid_0's l1: 21.4389
[624]	valid_0's l1: 21.4387
[625]	valid_0's l1: 21.4386
[626]	valid_0's l1: 21

[885]	valid_0's l1: 21.4117
[886]	valid_0's l1: 21.4116
[887]	valid_0's l1: 21.4115
[888]	valid_0's l1: 21.4114
[889]	valid_0's l1: 21.4115
[890]	valid_0's l1: 21.4113
[891]	valid_0's l1: 21.4112
[892]	valid_0's l1: 21.4112
[893]	valid_0's l1: 21.4111
[894]	valid_0's l1: 21.411
[895]	valid_0's l1: 21.4108
[896]	valid_0's l1: 21.4108
[897]	valid_0's l1: 21.4105
[898]	valid_0's l1: 21.4104
[899]	valid_0's l1: 21.4103
[900]	valid_0's l1: 21.4103
[901]	valid_0's l1: 21.4103
[902]	valid_0's l1: 21.4101
[903]	valid_0's l1: 21.41
[904]	valid_0's l1: 21.4099
[905]	valid_0's l1: 21.4098
[906]	valid_0's l1: 21.4097
[907]	valid_0's l1: 21.4096
[908]	valid_0's l1: 21.4096
[909]	valid_0's l1: 21.4094
[910]	valid_0's l1: 21.4093
[911]	valid_0's l1: 21.4093
[912]	valid_0's l1: 21.4091
[913]	valid_0's l1: 21.409
[914]	valid_0's l1: 21.4089
[915]	valid_0's l1: 21.4087
[916]	valid_0's l1: 21.4086
[917]	valid_0's l1: 21.4086
[918]	valid_0's l1: 21.4086
[919]	valid_0's l1: 21.4084
[920]	valid_0's l1: 21.4

[1175]	valid_0's l1: 21.3885
[1176]	valid_0's l1: 21.3886
[1177]	valid_0's l1: 21.3885
[1178]	valid_0's l1: 21.3884
[1179]	valid_0's l1: 21.3884
[1180]	valid_0's l1: 21.3884
[1181]	valid_0's l1: 21.3883
[1182]	valid_0's l1: 21.3883
[1183]	valid_0's l1: 21.3883
[1184]	valid_0's l1: 21.3883
[1185]	valid_0's l1: 21.3882
[1186]	valid_0's l1: 21.3881
[1187]	valid_0's l1: 21.388
[1188]	valid_0's l1: 21.388
[1189]	valid_0's l1: 21.388
[1190]	valid_0's l1: 21.388
[1191]	valid_0's l1: 21.388
[1192]	valid_0's l1: 21.3879
[1193]	valid_0's l1: 21.3879
[1194]	valid_0's l1: 21.3878
[1195]	valid_0's l1: 21.3877
[1196]	valid_0's l1: 21.3876
[1197]	valid_0's l1: 21.3875
[1198]	valid_0's l1: 21.3875
[1199]	valid_0's l1: 21.3875
[1200]	valid_0's l1: 21.3874
[1201]	valid_0's l1: 21.3873
[1202]	valid_0's l1: 21.3871
[1203]	valid_0's l1: 21.387
[1204]	valid_0's l1: 21.3869
[1205]	valid_0's l1: 21.3869
[1206]	valid_0's l1: 21.3868
[1207]	valid_0's l1: 21.3867
[1208]	valid_0's l1: 21.3867
[1209]	valid_0's l1:

[1742]	valid_0's l1: 21.3574
[1743]	valid_0's l1: 21.3574
[1744]	valid_0's l1: 21.3573
[1745]	valid_0's l1: 21.3572
[1746]	valid_0's l1: 21.3573
[1747]	valid_0's l1: 21.3572
[1748]	valid_0's l1: 21.3572
[1749]	valid_0's l1: 21.357
[1750]	valid_0's l1: 21.3569
[1751]	valid_0's l1: 21.3569
[1752]	valid_0's l1: 21.3569
[1753]	valid_0's l1: 21.3568
[1754]	valid_0's l1: 21.3569
[1755]	valid_0's l1: 21.3568
[1756]	valid_0's l1: 21.3568
[1757]	valid_0's l1: 21.3567
[1758]	valid_0's l1: 21.3567
[1759]	valid_0's l1: 21.3567
[1760]	valid_0's l1: 21.3567
[1761]	valid_0's l1: 21.3568
[1762]	valid_0's l1: 21.3567
[1763]	valid_0's l1: 21.3567
[1764]	valid_0's l1: 21.3567
[1765]	valid_0's l1: 21.3566
[1766]	valid_0's l1: 21.3566
[1767]	valid_0's l1: 21.3566
[1768]	valid_0's l1: 21.3565
[1769]	valid_0's l1: 21.3564
[1770]	valid_0's l1: 21.3565
[1771]	valid_0's l1: 21.3563
[1772]	valid_0's l1: 21.3563
[1773]	valid_0's l1: 21.3562
[1774]	valid_0's l1: 21.3562
[1775]	valid_0's l1: 21.3561
[1776]	valid_0'

[2028]	valid_0's l1: 21.3471
[2029]	valid_0's l1: 21.3471
[2030]	valid_0's l1: 21.3472
[2031]	valid_0's l1: 21.3471
[2032]	valid_0's l1: 21.3471
[2033]	valid_0's l1: 21.347
[2034]	valid_0's l1: 21.347
[2035]	valid_0's l1: 21.347
[2036]	valid_0's l1: 21.3471
[2037]	valid_0's l1: 21.3471
[2038]	valid_0's l1: 21.347
[2039]	valid_0's l1: 21.3469
[2040]	valid_0's l1: 21.3469
[2041]	valid_0's l1: 21.3468
[2042]	valid_0's l1: 21.3468
[2043]	valid_0's l1: 21.3468
[2044]	valid_0's l1: 21.3468
[2045]	valid_0's l1: 21.3467
[2046]	valid_0's l1: 21.3467
[2047]	valid_0's l1: 21.3467
[2048]	valid_0's l1: 21.3467
[2049]	valid_0's l1: 21.3468
[2050]	valid_0's l1: 21.3466
[2051]	valid_0's l1: 21.3467
[2052]	valid_0's l1: 21.3465
[2053]	valid_0's l1: 21.3465
[2054]	valid_0's l1: 21.3465
[2055]	valid_0's l1: 21.3465
[2056]	valid_0's l1: 21.3465
[2057]	valid_0's l1: 21.3465
[2058]	valid_0's l1: 21.3464
[2059]	valid_0's l1: 21.3464
[2060]	valid_0's l1: 21.3463
[2061]	valid_0's l1: 21.3463
[2062]	valid_0's l

[2313]	valid_0's l1: 21.3398
[2314]	valid_0's l1: 21.3398
[2315]	valid_0's l1: 21.3398
[2316]	valid_0's l1: 21.3398
[2317]	valid_0's l1: 21.3398
[2318]	valid_0's l1: 21.3398
[2319]	valid_0's l1: 21.3398
[2320]	valid_0's l1: 21.3395
[2321]	valid_0's l1: 21.3395
[2322]	valid_0's l1: 21.3394
[2323]	valid_0's l1: 21.3394
[2324]	valid_0's l1: 21.3395
[2325]	valid_0's l1: 21.3395
[2326]	valid_0's l1: 21.3395
[2327]	valid_0's l1: 21.3394
[2328]	valid_0's l1: 21.3393
[2329]	valid_0's l1: 21.3393
[2330]	valid_0's l1: 21.3393
[2331]	valid_0's l1: 21.3393
[2332]	valid_0's l1: 21.3393
[2333]	valid_0's l1: 21.3393
[2334]	valid_0's l1: 21.3393
[2335]	valid_0's l1: 21.3393
[2336]	valid_0's l1: 21.3393
[2337]	valid_0's l1: 21.3393
[2338]	valid_0's l1: 21.3393
[2339]	valid_0's l1: 21.3392
[2340]	valid_0's l1: 21.3393
[2341]	valid_0's l1: 21.3394
[2342]	valid_0's l1: 21.3394
[2343]	valid_0's l1: 21.3394
[2344]	valid_0's l1: 21.3394
[2345]	valid_0's l1: 21.3394
[2346]	valid_0's l1: 21.3394
[2347]	valid_0

[2598]	valid_0's l1: 21.3341
[2599]	valid_0's l1: 21.3341
[2600]	valid_0's l1: 21.3341
[2601]	valid_0's l1: 21.334
[2602]	valid_0's l1: 21.334
[2603]	valid_0's l1: 21.334
[2604]	valid_0's l1: 21.334
[2605]	valid_0's l1: 21.3341
[2606]	valid_0's l1: 21.3341
[2607]	valid_0's l1: 21.3341
[2608]	valid_0's l1: 21.3341
[2609]	valid_0's l1: 21.3341
[2610]	valid_0's l1: 21.3341
[2611]	valid_0's l1: 21.3341
[2612]	valid_0's l1: 21.3342
[2613]	valid_0's l1: 21.3342
[2614]	valid_0's l1: 21.3341
[2615]	valid_0's l1: 21.334
[2616]	valid_0's l1: 21.3341
[2617]	valid_0's l1: 21.334
[2618]	valid_0's l1: 21.334
[2619]	valid_0's l1: 21.334
[2620]	valid_0's l1: 21.3339
[2621]	valid_0's l1: 21.3339
[2622]	valid_0's l1: 21.3338
[2623]	valid_0's l1: 21.3338
[2624]	valid_0's l1: 21.3335
[2625]	valid_0's l1: 21.3335
[2626]	valid_0's l1: 21.3336
[2627]	valid_0's l1: 21.3335
[2628]	valid_0's l1: 21.3334
[2629]	valid_0's l1: 21.3333
[2630]	valid_0's l1: 21.3333
[2631]	valid_0's l1: 21.3333
[2632]	valid_0's l1: 2

[2883]	valid_0's l1: 21.3299
[2884]	valid_0's l1: 21.3299
[2885]	valid_0's l1: 21.3299
[2886]	valid_0's l1: 21.3299
[2887]	valid_0's l1: 21.3298
[2888]	valid_0's l1: 21.3298
[2889]	valid_0's l1: 21.3297
[2890]	valid_0's l1: 21.3298
[2891]	valid_0's l1: 21.3298
[2892]	valid_0's l1: 21.3298
[2893]	valid_0's l1: 21.3297
[2894]	valid_0's l1: 21.3296
[2895]	valid_0's l1: 21.3296
[2896]	valid_0's l1: 21.3295
[2897]	valid_0's l1: 21.3295
[2898]	valid_0's l1: 21.3297
[2899]	valid_0's l1: 21.3296
[2900]	valid_0's l1: 21.3295
[2901]	valid_0's l1: 21.3294
[2902]	valid_0's l1: 21.3294
[2903]	valid_0's l1: 21.3293
[2904]	valid_0's l1: 21.3293
[2905]	valid_0's l1: 21.3294
[2906]	valid_0's l1: 21.3294
[2907]	valid_0's l1: 21.3294
[2908]	valid_0's l1: 21.3295
[2909]	valid_0's l1: 21.3294
[2910]	valid_0's l1: 21.3294
[2911]	valid_0's l1: 21.3294
[2912]	valid_0's l1: 21.3294
[2913]	valid_0's l1: 21.3293
[2914]	valid_0's l1: 21.3293
[2915]	valid_0's l1: 21.3293
[2916]	valid_0's l1: 21.3293
[2917]	valid_0

[3168]	valid_0's l1: 21.326
[3169]	valid_0's l1: 21.3259
[3170]	valid_0's l1: 21.3259
[3171]	valid_0's l1: 21.3258
[3172]	valid_0's l1: 21.3258
[3173]	valid_0's l1: 21.3258
[3174]	valid_0's l1: 21.3258
[3175]	valid_0's l1: 21.3258
[3176]	valid_0's l1: 21.3258
[3177]	valid_0's l1: 21.3258
[3178]	valid_0's l1: 21.3258
[3179]	valid_0's l1: 21.3257
[3180]	valid_0's l1: 21.3257
[3181]	valid_0's l1: 21.3257
[3182]	valid_0's l1: 21.3258
[3183]	valid_0's l1: 21.3258
[3184]	valid_0's l1: 21.3257
[3185]	valid_0's l1: 21.3257
[3186]	valid_0's l1: 21.3257
[3187]	valid_0's l1: 21.3257
[3188]	valid_0's l1: 21.3256
[3189]	valid_0's l1: 21.3256
[3190]	valid_0's l1: 21.3257
[3191]	valid_0's l1: 21.3257
[3192]	valid_0's l1: 21.3258
[3193]	valid_0's l1: 21.3257
[3194]	valid_0's l1: 21.3258
[3195]	valid_0's l1: 21.3258
[3196]	valid_0's l1: 21.3256
[3197]	valid_0's l1: 21.3257
[3198]	valid_0's l1: 21.3257
[3199]	valid_0's l1: 21.3258
[3200]	valid_0's l1: 21.3258
[3201]	valid_0's l1: 21.3258
[3202]	valid_0'

[3453]	valid_0's l1: 21.3239
[3454]	valid_0's l1: 21.3237
[3455]	valid_0's l1: 21.3238
[3456]	valid_0's l1: 21.3238
[3457]	valid_0's l1: 21.3237
[3458]	valid_0's l1: 21.3237
[3459]	valid_0's l1: 21.3237
[3460]	valid_0's l1: 21.3236
[3461]	valid_0's l1: 21.3237
[3462]	valid_0's l1: 21.3237
[3463]	valid_0's l1: 21.3237
[3464]	valid_0's l1: 21.3237
[3465]	valid_0's l1: 21.3237
[3466]	valid_0's l1: 21.3237
[3467]	valid_0's l1: 21.3237
[3468]	valid_0's l1: 21.3237
[3469]	valid_0's l1: 21.3237
[3470]	valid_0's l1: 21.3237
[3471]	valid_0's l1: 21.3238
[3472]	valid_0's l1: 21.3238
[3473]	valid_0's l1: 21.3237
Early stopping, best iteration is:
[3423]	valid_0's l1: 21.3233
Wall time: 4min 35s


### subsampling

In [None]:
%%time
# признаки, связанные со временем, а также приведение к нижнему регистру и число ссылок
# текста статьи и заголовка
raw_data['year'] = raw_data.date.apply(get_year)
raw_data['month'] = raw_data.date.apply(get_month)
raw_data['day'] = raw_data.date.apply(get_day)
raw_data['time'] = raw_data.date.apply(get_time_in_mins)
raw_data['weekday'] = raw_data.date.apply(get_weekday)
raw_data['text'] = raw_data.text.apply(lowercase)
raw_data['title'] = raw_data.title.apply(lowercase)
raw_data['links_count'] = raw_data.text.apply(links_count)

In [38]:
y = raw_data.rate
X = raw_data.drop(['rate', 'date'], axis = 1)

In [39]:
X_sub_train, X_sub_test, y_sub_train, y_sub_test = train_test_split(X, y, test_size = 0.5)

In [40]:
text_column = X_sub_train.text
title_column = X_sub_train.title

In [45]:
print(X_sub_train.shape)
print(y_sub_train.shape)

(106349, 8)
(106349,)


### Tfidf Vectorizer

title_column_train, title_column_test, y_title_train, y_title_test = train_test_split(title_column, y_sub_train, test_size = 0.33)

In [47]:
tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1, 3), tokenizer = tokenize_sent)

In [50]:
%%time
#title_column_tfidf_train = tfidf.fit_transform(title_column_train)
title_column_tfidf = tfidf.fit_transform(title_column_train)

Wall time: 27 s


%%time
title_column_tfidf_test = tfidf.transform(title_column_test)

In [21]:
param = {'num_trees': 100000, 'application':'regression',
         'learning_rate': 0.01, 'num_threads': 7, 'max_depth': 10,
         'lambda_l2': 1e-3}
param['metric'] = 'mae'

In [22]:
train_data = lgb.Dataset(title_column_tfidf_train, label=y_train)
test_data = train_data.create_valid(title_column_tfidf_test, label=y_test)

In [23]:
%%time 
bst = lgb.train(param, train_data, param['num_trees'], valid_sets=[test_data], early_stopping_rounds=50)



[1]	valid_0's l1: 21.4513
Training until validation scores don't improve for 50 rounds
[2]	valid_0's l1: 21.4482
[3]	valid_0's l1: 21.4451
[4]	valid_0's l1: 21.4424
[5]	valid_0's l1: 21.4396
[6]	valid_0's l1: 21.4369
[7]	valid_0's l1: 21.4343
[8]	valid_0's l1: 21.4314
[9]	valid_0's l1: 21.429
[10]	valid_0's l1: 21.4262
[11]	valid_0's l1: 21.424
[12]	valid_0's l1: 21.4217
[13]	valid_0's l1: 21.4197
[14]	valid_0's l1: 21.4178
[15]	valid_0's l1: 21.4155
[16]	valid_0's l1: 21.4138
[17]	valid_0's l1: 21.4119
[18]	valid_0's l1: 21.4099
[19]	valid_0's l1: 21.4082
[20]	valid_0's l1: 21.4064
[21]	valid_0's l1: 21.4047
[22]	valid_0's l1: 21.4028
[23]	valid_0's l1: 21.4014
[24]	valid_0's l1: 21.3998
[25]	valid_0's l1: 21.3978
[26]	valid_0's l1: 21.3964
[27]	valid_0's l1: 21.3948
[28]	valid_0's l1: 21.3932
[29]	valid_0's l1: 21.3918
[30]	valid_0's l1: 21.3903
[31]	valid_0's l1: 21.3889
[32]	valid_0's l1: 21.3873
[33]	valid_0's l1: 21.3862
[34]	valid_0's l1: 21.3849
[35]	valid_0's l1: 21.3836
[36]	

[301]	valid_0's l1: 21.2411
[302]	valid_0's l1: 21.2408
[303]	valid_0's l1: 21.2407
[304]	valid_0's l1: 21.2404
[305]	valid_0's l1: 21.24
[306]	valid_0's l1: 21.2398
[307]	valid_0's l1: 21.2395
[308]	valid_0's l1: 21.2394
[309]	valid_0's l1: 21.2388
[310]	valid_0's l1: 21.2384
[311]	valid_0's l1: 21.2383
[312]	valid_0's l1: 21.238
[313]	valid_0's l1: 21.2379
[314]	valid_0's l1: 21.2374
[315]	valid_0's l1: 21.2373
[316]	valid_0's l1: 21.237
[317]	valid_0's l1: 21.237
[318]	valid_0's l1: 21.2366
[319]	valid_0's l1: 21.236
[320]	valid_0's l1: 21.2357
[321]	valid_0's l1: 21.2355
[322]	valid_0's l1: 21.2355
[323]	valid_0's l1: 21.2352
[324]	valid_0's l1: 21.2347
[325]	valid_0's l1: 21.2346
[326]	valid_0's l1: 21.2342
[327]	valid_0's l1: 21.2338
[328]	valid_0's l1: 21.2339
[329]	valid_0's l1: 21.2336
[330]	valid_0's l1: 21.2334
[331]	valid_0's l1: 21.233
[332]	valid_0's l1: 21.2326
[333]	valid_0's l1: 21.2324
[334]	valid_0's l1: 21.232
[335]	valid_0's l1: 21.2318
[336]	valid_0's l1: 21.2315


[601]	valid_0's l1: 21.177
[602]	valid_0's l1: 21.177
[603]	valid_0's l1: 21.1772
[604]	valid_0's l1: 21.177
[605]	valid_0's l1: 21.1769
[606]	valid_0's l1: 21.1766
[607]	valid_0's l1: 21.1765
[608]	valid_0's l1: 21.1763
[609]	valid_0's l1: 21.1761
[610]	valid_0's l1: 21.176
[611]	valid_0's l1: 21.1759
[612]	valid_0's l1: 21.1757
[613]	valid_0's l1: 21.1756
[614]	valid_0's l1: 21.1752
[615]	valid_0's l1: 21.1753
[616]	valid_0's l1: 21.1751
[617]	valid_0's l1: 21.1749
[618]	valid_0's l1: 21.1746
[619]	valid_0's l1: 21.1744
[620]	valid_0's l1: 21.174
[621]	valid_0's l1: 21.1739
[622]	valid_0's l1: 21.1736
[623]	valid_0's l1: 21.1735
[624]	valid_0's l1: 21.1736
[625]	valid_0's l1: 21.1735
[626]	valid_0's l1: 21.1734
[627]	valid_0's l1: 21.1731
[628]	valid_0's l1: 21.1729
[629]	valid_0's l1: 21.1729
[630]	valid_0's l1: 21.1729
[631]	valid_0's l1: 21.1727
[632]	valid_0's l1: 21.1727
[633]	valid_0's l1: 21.1726
[634]	valid_0's l1: 21.1727
[635]	valid_0's l1: 21.1728
[636]	valid_0's l1: 21.17

[897]	valid_0's l1: 21.1391
[898]	valid_0's l1: 21.1388
[899]	valid_0's l1: 21.1388
[900]	valid_0's l1: 21.1388
[901]	valid_0's l1: 21.1388
[902]	valid_0's l1: 21.139
[903]	valid_0's l1: 21.1391
[904]	valid_0's l1: 21.1389
[905]	valid_0's l1: 21.1388
[906]	valid_0's l1: 21.1386
[907]	valid_0's l1: 21.1384
[908]	valid_0's l1: 21.1382
[909]	valid_0's l1: 21.1381
[910]	valid_0's l1: 21.1379
[911]	valid_0's l1: 21.1378
[912]	valid_0's l1: 21.1378
[913]	valid_0's l1: 21.1376
[914]	valid_0's l1: 21.1372
[915]	valid_0's l1: 21.1371
[916]	valid_0's l1: 21.137
[917]	valid_0's l1: 21.1369
[918]	valid_0's l1: 21.137
[919]	valid_0's l1: 21.1371
[920]	valid_0's l1: 21.1367
[921]	valid_0's l1: 21.1365
[922]	valid_0's l1: 21.1361
[923]	valid_0's l1: 21.136
[924]	valid_0's l1: 21.1358
[925]	valid_0's l1: 21.1358
[926]	valid_0's l1: 21.1355
[927]	valid_0's l1: 21.1353
[928]	valid_0's l1: 21.1351
[929]	valid_0's l1: 21.1349
[930]	valid_0's l1: 21.1347
[931]	valid_0's l1: 21.1349
[932]	valid_0's l1: 21.1

[1186]	valid_0's l1: 21.1139
[1187]	valid_0's l1: 21.1137
[1188]	valid_0's l1: 21.1135
[1189]	valid_0's l1: 21.1134
[1190]	valid_0's l1: 21.1134
[1191]	valid_0's l1: 21.1132
[1192]	valid_0's l1: 21.1131
[1193]	valid_0's l1: 21.113
[1194]	valid_0's l1: 21.1129
[1195]	valid_0's l1: 21.1129
[1196]	valid_0's l1: 21.1129
[1197]	valid_0's l1: 21.1128
[1198]	valid_0's l1: 21.1126
[1199]	valid_0's l1: 21.1124
[1200]	valid_0's l1: 21.1122
[1201]	valid_0's l1: 21.1123
[1202]	valid_0's l1: 21.1121
[1203]	valid_0's l1: 21.1119
[1204]	valid_0's l1: 21.1119
[1205]	valid_0's l1: 21.112
[1206]	valid_0's l1: 21.1117
[1207]	valid_0's l1: 21.1118
[1208]	valid_0's l1: 21.1118
[1209]	valid_0's l1: 21.1117
[1210]	valid_0's l1: 21.1116
[1211]	valid_0's l1: 21.1117
[1212]	valid_0's l1: 21.1116
[1213]	valid_0's l1: 21.1114
[1214]	valid_0's l1: 21.1112
[1215]	valid_0's l1: 21.1113
[1216]	valid_0's l1: 21.1113
[1217]	valid_0's l1: 21.1112
[1218]	valid_0's l1: 21.1112
[1219]	valid_0's l1: 21.1112
[1220]	valid_0's

[1470]	valid_0's l1: 21.096
[1471]	valid_0's l1: 21.096
[1472]	valid_0's l1: 21.0959
[1473]	valid_0's l1: 21.0958
[1474]	valid_0's l1: 21.0959
[1475]	valid_0's l1: 21.0958
[1476]	valid_0's l1: 21.0956
[1477]	valid_0's l1: 21.0955
[1478]	valid_0's l1: 21.0954
[1479]	valid_0's l1: 21.0952
[1480]	valid_0's l1: 21.0951
[1481]	valid_0's l1: 21.0951
[1482]	valid_0's l1: 21.0952
[1483]	valid_0's l1: 21.0952
[1484]	valid_0's l1: 21.0951
[1485]	valid_0's l1: 21.0948
[1486]	valid_0's l1: 21.0949
[1487]	valid_0's l1: 21.0948
[1488]	valid_0's l1: 21.0947
[1489]	valid_0's l1: 21.0946
[1490]	valid_0's l1: 21.0946
[1491]	valid_0's l1: 21.0946
[1492]	valid_0's l1: 21.0945
[1493]	valid_0's l1: 21.0945
[1494]	valid_0's l1: 21.0946
[1495]	valid_0's l1: 21.0945
[1496]	valid_0's l1: 21.0946
[1497]	valid_0's l1: 21.0945
[1498]	valid_0's l1: 21.0946
[1499]	valid_0's l1: 21.0945
[1500]	valid_0's l1: 21.0946
[1501]	valid_0's l1: 21.0945
[1502]	valid_0's l1: 21.0944
[1503]	valid_0's l1: 21.0942
[1504]	valid_0's

[1756]	valid_0's l1: 21.0802
[1757]	valid_0's l1: 21.0801
[1758]	valid_0's l1: 21.0801
[1759]	valid_0's l1: 21.08
[1760]	valid_0's l1: 21.08
[1761]	valid_0's l1: 21.08
[1762]	valid_0's l1: 21.08
[1763]	valid_0's l1: 21.0799
[1764]	valid_0's l1: 21.0798
[1765]	valid_0's l1: 21.0798
[1766]	valid_0's l1: 21.0797
[1767]	valid_0's l1: 21.0795
[1768]	valid_0's l1: 21.0795
[1769]	valid_0's l1: 21.0796
[1770]	valid_0's l1: 21.0793
[1771]	valid_0's l1: 21.0793
[1772]	valid_0's l1: 21.0793
[1773]	valid_0's l1: 21.0794
[1774]	valid_0's l1: 21.0794
[1775]	valid_0's l1: 21.0793
[1776]	valid_0's l1: 21.0793
[1777]	valid_0's l1: 21.0793
[1778]	valid_0's l1: 21.0792
[1779]	valid_0's l1: 21.0793
[1780]	valid_0's l1: 21.0792
[1781]	valid_0's l1: 21.0791
[1782]	valid_0's l1: 21.079
[1783]	valid_0's l1: 21.079
[1784]	valid_0's l1: 21.0789
[1785]	valid_0's l1: 21.0789
[1786]	valid_0's l1: 21.0787
[1787]	valid_0's l1: 21.0787
[1788]	valid_0's l1: 21.0787
[1789]	valid_0's l1: 21.0788
[1790]	valid_0's l1: 21.

[2045]	valid_0's l1: 21.0715
[2046]	valid_0's l1: 21.0715
[2047]	valid_0's l1: 21.0715
[2048]	valid_0's l1: 21.0715
[2049]	valid_0's l1: 21.0714
[2050]	valid_0's l1: 21.0714
[2051]	valid_0's l1: 21.0713
[2052]	valid_0's l1: 21.0714
[2053]	valid_0's l1: 21.0715
[2054]	valid_0's l1: 21.0713
[2055]	valid_0's l1: 21.0712
[2056]	valid_0's l1: 21.0713
[2057]	valid_0's l1: 21.0712
[2058]	valid_0's l1: 21.0714
[2059]	valid_0's l1: 21.0713
[2060]	valid_0's l1: 21.0713
[2061]	valid_0's l1: 21.0711
[2062]	valid_0's l1: 21.0712
[2063]	valid_0's l1: 21.0712
[2064]	valid_0's l1: 21.0712
[2065]	valid_0's l1: 21.0712
[2066]	valid_0's l1: 21.0711
[2067]	valid_0's l1: 21.071
[2068]	valid_0's l1: 21.071
[2069]	valid_0's l1: 21.071
[2070]	valid_0's l1: 21.071
[2071]	valid_0's l1: 21.0709
[2072]	valid_0's l1: 21.071
[2073]	valid_0's l1: 21.071
[2074]	valid_0's l1: 21.071
[2075]	valid_0's l1: 21.0712
[2076]	valid_0's l1: 21.071
[2077]	valid_0's l1: 21.0708
[2078]	valid_0's l1: 21.0707
[2079]	valid_0's l1: 2

[2336]	valid_0's l1: 21.0646
[2337]	valid_0's l1: 21.0644
[2338]	valid_0's l1: 21.0644
[2339]	valid_0's l1: 21.0645
[2340]	valid_0's l1: 21.0643
[2341]	valid_0's l1: 21.0642
[2342]	valid_0's l1: 21.0641
[2343]	valid_0's l1: 21.0641
[2344]	valid_0's l1: 21.064
[2345]	valid_0's l1: 21.064
[2346]	valid_0's l1: 21.064
[2347]	valid_0's l1: 21.0641
[2348]	valid_0's l1: 21.064
[2349]	valid_0's l1: 21.0639
[2350]	valid_0's l1: 21.064
[2351]	valid_0's l1: 21.064
[2352]	valid_0's l1: 21.0639
[2353]	valid_0's l1: 21.0638
[2354]	valid_0's l1: 21.0638
[2355]	valid_0's l1: 21.0638
[2356]	valid_0's l1: 21.0638
[2357]	valid_0's l1: 21.0636
[2358]	valid_0's l1: 21.0636
[2359]	valid_0's l1: 21.0636
[2360]	valid_0's l1: 21.0636
[2361]	valid_0's l1: 21.0634
[2362]	valid_0's l1: 21.0634
[2363]	valid_0's l1: 21.0633
[2364]	valid_0's l1: 21.0634
[2365]	valid_0's l1: 21.0635
[2366]	valid_0's l1: 21.0635
[2367]	valid_0's l1: 21.0635
[2368]	valid_0's l1: 21.0633
[2369]	valid_0's l1: 21.0634
[2370]	valid_0's l1:

## Fitting on texts

### LightGBM and TFIDF Vectorizer

In [23]:
text_column_train, text_column_test, y_train, y_test = train_test_split(text_column, y_sub_train, test_size = 0.33)

In [26]:
tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1, 2), tokenizer = tokenize_sent)

In [27]:
%%time
text_column_tfidf_train = tfidf.fit_transform(text_column_train)

Wall time: 53min 17s


In [28]:
%%time
text_column_tfidf_test = tfidf.transform(text_column_test)

Wall time: 28min 8s


In [29]:
param = {'num_trees': 100000, 'application':'regression',
         'learning_rate': 0.01, 'num_threads': 7, 'max_depth': 10,
         'lambda_l2': 1e-3}
param['metric'] = 'mae'

In [47]:
train_data = lgb.Dataset(text_column_tfidf_train, label=y_train)
test_data = train_data.create_valid(text_column_tfidf_test, label=y_test)

In [48]:
%%time 
bst = lgb.train(param, train_data, param['num_trees'], valid_sets=[test_data], early_stopping_rounds=50)



[1]	valid_0's l1: 21.371
Training until validation scores don't improve for 50 rounds
[2]	valid_0's l1: 21.3714
[3]	valid_0's l1: 21.3717
[4]	valid_0's l1: 21.3721
[5]	valid_0's l1: 21.3724
[6]	valid_0's l1: 21.3726
[7]	valid_0's l1: 21.373
[8]	valid_0's l1: 21.3732
[9]	valid_0's l1: 21.3737
[10]	valid_0's l1: 21.3738
[11]	valid_0's l1: 21.3742
[12]	valid_0's l1: 21.3745
[13]	valid_0's l1: 21.3748
[14]	valid_0's l1: 21.3751
[15]	valid_0's l1: 21.3753
[16]	valid_0's l1: 21.3755
[17]	valid_0's l1: 21.3758
[18]	valid_0's l1: 21.3762
[19]	valid_0's l1: 21.3763
[20]	valid_0's l1: 21.3766
[21]	valid_0's l1: 21.3767
[22]	valid_0's l1: 21.377
[23]	valid_0's l1: 21.3769
[24]	valid_0's l1: 21.3773
[25]	valid_0's l1: 21.3776
[26]	valid_0's l1: 21.3781
[27]	valid_0's l1: 21.3784
[28]	valid_0's l1: 21.3787
[29]	valid_0's l1: 21.3788
[30]	valid_0's l1: 21.3792
[31]	valid_0's l1: 21.3794
[32]	valid_0's l1: 21.3797
[33]	valid_0's l1: 21.3799
[34]	valid_0's l1: 21.3803
[35]	valid_0's l1: 21.3807
[36]	v

### Fitting on TF-IDF Vectorized texts and titles

In [44]:
text_column_tfidf_train_df = pd.DataFrame(text_column_tfidf_train)
text_column_tfidf_test_df = pd.DataFrame(text_column_tfidf_test)
title_column_tfidf_test_df = pd.DataFrame(title_column_tfidf_test)
title_column_tfidf_train_df = pd.DataFrame(title_column_tfidf_train)

In [45]:
X_text_title_train = pd.concat([text_column_tfidf_train_df, title_column_tfidf_train_df], axis=1)
X_text_title_test = pd.concat([text_column_tfidf_test_df, title_column_tfidf_test_df], axis=1)

In [46]:
param = {'num_trees': 100000, 'application':'regression',
         'learning_rate': 0.01, 'num_threads': 7, 'max_depth': 10,
         'lambda_l2': 1e-3}
param['metric'] = 'mae'

In [30]:
train_data = lgb.Dataset(X_text_title_train, label=y_train)
test_data = train_data.create_valid(X_text_title_test, label=y_test)

### Mixing Vectorized titles and other features

In [51]:
time_feats = ['year', 'month', 'day', 'time', 'weekday', 'text', 'title', 'links_count']

In [52]:
print(X_feats_compose_df.shape)
print(y_sub_train.shape)

(142008, 9)
(106349,)


In [53]:
X_feats_compose_df.head()

Unnamed: 0,year,month,day,time,weekday,text,title,links_count,0
0,2006.0,10.0,24.0,1249.0,1.0,организация internet watch foundation в ходе ...,в сша и россии размещено больше всего сайтов с...,2.0,"(0, 28290)\t0.36538921860281814\n (0, 42489..."
1,,,,,,,,,"(0, 24133)\t0.38271242760319873\n (0, 37781..."
2,,,,,,,,,"(0, 40370)\t0.438447190793032\n (0, 26091)\..."
3,,,,,,,,,"(0, 8753)\t1.0"
4,,,,,,,,,"(0, 46545)\t0.4302819000116315\n (0, 9807)\..."


In [43]:
X_feats_compose_df = pd.DataFrame(X_sub_train[time_feats])
#X_feats_compose_train_df = pd.DataFrame(X_sub_train[time_feats])
#X_feats_compose_test_df = pd.DataFrame(X_sub_test[time_feats])
title_column_tfidf_train_df = pd.DataFrame(title_column_tfidf_train)
#title_column_tfidf_test_df = pd.DataFrame(title_column_tfidf_test)
X_feats_compose_df = pd.concat([X_sub_train[time_feats], title_column_tfidf_train_df], axis = 1)
#X_feats_compose_test_df = pd.concat([X_sub_test[time_feats], title_column_tfidf_test_df], axis = 1)

In [44]:
X_feats_compose_train_df, X_feats_compose_test_df, y_feats_compose_train, y_feats_compose_test = train_test_split(X_feats_compose_df,
                                                                                                                  y_sub_train, test_size = 0.3) 

ValueError: Found input variables with inconsistent numbers of samples: [142008, 106349]

In [32]:
param = {'num_trees': 100000, 'application':'regression',
         'learning_rate': 0.01, 'num_threads': 7, 'max_depth': 10,
         'lambda_l2': 1e-3}
param['metric'] = 'mae'

In [None]:
train_data = lgb.Dataset(X_feats_compose_train_df, label=y_train)
test_data = train_data.create_valid(X_feats_compose_test_df, label=y_test)

препроцессинг: удалить все формулы и математические знаки
Признаки: сам текст, длина текста (количество слов), длину заголовка тоже учитывать
Использовать различные ворд эмбединги: bag-of-words, word2vec
Можно использовать структуру cookie-cutter DS для красоты структуры проекта

In [None]:
%%time
data['text'] = data.text.apply(clean_text)
data['tittle'] = data.tittle.apply(clean_text)
data['text_tokenized'] = data.text.apply(lambda x: [tokenize_sent(i) for i in x])
data['tittle_tokenized'] = data.tittle.apply(tokenize_sent)



In [None]:
%%time
data["text_sizes"] = raw_data.text.apply(lambda x: [len(i) for i in x])
data["text_sizes_mean"] = raw_data.text_sizes.apply(mean_p)
data["text_sizes_std"] = raw_data.text_sizes.apply(std_p)
data["text_words_count"] = raw_data.text_tokenized.apply(lambda x: len(list(itertools.chain(*x))))
data["tittle_words_count"] = raw_data.tittle_tokenized.apply(lambda x: len(x))
