In [1]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import scipy
import lightgbm as lgb

In [2]:
# Pre-computed aggregated features; joined onto train/test by `user_id` in the next cell.
gp = pd.read_csv(filepath_or_buffer='aggregated_features.csv')

In [3]:
# Read the raw data, parsing the activation date into datetimes.
date_columns = ['activation_date']
train = pd.read_csv('train.csv', parse_dates=date_columns)
test = pd.read_csv('test.csv', parse_dates=date_columns)

# Every column of `gp` after the first one is treated as an aggregated feature
# (presumably the first column is the `user_id` join key -- verify against the CSV).
agg_cols = gp.columns.tolist()[1:]

# Left joins keep every ad; users absent from `gp` get NaN in the aggregated columns.
train = pd.merge(train, gp, how='left', on='user_id')
test = pd.merge(test, gp, how='left', on='user_id')

# The aggregate table is no longer needed once merged; release its memory.
del gp
gc.collect()

train.head()

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,...,activation_date,user_type,image,image_top_1,deal_probability,avg_days_up_user,avg_times_up_user,med_days_up_user,med_times_up_user,n_user_items
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,Кокоби(кокон для сна),...,2017-03-28,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789,8.0,2.0,8.0,2.0,2.0
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,Стойка для Одежды,...,2017-03-26,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0,,,,,
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,Philips bluray,...,2017-03-20,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177,4.428571,1.142857,3.0,1.0,9.0
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Автокресло,...,2017-03-25,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323,16.714286,2.642857,18.0,3.0,32.0
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110.0,"ВАЗ 2110, 2003",...,2017-03-16,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797,,,,,


In [4]:
# Order ads chronologically, then renumber the rows so labels equal positions again.
train = train.sort_values(by=['activation_date'])
train = train.reset_index(drop=True)

In [5]:
# Random 90/10 split of row positions; the fixed seed keeps it reproducible.
row_positions = np.arange(len(train))
train_index, valid_index = train_test_split(row_positions, test_size=0.1, random_state=519)

In [6]:
# Weekday distribution (0 = Monday ... 6 = Sunday) of the training part of the split.
train_part_dates = train.loc[train_index, 'activation_date']
train_part_dates.dt.weekday.value_counts()

0    207048
6    205318
1    201131
2    196631
3    191293
4    176527
5    175133
Name: activation_date, dtype: int64

In [7]:
# Weekday distribution (0 = Monday ... 6 = Sunday) of the validation part of the split.
valid_part_dates = train.loc[valid_index, 'activation_date']
valid_part_dates.dt.weekday.value_counts()

0    23007
6    22614
1    22289
2    21884
3    21422
4    19599
5    19528
Name: activation_date, dtype: int64

In [8]:
# Weekday distribution (0 = Monday ... 6 = Sunday) of the test set, for comparison.
test_dates = test['activation_date']
test_dates.dt.weekday.value_counts()

2    81888
1    81114
0    80191
3    77177
4    70366
6    58909
5    58793
Name: activation_date, dtype: int64

One more thing about the approach that I haven't mentioned yet: we will have quite a few NaN values, because not every ID in `train` and `test` occurs in `train_active` and `test_active`. Let's check how big that problem is.

In [9]:
# Percentage of train rows that lack at least one aggregated feature.
train_rows_missing_agg = train[agg_cols].isnull().any(axis=1)
train_rows_missing_agg.sum() / len(train) * 100

22.407185198586692

In [10]:
# Percentage of test rows that lack at least one aggregated feature.
test_rows_missing_agg = test[agg_cols].isnull().any(axis=1)
test_rows_missing_agg.sum() / len(test) * 100

24.354198545348694

In [11]:
# Per-column count of missing values in each frame, shown side by side.
train_null_counts = train.isnull().sum()
test_null_counts = test.isnull().sum()
train_null_counts, test_null_counts

(item_id                      0
 user_id                      0
 region                       0
 city                         0
 parent_category_name         0
 category_name                0
 param_1                  61576
 param_2                 654542
 param_3                 862565
 title                        0
 description             116276
 price                    85362
 item_seq_number              0
 activation_date              0
 user_type                    0
 image                   112588
 image_top_1             112588
 deal_probability             0
 avg_days_up_user        336875
 avg_times_up_user       336875
 med_days_up_user        336875
 med_times_up_user       336875
 n_user_items            336875
 dtype: int64, item_id                      0
 user_id                      0
 region                       0
 city                         0
 parent_category_name         0
 category_name                0
 param_1                  22910
 param_2                 2

We have missing features for 22.41% of train and 24.35% of test data. That's not perfect but certainly acceptable. Onto some more basic feature engineering with ideas from [a great kernel](https://www.kaggle.com/tunguz/bow-meta-text-and-dense-features-lb-0-2241?scriptVersionId=3603709).

In [12]:
import string

In [13]:
def count(l1, l2):
    """Return how many items of `l1` are members of `l2`."""
    return sum(1 for x in l1 if x in l2)


# Built once here instead of once per row inside `apply`.
_punctuation = set(string.punctuation)

for df in [train, test]:
    # Columns are assigned back rather than filled with
    # `df[col].fillna(..., inplace=True)`: an in-place call on a column selection
    # does not reliably update `df` on pandas versions with Copy-on-Write.
    df['description'] = df['description'].fillna('unknowndescription')
    df['title'] = df['title'].fillna('unknowntitle')

    # Day of the week (0 = Monday ... 6 = Sunday). `.dt.day` would store the day
    # of the month, which is not what the column name promises.
    df['weekday'] = pd.to_datetime(df['activation_date']).dt.weekday

    for col in ['description', 'title']:
        df['num_words_' + col] = df[col].apply(lambda comment: len(comment.split()))
        df['num_unique_words_' + col] = df[col].apply(lambda comment: len(set(comment.split())))

    # Share of distinct words, in percent. A text without any words gives 0 / 0 = NaN.
    df['words_vs_unique_title'] = df['num_unique_words_title'] / df['num_words_title'] * 100
    df['words_vs_unique_description'] = df['num_unique_words_description'] / df['num_words_description'] * 100

    # Prefix the city with its region so equal city names in different regions stay distinct.
    # NOTE: not idempotent -- re-running this cell prepends the region a second time.
    df['city'] = df['region'] + '_' + df['city']
    df['num_desc_punct'] = df['description'].apply(lambda x: count(x, _punctuation))

    # -1 marks "missing" for the numeric columns.
    for col in [*agg_cols, 'price', 'image_top_1']:
        df[col] = df[col].fillna(-1)

    # Missing listing parameters share one sentinel string, which later becomes its own category.
    for col in ['param_1', 'param_2', 'param_3']:
        df[col] = df[col].fillna('khwinkaggle')

In [14]:
# Fetch NLTK's stop-word corpus, which `stopwords.words('russian')` reads in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\viola\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# One stop-word list shared by both vectorizers.
russian_stopwords = stopwords.words('russian')
n_train = len(train)

# Title bag-of-words; terms found in fewer than 25 documents are dropped.
count_vectorizer_title = CountVectorizer(stop_words=russian_stopwords, lowercase=True, min_df=25)

# Fit on train and test together so both matrices share one vocabulary.
# `pd.concat` is used because `Series.append` was removed in pandas 2.0.
title_counts = count_vectorizer_title.fit_transform(pd.concat([train['title'], test['title']]))

# The stacked matrix has the train rows first, so slice it back apart at `n_train`.
train_title_counts = title_counts[:n_train]
test_title_counts = title_counts[n_train:]


# TF-IDF over description uni- and bigrams, capped at 15000 features.
# (Despite the `count_` prefix this is a TfidfVectorizer; later cells use this name.)
count_vectorizer_desc = TfidfVectorizer(stop_words=russian_stopwords,
                                        lowercase=True, ngram_range=(1, 2),
                                        max_features=15000)

desc_counts = count_vectorizer_desc.fit_transform(pd.concat([train['description'], test['description']]))

train_desc_counts = desc_counts[:n_train]
test_desc_counts = desc_counts[n_train:]

train_title_counts.shape, train_desc_counts.shape

((1503424, 16447), (1503424, 15000))

In [16]:
# Column the model learns to predict.
target = 'deal_probability'

# Numeric inputs: text statistics, per-user aggregates and raw listing fields.
predictors = [
    'num_desc_punct',
    'words_vs_unique_description',
    'num_unique_words_description',
    'num_unique_words_title',
    'num_words_description',
    'num_words_title',
    'avg_times_up_user',
    'avg_days_up_user',
    'n_user_items',
    'price',
    'item_seq_number',
]

# Columns that get label-encoded and handed to the model as categorical features.
categorical = [
    'image_top_1',
    'param_1',
    'param_2',
    'param_3',
    'city',
    'region',
    'category_name',
    'parent_category_name',
    'user_type',
]

# Full model input: numeric columns first, then the categorical ones.
predictors = [*predictors, *categorical]

In [17]:
# Replace every categorical column with integer codes.
# NOTE: the columns are overwritten in place, so re-running this cell re-encodes the codes.
for feature in categorical:
    print(f'Transforming {feature}...')
    encoder = LabelEncoder()
    # Fit on train and test together so a value seen in only one of them still gets a code.
    # `pd.concat` is used because `Series.append` was removed in pandas 2.0.
    encoder.fit(pd.concat([train[feature], test[feature]]).astype(str))

    train[feature] = encoder.transform(train[feature].astype(str))
    test[feature] = encoder.transform(test[feature].astype(str))

Transforming image_top_1...
Transforming param_1...
Transforming param_2...
Transforming param_3...
Transforming city...
Transforming region...
Transforming category_name...
Transforming parent_category_name...
Transforming user_type...


## Cat Interaction

In [18]:
# Pairs of categorical columns that are crossed into interaction features below.
pairs = [
    ('parent_category_name', 'category_name'),
    ('parent_category_name', 'param_1'),
    ('parent_category_name', 'param_2'),
    ('parent_category_name', 'param_3'),
    ('category_name', 'param_1'),
    ('category_name', 'param_2'),
    ('category_name', 'param_3'),
    ('parent_category_name', 'region'),
    ('category_name', 'region'),
    ('user_type', 'region'),
    # ('user_type', 'city') is left out: too many distinct combinations.
    ('user_type', 'parent_category_name'),
    ('user_type', 'category_name'),
    # Crossing the category columns with 'image_top_1' is disabled as well.
]

In [19]:
# Names of the interaction columns: '<first column>_<second column>' for each crossed pair.
labelize_cols = [
    'parent_category_name_category_name',
    'parent_category_name_param_1',
    'parent_category_name_param_2',
    'parent_category_name_param_3',
    'category_name_param_1',
    'category_name_param_2',
    'category_name_param_3',
    'parent_category_name_region',
    'category_name_region',
    'user_type_region',
    'user_type_parent_category_name',
    'user_type_category_name',
]

In [20]:
# Build one label-encoded interaction column per pair of categorical columns.
for pair in pairs:
    col1, col2 = pair
    print('Processing pair:', pair)
    new_feature = col1 + '_' + col2

    # Cross the two columns by joining their string forms, e.g. '3' and '17' -> '3_17'.
    # Plain column assignment replaces an existing column outright, so a re-run
    # does not try to write strings into a previously encoded integer column.
    train[new_feature] = train[col1].astype(str) + '_' + train[col2].astype(str)
    test[new_feature] = test[col1].astype(str) + '_' + test[col2].astype(str)

    encoder = LabelEncoder()
    # Fit on train and test together so every combination gets a code.
    # `pd.concat` is used because `Series.append` was removed in pandas 2.0.
    encoder.fit(pd.concat([train[new_feature], test[new_feature]]).astype(str))

    train[new_feature] = encoder.transform(train[new_feature].astype(str))
    test[new_feature] = encoder.transform(test[new_feature].astype(str))

Processing pair: ('parent_category_name', 'category_name')
Processing pair: ('parent_category_name', 'param_1')
Processing pair: ('parent_category_name', 'param_2')
Processing pair: ('parent_category_name', 'param_3')
Processing pair: ('category_name', 'param_1')
Processing pair: ('category_name', 'param_2')
Processing pair: ('category_name', 'param_3')
Processing pair: ('parent_category_name', 'region')
Processing pair: ('category_name', 'region')
Processing pair: ('user_type', 'region')
Processing pair: ('user_type', 'parent_category_name')
Processing pair: ('user_type', 'category_name')


In [21]:
# Register the interaction columns as categorical features. Only names that are not
# listed yet are added, so re-running this cell does not create duplicates.
categorical.extend([col for col in labelize_cols if col not in categorical])

## Test Models

In [25]:
# Column the model learns to predict.
target = 'deal_probability'

# Numeric inputs: text statistics, per-user aggregates and raw listing fields.
predictors = [
    'num_desc_punct',
    'words_vs_unique_description',
    'num_unique_words_description',
    'num_unique_words_title',
    'num_words_description',
    'num_words_title',
    'avg_times_up_user',
    'avg_days_up_user',
    'n_user_items',
    'price',
    'item_seq_number',
]

# Label-encoded columns: the base categoricals followed by the interaction features.
categorical = [
    'image_top_1',
    'param_1',
    'param_2',
    'param_3',
    'city',
    'region',
    'category_name',
    'parent_category_name',
    'user_type',
    'parent_category_name_category_name',
    'parent_category_name_param_1',
    'parent_category_name_param_2',
    'parent_category_name_param_3',
    'category_name_param_1',
    'category_name_param_2',
    'category_name_param_3',
    'parent_category_name_region',
    'category_name_region',
    'user_type_region',
    'user_type_parent_category_name',
    'user_type_category_name',
]

# Full model input: numeric columns first, then the categorical ones.
predictors = [*predictors, *categorical]

After some hyperparameter definitions and creating train / valid / test matrices, we can finally train the model. Let's see if the aggregated features helped.

*Note: For further feature engineering, I would recommend restricting the max_depth further (5 worked well for me) and increasing the learning rate (to ~ 0.1) so you don't have to wait forever for the training to finish.*

In [26]:
# Upper bound on boosting rounds and the early-stopping patience passed to `lgb.train` below.
rounds = 24000
early_stop_rounds = 200
params = {
    'objective' : 'regression',
    'metric' : 'rmse',
    'num_leaves' : 48,
    'max_depth': 15,
    'learning_rate' : 0.02,
    'feature_fraction' : 0.6,
    'verbosity' : -1
}


def _vocabulary_terms(vectorizer):
    """Return the fitted vectorizer's terms, ordered by column index, as a list.

    scikit-learn 1.0 introduced `get_feature_names_out()` and 1.2 removed
    `get_feature_names()`, so call whichever one the installed version provides.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return list(vectorizer.get_feature_names())


# Names follow the column order of the stacked matrices built below:
# description TF-IDF terms, title count terms, then the dense predictors.
# NOTE(review): a term present in both vocabularies yields a repeated name here --
# confirm that the installed LightGBM accepts duplicate feature names.
feature_names = np.hstack([
    _vocabulary_terms(count_vectorizer_desc),
    _vocabulary_terms(count_vectorizer_title),
    predictors
])
print('Number of features:', len(feature_names))

Number of features: 31484


In [27]:
from sklearn.model_selection import KFold

# Five shuffled folds; the fixed seed keeps the fold assignment reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=411)

In [28]:
# Test design matrix. The column blocks follow the order used for `feature_names`:
# description TF-IDF, title counts, dense predictors.
test_blocks = [test_desc_counts, test_title_counts, test[predictors]]
x_test = scipy.sparse.hstack(test_blocks, format='csr')

In [29]:
# Training design matrix, stacked in the same column order as the test matrix.
train_blocks = [train_desc_counts, train_title_counts, train[predictors]]
x_train = scipy.sparse.hstack(train_blocks, format='csr')

# Target values as a plain NumPy array, aligned with the rows of `x_train`.
y_train = train[target].values

In [30]:
from GridSearcher import data_loader, model_loader, fit_params, get_oof_predictions
from sklearn.model_selection import train_test_split

In [31]:
# Model handle obtained from the project's GridSearcher helper for the 'lgb' key.
ml = model_loader('lgb')

# Seed reused for the model's random state and the parameter search below.
SEED = 719

In [32]:
# Baseline estimator settings handed to the GridSearcher helper.
default_params = dict(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=5,
    learning_rate=0.1,
    n_estimators=2000,
    min_split_gain=0.0,
    min_child_weight=0.001,
    min_child_samples=20,
    subsample=1.0,
    subsample_freq=1,
    colsample_bytree=0.6,
    reg_alpha=0.0,
    reg_lambda=0.0,
    random_state=SEED,
    n_jobs=3,
)

# Keyword arguments forwarded to the fit call: feature metadata plus early-stopping and logging options.
fit_param = dict(
    feature_name=list(feature_names),
    categorical_feature=categorical,
    early_stopping_rounds=50,
    verbose=100,
    eval_metric='rmse',
)

# Parameter values to try; a single candidate here, so this is one cross-validated baseline run.
try_params = dict(random_state=[719])

fit_params(x_train, y_train, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=True)



Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 0.222178	valid's rmse: 0.225392
[200]	train's rmse: 0.219733	valid's rmse: 0.224142
[300]	train's rmse: 0.218028	valid's rmse: 0.223395
[400]	train's rmse: 0.216692	valid's rmse: 0.222917
[500]	train's rmse: 0.215484	valid's rmse: 0.222488
[600]	train's rmse: 0.214536	valid's rmse: 0.222198
[700]	train's rmse: 0.21368	valid's rmse: 0.221961
[800]	train's rmse: 0.212794	valid's rmse: 0.221718
[900]	train's rmse: 0.212131	valid's rmse: 0.221577
[1000]	train's rmse: 0.211567	valid's rmse: 0.221464
[1100]	train's rmse: 0.210834	valid's rmse: 0.221298
[1200]	train's rmse: 0.210197	valid's rmse: 0.221179
[1300]	train's rmse: 0.209664	valid's rmse: 0.221062
[1400]	train's rmse: 0.209088	valid's rmse: 0.220955
[1500]	train's rmse: 0.208599	valid's rmse: 0.220886
[1600]	train's rmse: 0.208138	valid's rmse: 0.220816
[1700]	train's rmse: 0.207498	valid's rmse: 0.220722
[1800]	train's rmse: 0.20707	valid's rmse: 0.22



Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 0.222232	valid's rmse: 0.224877
[200]	train's rmse: 0.219528	valid's rmse: 0.22341
[300]	train's rmse: 0.217869	valid's rmse: 0.222723
[400]	train's rmse: 0.21647	valid's rmse: 0.222196
[500]	train's rmse: 0.215417	valid's rmse: 0.221873
[600]	train's rmse: 0.21449	valid's rmse: 0.221613
[700]	train's rmse: 0.213545	valid's rmse: 0.221351
[800]	train's rmse: 0.212774	valid's rmse: 0.221197
[900]	train's rmse: 0.21217	valid's rmse: 0.22105
[1000]	train's rmse: 0.211541	valid's rmse: 0.22093
[1100]	train's rmse: 0.210841	valid's rmse: 0.220778
[1200]	train's rmse: 0.210183	valid's rmse: 0.220641
[1300]	train's rmse: 0.209634	valid's rmse: 0.220534
[1400]	train's rmse: 0.20916	valid's rmse: 0.220461
[1500]	train's rmse: 0.208549	valid's rmse: 0.220353
[1600]	train's rmse: 0.208124	valid's rmse: 0.220303
[1700]	train's rmse: 0.207752	valid's rmse: 0.220259
[1800]	train's rmse: 0.20713	valid's rmse: 0.220151
[



Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 0.222269	valid's rmse: 0.22478
[200]	train's rmse: 0.219571	valid's rmse: 0.223431
[300]	train's rmse: 0.217895	valid's rmse: 0.222719
[400]	train's rmse: 0.216458	valid's rmse: 0.222174
[500]	train's rmse: 0.215417	valid's rmse: 0.221843
[600]	train's rmse: 0.214505	valid's rmse: 0.221613
[700]	train's rmse: 0.213613	valid's rmse: 0.221396
[800]	train's rmse: 0.212907	valid's rmse: 0.221223
[900]	train's rmse: 0.212104	valid's rmse: 0.221033
[1000]	train's rmse: 0.211368	valid's rmse: 0.220847
[1100]	train's rmse: 0.210714	valid's rmse: 0.220734
[1200]	train's rmse: 0.210063	valid's rmse: 0.220573
[1300]	train's rmse: 0.209482	valid's rmse: 0.220467
[1400]	train's rmse: 0.209001	valid's rmse: 0.220385
[1500]	train's rmse: 0.208512	valid's rmse: 0.22033
[1600]	train's rmse: 0.207937	valid's rmse: 0.220236
[1700]	train's rmse: 0.207506	valid's rmse: 0.220159
[1800]	train's rmse: 0.206984	valid's rmse: 0.22



Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 0.222132	valid's rmse: 0.225229
[200]	train's rmse: 0.219486	valid's rmse: 0.223918
[300]	train's rmse: 0.217795	valid's rmse: 0.223207
[400]	train's rmse: 0.21644	valid's rmse: 0.222704
[500]	train's rmse: 0.21526	valid's rmse: 0.222308
[600]	train's rmse: 0.214264	valid's rmse: 0.222003
[700]	train's rmse: 0.213476	valid's rmse: 0.221779
[800]	train's rmse: 0.212684	valid's rmse: 0.221573
[900]	train's rmse: 0.211945	valid's rmse: 0.221394
[1000]	train's rmse: 0.211268	valid's rmse: 0.221229
[1100]	train's rmse: 0.21059	valid's rmse: 0.22108
[1200]	train's rmse: 0.210039	valid's rmse: 0.220977
[1300]	train's rmse: 0.209504	valid's rmse: 0.220865
[1400]	train's rmse: 0.208833	valid's rmse: 0.220761
[1500]	train's rmse: 0.208296	valid's rmse: 0.220668
[1600]	train's rmse: 0.207811	valid's rmse: 0.220588
[1700]	train's rmse: 0.207354	valid's rmse: 0.220515
[1800]	train's rmse: 0.206945	valid's rmse: 0.2204



Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 0.222259	valid's rmse: 0.223884
[200]	train's rmse: 0.219827	valid's rmse: 0.22267
[300]	train's rmse: 0.218091	valid's rmse: 0.22192
[400]	train's rmse: 0.216836	valid's rmse: 0.221464
[500]	train's rmse: 0.21564	valid's rmse: 0.221046
[600]	train's rmse: 0.21466	valid's rmse: 0.220774
[700]	train's rmse: 0.213829	valid's rmse: 0.220562
[800]	train's rmse: 0.213097	valid's rmse: 0.220377
[900]	train's rmse: 0.2124	valid's rmse: 0.220224
[1000]	train's rmse: 0.211716	valid's rmse: 0.220069
[1100]	train's rmse: 0.211088	valid's rmse: 0.219942
[1200]	train's rmse: 0.210436	valid's rmse: 0.21984
[1300]	train's rmse: 0.209933	valid's rmse: 0.21976
[1400]	train's rmse: 0.209373	valid's rmse: 0.219644
[1500]	train's rmse: 0.208733	valid's rmse: 0.219536
[1600]	train's rmse: 0.208259	valid's rmse: 0.219479
[1700]	train's rmse: 0.207829	valid's rmse: 0.219407
[1800]	train's rmse: 0.207367	valid's rmse: 0.219355
[

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'random_state': 719},0.219968,0.000448


## Gen Submissions

In [None]:
# Out-of-fold predictions for train, summed test predictions, and the per-fold models.
n_train_rows = train.shape[0]
ret = np.zeros((n_train_rows,))
ret_test = np.zeros((x_test.shape[0],))
ret_models = []


def _fold_dataset(rows):
    """Wrap the given rows of `x_train` / `y_train` in a LightGBM dataset."""
    return lgb.Dataset(x_train[rows, :], label=y_train[rows],
                       feature_name=list(feature_names),
                       categorical_feature=categorical)


for train_ix, val_ix in kf.split(train):
    dtrain = _fold_dataset(train_ix)
    dvalid = _fold_dataset(val_ix)

    # Train with early stopping monitored on this fold's held-out rows.
    model = lgb.train(params, dtrain,
                      num_boost_round=rounds,
                      valid_sets=[dtrain, dvalid],
                      valid_names=['train', 'valid'],
                      early_stopping_rounds=early_stop_rounds,
                      verbose_eval=100)

    # Each train row is predicted exactly once, by the model that did not train on it.
    ret[val_ix] = model.predict(x_train[val_ix, :])
    # Test predictions are summed here and averaged in a later cell.
    ret_test += model.predict(x_test)
    ret_models.append(model)

    # Release the fold datasets before the next iteration builds new ones.
    del dtrain, dvalid



Training until validation scores don't improve for 200 rounds.
[100]	train's rmse: 0.227302	valid's rmse: 0.228307
[200]	train's rmse: 0.222449	valid's rmse: 0.224899
[300]	train's rmse: 0.219561	valid's rmse: 0.223295
[400]	train's rmse: 0.217639	valid's rmse: 0.222424
[500]	train's rmse: 0.216249	valid's rmse: 0.221881
[600]	train's rmse: 0.215163	valid's rmse: 0.221479
[700]	train's rmse: 0.214225	valid's rmse: 0.221167
[800]	train's rmse: 0.213412	valid's rmse: 0.220925
[900]	train's rmse: 0.212658	valid's rmse: 0.220709
[1000]	train's rmse: 0.211986	valid's rmse: 0.22052
[1100]	train's rmse: 0.21138	valid's rmse: 0.220356
[1200]	train's rmse: 0.210782	valid's rmse: 0.220204
[1300]	train's rmse: 0.210244	valid's rmse: 0.220079
[1400]	train's rmse: 0.209696	valid's rmse: 0.219966
[1500]	train's rmse: 0.20921	valid's rmse: 0.219853
[1600]	train's rmse: 0.208748	valid's rmse: 0.219768
[1700]	train's rmse: 0.2083	valid's rmse: 0.219678
[1800]	train's rmse: 0.207868	valid's rmse: 0.2196

In [None]:
# Turn the per-fold sum accumulated by the training loop into a mean. Dividing by the
# number of trained models keeps this in step with the fold count instead of hard-coding it.
# NOTE: run once per training run -- every further execution divides the predictions again.
ret_test = ret_test / len(ret_models)

In [None]:
# One chart per fold model, limited to 50 features each.
for model in ret_models:
    fig, ax = plt.subplots(figsize=(10, 14))
    lgb.plot_importance(model, ax=ax, max_num_features=50)
    # Set after plotting so this title replaces the one `plot_importance` puts on the axes.
    ax.set_title("Light GBM Feature Importance")
    plt.show()

That looks good. But the model is still kind of a black box, so it is a good idea to look at its feature importances, which the cell above plots for each fold's model.

`avg_days_up_user`, `avg_times_up_user` and `n_user_items` are our most important engineered features! Looks like we were successful. Now we just have to predict the test matrix and submit!

In [None]:
# Tag prepended to every file this notebook writes (OOF predictions, submission, models).
prefix = 'cat_interact_'

In [None]:
# Save the out-of-fold train predictions and the averaged test predictions
# under the same column name.
pred_column = prefix + 'lgb_pred'

oof_val_frame = pd.DataFrame(data=ret, columns=[pred_column])
oof_val_frame.to_csv(prefix + 'lgb_oof_val_pred.csv', index=False)

oof_test_frame = pd.DataFrame(data=ret_test, columns=[pred_column])
oof_test_frame.to_csv(prefix + 'lgb_oof_test_pred.csv', index=False)

In [None]:
# The target is a probability, so clamp the regression output to [0, 1].
clipped_predictions = np.clip(ret_test, 0, 1)

# Fill the sample submission with the predictions.
# Assumes its rows are in the same order as `test.csv` -- TODO confirm.
subm = pd.read_csv('sample_submission.csv')
subm['deal_probability'] = clipped_predictions

submission_path = prefix + 'submission.csv'
subm.to_csv(submission_path, index=False)

In [None]:
import os
import pickle

# Persist every fold's model. Each fold gets its own file: writing all of them to a
# single shared path would leave only the last model on disk.
os.makedirs('models', exist_ok=True)
for fold, md in enumerate(ret_models):
    model_path = os.path.join('models', f'{prefix}lgb_fold{fold}.model')
    with open(model_path, 'wb') as handle:
        pickle.dump(md, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Display the averaged test predictions as a quick sanity check.
ret_test

I'll end this kernel with some ideas to improve it:
- Use K-Fold cross validation (this version of the notebook already does, with 5 folds).
- Try methods other than the mean for reducing the aggregated features to one per user (e.g. mode or median).
- Try other gradient boosting libraries like CatBoost or XGBoost.
- Add a temporal dimension to engineered features (e. g. # of items a user put up for sale *per day*).
- Add more advanced text features like pretrained word embeddings.
- Add image features. At the moment we completely ignore images! (as discussed [here](https://www.kaggle.com/c/avito-demand-prediction/discussion/56678), two promising approaches could be [NIMA: Neural Image Assessment](https://arxiv.org/abs/1709.05424) and [Multimedia Features for Click Prediction](https://storage.googleapis.com/kaggle-forum-message-attachments/328059/9411/dimitri-clickadvert.pdf)).
- Normalize text before creating the Tf-Idf matrix (e. g. using [stemming](http://www.nltk.org/howto/stem.html)).
- ~~Learn Russian and do in-depth text analysis.~~

Thanks for reading and have fun in this competition!