<a href="https://colab.research.google.com/github/vvivvi/kaggle-c1/blob/master/Kaggle_C1_text_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd 
import sklearn
import scipy.sparse 
import lightgbm 

# Print the name and version of every core library used in this notebook.
for lib in (np, pd, scipy, sklearn, lightgbm):
    print(lib.__name__, lib.__version__)

numpy 1.18.1
pandas 0.25.3
scipy 1.4.1
sklearn 0.22.1
lightgbm 2.3.1


In [2]:
def write_predictions_by_array(array, filename):
    """Save predictions as a submission CSV inside ``DATA_FOLDER``.

    The values in ``array`` become the single column ``item_cnt_month`` and
    the frame's index is written out as the ``ID`` column.
    """
    submission = pd.DataFrame(array)
    submission.columns = ['item_cnt_month']
    target_path = os.path.join(DATA_FOLDER, filename)
    submission.to_csv(target_path, index_label='ID')

In [3]:
def clipped_rmse(gt, predicted, clip_min=0, clip_max=20):
    """Root-mean-squared error of ``predicted`` against the clipped ground truth.

    Parameters
    ----------
    gt : array-like
        Ground-truth values; clipped to ``[clip_min, clip_max]`` before scoring.
    predicted : array-like
        Predicted values, used as given (not clipped here).
    clip_min, clip_max : number
        Bounds applied to ``gt``.

    Returns
    -------
    float
        ``sqrt(mean((clip(gt) - predicted) ** 2))``.
    """
    target = np.clip(gt, clip_min, clip_max)
    squared_error = (target - predicted) ** 2
    # The mean has to be taken before the square root: sqrt(x**2).mean() is the
    # mean absolute error, not the RMSE.
    return np.sqrt(squared_error.mean())

In [4]:
from itertools import product
import gc
from tqdm import tqdm_notebook


def downcast_dtypes(df):
    """Shrink the 64-bit numeric columns of ``df`` to their 32-bit counterparts.

    ``float64`` columns become ``float32`` and ``int64`` columns become
    ``int32``; every other column is left alone. The columns are reassigned
    on ``df`` itself, which is also returned.
    """
    narrowing = (("float64", np.float32), ("int64", np.int32))

    # Pick the affected columns for each dtype before converting anything.
    plan = [
        ([name for name in df if df[name].dtype == wide], narrow)
        for wide, narrow in narrowing
    ]

    for names, narrow in plan:
        df[names] = df[names].astype(narrow)

    return df

In [5]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 

#!wget -O competitive-data-science-predict-future-sales.zip https://github.com/vvivvi/kaggle-c1/blob/master/competitive-data-science-predict-future-sales.zip?raw=true
#!mkdir competitive-data-science-predict-future-sales
#!unzip -o competitive-data-science-predict-future-sales.zip -d competitive-data-science-predict-future-sales

DATA_FOLDER = 'competitive-data-science-predict-future-sales'


def _read_competition_csv(csv_name):
    """Load one CSV file from ``DATA_FOLDER`` into a DataFrame."""
    return pd.read_csv(os.path.join(DATA_FOLDER, csv_name))


# Raw competition tables, read in the same order as before.
sales = _read_competition_csv('sales_train.csv')
items = _read_competition_csv('items.csv')
item_cats = _read_competition_csv('item_categories.csv')
shops = _read_competition_csv('shops.csv')
sample_submission = _read_competition_csv('sample_submission.csv')
test_spec = _read_competition_csv('test.csv')

#monthly_summary_train=sales[['date_block_num','shop_id','item_id','item_cnt_day']].groupby(['date_block_num','shop_id','item_id'], as_index=False).sum()
#monthly_summary_dec14 = monthly_summary_train[monthly_summary_train['date_block_num'] == 23]
#monthly_summary_oct15 = monthly_summary_train[monthly_summary_train['date_block_num'] == 33]

Plan for using textual features: 

Turn item names into TF-IDF transformed BOW-vectors. Cluster the vectors in order to get a new categorization of products. Actually, create two such clusterings (with 32 and 256 clusters) from unigrams and another two using 2-grams.


In [6]:
# Show the item table; its item_name column is the text corpus used below.
items

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40
...,...,...,...
22165,"Ядерный титбит 2 [PC, Цифровая версия]",22165,31
22166,Язык запросов 1С:Предприятия [Цифровая версия],22166,54
22167,Язык запросов 1С:Предприятия 8 (+CD). Хрустале...,22167,49
22168,Яйцо для Little Inu,22168,62


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = items['item_name'].values

# Unigram TF-IDF with sublinear term frequency; terms found in fewer than
# two item names are ignored (min_df=2).
vectorizer = TfidfVectorizer(min_df=2, sublinear_tf=True)
item_name_bow = vectorizer.fit_transform(corpus)
print(item_name_bow.shape)

# Same settings, but the vocabulary consists of word bigrams only.
vectorizer_bigram = TfidfVectorizer(min_df=2, sublinear_tf=True, ngram_range=(2, 2))
item_name_bow_bigram = vectorizer_bigram.fit_transform(corpus)
print(item_name_bow_bigram.shape)


(22170, 9530)
(22170, 13242)


In [8]:
from sklearn.cluster import KMeans



In addition to the TF-IDF representations that emphasize the terms distinctive to documents,
construct a representation from the most frequent words. Hand-pick from the most frequent terms
the ones that are likely to flag relevant product attributes.

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
# Binary presence indicators restricted to the 300 most frequent terms.
vectorizer_counts = CountVectorizer(max_features=300, binary=True)
count_matrix = vectorizer_counts.fit_transform(corpus)


In [10]:
# Per-term column sums of count_matrix. Summing the sparse matrix directly
# avoids building a dense copy first.
counts = np.asarray(count_matrix.sum(axis=0)).ravel()

# Term positions ordered from the most to the least frequent.
idx = np.argsort(-counts)

# Table of terms with their frequencies, written out for manual inspection.
df = pd.DataFrame({
    'term': np.array(vectorizer_counts.get_feature_names())[idx],
    'frequency': counts[idx],
})
df.to_csv('frequent_item_name_terms.csv')
         

In [11]:
# Fetch the hand-picked stopword list from the project repository (requires network access).
!wget -O stopwords_item_name.csv https://github.com/vvivvi/kaggle-c1/blob/master/stopwords_item_name.csv?raw=true
# Load the downloaded list and display it.
df_stop=pd.read_csv('stopwords_item_name.csv')
df_stop

--2020-03-18 11:10:40--  https://github.com/vvivvi/kaggle-c1/blob/master/stopwords_item_name.csv?raw=true
Selvitetään osoitetta github.com (github.com)... 140.82.118.3
Yhdistetään palvelimeen github.com (github.com)|140.82.118.3|:443... yhdistetty.
HTTP-pyyntö lähetetty, odotetaan vastausta... 302 Found
Sijainti: https://github.com/vvivvi/kaggle-c1/raw/master/stopwords_item_name.csv [seurataan]
--2020-03-18 11:10:41--  https://github.com/vvivvi/kaggle-c1/raw/master/stopwords_item_name.csv
Käytetään uudelleen yhteyttä github.com:443.
HTTP-pyyntö lähetetty, odotetaan vastausta... 302 Found
Sijainti: https://raw.githubusercontent.com/vvivvi/kaggle-c1/master/stopwords_item_name.csv [seurataan]
--2020-03-18 11:10:41--  https://raw.githubusercontent.com/vvivvi/kaggle-c1/master/stopwords_item_name.csv
Selvitetään osoitetta raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.84.133
Yhdistetään palvelimeen raw.githubusercontent.com (raw.githubusercontent.com)|151.101.84.133|:443...

Unnamed: 0.1,Unnamed: 0,term,frequency
0,0,версия,3519
1,7,the,1118
2,10,of,912
3,18,для,625
4,24,на,478
5,46,по,223
6,57,10,182
7,58,за,179
8,62,из,167
9,66,in,161


These are the rather arbitrarily hand-picked stopwords. We notice that their total number does not hugely mask the other frequent words. We would probably do just fine completely without a stopword list. However, since we already picked it, let's use it:

In [12]:
stopwords = df_stop['term'].tolist()

# The feature budget is the 300 terms minus the number of stopwords removed.
vectorizer_counts = CountVectorizer(
    stop_words=stopwords,
    max_features=300 - len(stopwords),
    binary=True,
)
count_matrix_frequent = vectorizer_counts.fit_transform(corpus)

Now create the six new clusterings of items with the k-means clustering algorithm:

In [13]:
from sklearn.cluster import KMeans

def _fit_item_clusters(features, n_clusters):
    """Fit k-means with ``n_clusters`` clusters on ``features`` and print a progress dot."""
    fitted = KMeans(n_clusters=n_clusters, random_state=123, n_jobs=-1).fit(features)
    print('.')
    return fitted


# Two granularities (32 and 256 clusters) for each of the three representations.
kmeans_bow32 = _fit_item_clusters(item_name_bow, 32)
kmeans_bow256 = _fit_item_clusters(item_name_bow, 256)

kmeans_bow_bigram32 = _fit_item_clusters(item_name_bow_bigram, 32)
kmeans_bow_bigram256 = _fit_item_clusters(item_name_bow_bigram, 256)

kmeans_bow_frequent32 = _fit_item_clusters(count_matrix_frequent, 32)
kmeans_bow_frequent256 = _fit_item_clusters(count_matrix_frequent, 256)



.
.
.
.
.
.


In [14]:
# Copy the item table and append one cluster-label column per fitted k-means
# model (assign returns a new frame, so `items` itself is left untouched).
items_clustered = items.assign(
    item_name_category_tfidf_unigram_32=kmeans_bow32.labels_,
    item_name_category_tfidf_unigram_256=kmeans_bow256.labels_,
    item_name_category_tfidf_bigram_32=kmeans_bow_bigram32.labels_,
    item_name_category_tfidf_bigram_256=kmeans_bow_bigram256.labels_,
    item_name_category_frequent_32=kmeans_bow_frequent32.labels_,
    item_name_category_frequent_256=kmeans_bow_frequent256.labels_,
)


In [15]:
import re

def cyrillic_fraction(str):
    """Return the share of characters of ``str`` that are in the range U+0400..U+04FF.

    An empty string yields 0.
    """
    total = len(str)
    if total == 0:
        return 0
    # Count everything outside the Cyrillic range and subtract its share from one.
    other = sum(1 for ch in str if not ('\u0400' <= ch <= '\u04FF'))
    return 1.0 - other / total

In [16]:
# Per-item share of Cyrillic characters in the item name, stored as an extra numeric column.
items_clustered['item_name_cyrillic_fraction']=items_clustered['item_name'].map(cyrillic_fraction)

In [17]:
# Persist the item table together with all derived text features. The path is
# built with os.path.join, like the other reads and writes under DATA_FOLDER.
items_clustered.to_csv(os.path.join(DATA_FOLDER, 'items_with_textual_categories.csv'))

In [18]:
# Distinct shops and items that appear in the test specification.
test_shops=test_spec['shop_id'].unique()
test_items=test_spec['item_id'].unique()
# Month index held out for validation.
date_block_val = 33
# Month index assigned to the test rows.
# NOTE(review): the commented-out code in the data-loading cell labels block 33
# as Oct 2015, so block 34 is skipped here; confirm that the one-month gap
# before the test block is intended.
date_block_test = 35 # Dec 2015
# Keep only sales from shops that occur in the test set.
sales = sales[sales['shop_id'].isin(test_shops)]

In [19]:
# Key for every test row in the form '<item_id>_<shop_id>'.
sr = test_spec['item_id'].astype(str) + '_' + test_spec['shop_id'].astype(str)

# Map each key to the row label of that pair in test_spec. Series.items()
# replaces the deprecated Series.iteritems(); should a key occur more than
# once, the last row wins, exactly as with the former explicit loop.
shop_and_item_id_to_test_id = {key: row_label for row_label, key in sr.items()}



In [20]:
# Create "grid" with columns
index_cols = ['shop_id', 'item_id', 'date_block_num']


def _sum_item_cnt(frame, by, name):
    """Total ``item_cnt_day`` of ``frame`` per ``by`` group, as a flat frame with column ``name``.

    Replaces the nested-dict renaming form of ``.agg`` (deprecated, and an
    error on pandas >= 1.0) with an explicit sum / rename / reset_index.
    """
    return frame.groupby(by)['item_cnt_day'].sum().rename(name).reset_index()


# For every month we create a grid from all shops/items combinations from that month
grid = []
for block_num in sales['date_block_num'].unique():
    in_block = sales['date_block_num'] == block_num  # row mask, computed once per month
    cur_shops = sales.loc[in_block, 'shop_id'].unique()
    cur_items = sales.loc[in_block, 'item_id'].unique()
    grid.append(np.array(list(product(cur_shops, cur_items, [block_num])), dtype='int32'))

# add test month
grid.append(np.array(list(product(test_shops, test_items, [date_block_test])), dtype='int32'))

# Turn the grid into a dataframe
grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

# Shop-item-month aggregates, joined to the grid; combinations without sales get 0
gb = _sum_item_cnt(sales, index_cols, 'target')
all_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

# Same as above but with shop-month aggregates
gb = _sum_item_cnt(sales, ['shop_id', 'date_block_num'], 'target_shop')
all_data = pd.merge(all_data, gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)

# Same as above but with item-month aggregates
gb = _sum_item_cnt(sales, ['item_id', 'date_block_num'], 'target_item')
all_data = pd.merge(all_data, gb, how='left', on=['item_id', 'date_block_num']).fillna(0)

# Downcast dtypes from 64 to 32 bit to save memory
all_data = downcast_dtypes(all_data)
del grid, gb
gc.collect();

in a future version.

For column-specific groupby renaming, use named aggregation

    >>> df.groupby(...).agg(name=('column', aggfunc))

  return super().aggregate(arg, *args, **kwargs)


In [21]:
# Attach the item attributes and text-derived categories to every grid row,
# then shrink any 64-bit columns the merge brought in.
all_data = downcast_dtypes(all_data.merge(items_clustered, on='item_id'))
gc.collect()


22

Aggregate 'target' by newly added item categories and shop-{category, item_id} combinations



In [22]:
def _add_target_sum(frame, keys, name):
    """Left-join onto ``frame`` the sum of ``target`` per ``keys`` group as column ``name``.

    Uses sum / rename / reset_index instead of the nested-dict renaming form
    of ``.agg``, which is deprecated and raises on pandas >= 1.0. Missing
    values in the merged frame are filled with 0 and 64-bit columns are
    downcast, as before.
    """
    sums = frame.groupby(keys)['target'].sum().rename(name).reset_index()
    merged = pd.merge(frame, sums, how='left', on=keys).fillna(0)
    return downcast_dtypes(merged)


# first aggregate by item categories as such

# Fixed before any 'target_category...' column is added, so the new aggregate
# columns are not themselves treated as categories.
category_cols = [col for col in all_data.columns if 'category' in col]

for column in category_cols:
    targetsuffix = column[column.find('category'):].replace('_id', '')
    print(column, "->", targetsuffix)
    all_data = _add_target_sum(all_data, ['date_block_num', column], 'target_' + targetsuffix)
    gc.collect()

# then add aggregates by shop-(additional variable) combinations
aux_vars = category_cols + ['item_id']
for column in aux_vars:
    start = column.find('category')
    # Columns without 'category' in their name (item_id) keep their full name.
    targetsuffix = column[start:] if start >= 0 else column
    targetsuffix = targetsuffix.replace('_id', '') + '_within_shop'
    print(column, "->", targetsuffix)
    all_data = _add_target_sum(all_data, ['shop_id', 'date_block_num', column], 'target_' + targetsuffix)
    gc.collect()

gc.collect();

item_category_id -> category
item_name_category_tfidf_unigram_32 -> category_tfidf_unigram_32
item_name_category_tfidf_unigram_256 -> category_tfidf_unigram_256
item_name_category_tfidf_bigram_32 -> category_tfidf_bigram_32
item_name_category_tfidf_bigram_256 -> category_tfidf_bigram_256
item_name_category_frequent_32 -> category_frequent_32
item_name_category_frequent_256 -> category_frequent_256
item_category_id -> category_within_shop
item_name_category_tfidf_unigram_32 -> category_tfidf_unigram_32_within_shop
item_name_category_tfidf_unigram_256 -> category_tfidf_unigram_256_within_shop
item_name_category_tfidf_bigram_32 -> category_tfidf_bigram_32_within_shop
item_name_category_tfidf_bigram_256 -> category_tfidf_bigram_256_within_shop
item_name_category_frequent_32 -> category_frequent_32_within_shop
item_name_category_frequent_256 -> category_frequent_256_within_shop
item_id -> item_within_shop


In [23]:
# Persist the full feature table. The path is built with os.path.join, like
# the other reads and writes under DATA_FOLDER.
all_data.to_csv(os.path.join(DATA_FOLDER, 'all_data_with_category_targets.csv'))

In [None]:
# Non-index columns whose name contains 'target': the candidates for lagging.
non_index_cols = all_data.columns.difference(index_cols)
cols_to_rename = [name for name in non_index_cols if 'target' in name]
cols_to_rename

In [None]:
# List of columns that we will use to create lags
cols_to_rename = [col for col in all_data.columns.difference(index_cols) if 'target' in col]

shift_range = [2, 3, 5, 12]
# the shortest lag is 2: there's a one month gap before the test period

for month_shift in tqdm_notebook(shift_range):
    train_shift = all_data[index_cols + cols_to_rename].copy()

    # Moving the month forward makes each row line up with the month that
    # lies `month_shift` blocks after it.
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift

    lagged_names = {col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_rename}
    train_shift = train_shift.rename(columns=lagged_names)

    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

del train_shift
gc.collect()

# Don't use old data from year 2013
all_data = all_data[all_data['date_block_num'] >= 12]

# List of all lagged features
lag_suffixes = {str(shift) for shift in shift_range}
fit_cols = [col for col in all_data.columns if col.split('_')[-1] in lag_suffixes]
# We will drop these at fitting stage
to_drop_cols = list(set(all_data.columns) - (set(fit_cols) | set(index_cols))) + ['date_block_num']

# Category for each item
item_category_mapping = items[['item_id', 'item_category_id']].drop_duplicates()

if 'item_category_id' in all_data.columns:
    # The category is already present, so merging it again would only create
    # duplicate 'item_category_id_x' / 'item_category_id_y' columns. Just
    # rebuild the frame with a fresh 0..n-1 index, as the merge would have.
    all_data = all_data.reset_index(drop=True)
else:
    all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
all_data = downcast_dtypes(all_data)
gc.collect();

In [None]:
# List all columns of the feature table after the lag features were added.
all_data.columns.values

In [None]:
dates = all_data['date_block_num']

# Boolean row selectors for the individual splits.
in_train = dates < date_block_val
in_val = dates == date_block_val
in_trainval = dates < date_block_test
in_test = dates == date_block_test

dates_train = dates[in_train]
dates_val = dates[in_val]
dates_test = dates[in_test]

# Only columns whose name matches one of these patterns are kept as features;
# everything else (plus 'item_name') is dropped.
feature_patterns = ('lag', '_id', 'fraction')
to_drop_cols = [
    col for col in all_data.columns.values
    if not any(re.search(pattern, col) for pattern in feature_patterns)
] + ['item_name']

X_train = all_data.loc[in_train].drop(to_drop_cols, axis=1)
X_val = all_data.loc[in_val].drop(to_drop_cols, axis=1)
X_trainval = all_data.loc[in_trainval].drop(to_drop_cols, axis=1)
X_test = all_data.loc[in_test].drop(to_drop_cols, axis=1)

# Targets are clipped to the range [0, 20].
y_train = np.clip(all_data.loc[in_train, 'target'].values, 0, 20)
y_trainval = np.clip(all_data.loc[in_trainval, 'target'].values, 0, 20)
y_val = np.clip(all_data.loc[in_val, 'target'].values, 0, 20)

In [None]:
# Inspect which columns remain as model features after dropping to_drop_cols.
X_train.columns

In [None]:
import lightgbm as lgb
from sklearn.metrics import r2_score

# LightGBM settings shared by both fits in this cell.
lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':2
              }
# Fit on the training months for 100 boosting rounds and score on the validation month;
# predictions are clipped to [0, 20] before scoring.
model = lgb.train(lgb_params, lgb.Dataset(X_train, label=y_train), 100)
pred_lgb_val = np.clip(model.predict(X_val), 0, 20)
print('Validation R-squared for LightGBM is %f' % r2_score(y_val, pred_lgb_val))
print('Clipped RMSE of lgb predictions is ', clipped_rmse(y_val, pred_lgb_val))

# Refit on training + validation months with the same settings and write the test predictions.
model = lgb.train(lgb_params, lgb.Dataset(X_trainval, label=y_trainval), 100)
pred_lgb_test = np.clip(model.predict(X_test), 0, 20)
write_predictions_by_array(pred_lgb_test, 'submission-lgb-trainval-all-ext-rich-categories.csv')
# LB score 0.949 / 0.953
# ext: 0.943/0.


In [None]:
# Display the training feature matrix.
X_train

In [None]:
# Install CatBoost, which is needed by the following cells.
# NOTE(review): no version is pinned here; consider pinning one for reproducibility.
!pip install catboost

In [None]:
import catboost

from catboost import CatBoostRegressor, Pool

# Fit CatBoost on the training months (100 iterations, otherwise default settings).
model=CatBoostRegressor(iterations=100)
model.fit(X_train, y_train, verbose=1)
# Clip the predictions to [0, 20], the range already applied to y_val and to
# the submitted predictions, so the validation score reflects what is submitted.
pred_catboost_val = np.clip(model.predict(X_val.values), 0, 20)

print('Validation R-squared for catboost is %f' % r2_score(y_val, pred_catboost_val))



In [None]:
# Refit CatBoost on training + validation months and write the clipped test predictions.
model=CatBoostRegressor(iterations=100)
model.fit(X_trainval, y_trainval, verbose=1)
pred_catboost_test = model.predict(X_test)
# Predictions are clipped to [0, 20] before being written.
write_predictions_by_array(np.clip(pred_catboost_test,0,20), 'submission-catboost-trainval-ext.csv')

# LB score 0.986/0.996


In [None]:
import lightgbm as lgb

# LightGBM settings for the validation-month-only experiment below.
lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':2
              }

# Fit on the validation month only and write clipped test predictions.
model = lgb.train(lgb_params, lgb.Dataset(X_val, label=y_val), 100)
pred_lgb_test = np.clip(model.predict(X_test), 0, 20)
write_predictions_by_array(pred_lgb_test, 'submission-lgb-valonly.csv')
# LB score 1.10/1.10
