In [5]:
import pandas as pd
import numpy as np
import gc
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
#from matplotlib_venn import venn2, venn2_circles
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import scipy
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.linear_model import Ridge
from sklearn.cross_validation import KFold
import time

NFOLDS = 5
SEED = 42

In [2]:
#import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from scipy.sparse import csr_matrix, hstack
import lightgbm as lgb
import gc
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import re

import os

def bm25(corpus,b,k1, stopword):
    CV = CountVectorizer(ngram_range=(1,1), stop_words = stopword, min_df=5,max_df=0.3)
    IDFTrans = TfidfTransformer(norm='l2')
    
    output = CV.fit_transform(corpus)
    IDFTrans.fit(output)
    feature_names = CV.get_feature_names()
    temp = output.copy()
    
    aveL = output.sum()/output.shape[0]
    denominator = k1 * ((1-b)+b*(output.sum(1)/aveL))
    
    temp.data = temp.data/temp.data
    temp = csr_matrix.multiply(temp,denominator)
    
    temp += output
    output *= (k1+1)

    temp.data = 1/temp.data
    output = csr_matrix.multiply(output,temp)
    
    output = IDFTrans.transform(output)
    
    return output, feature_names
	
def cleanName(text):
    try:
        textProc = text.lower()
        textProc = re.sub('[!@#$_“”¨«»®´·º½¾¿¡§£₤‘’]', '', textProc)
        textProc = " ".join(textProc.split())
        return textProc
    except: 
        return "name error"
		

sw = stopwords.words('russian')

print("Loading data")
train =pd.read_csv("/home/g492652607/data/train.csv") 
test =pd.read_csv("/home/g492652607/data/test.csv")
ntrain = train.shape[0]
ntest = test.shape[0]

categorical = ["user_id","city","parent_category_name","user_type","region","category_name"] # labelencoding
nullP = ["image_top_1","param_1","param_2","param_3"] # labelencoding with NA (add an indicator to identify whether it is NA)
isNA = [] # indicator of NA
dropOr = ["item_id","title","description"] # to drop

trainIndex=train.shape[0]
train_y = train.deal_probability
train_x = train.drop(columns="deal_probability")

tr_te = pd.concat([train_x,test],axis=0)

print("Feature engineering")
tr_te = tr_te.assign(mon=lambda x: pd.to_datetime(x['activation_date']).dt.month,
                     mday=lambda x: pd.to_datetime(x['activation_date']).dt.day,
                     week=lambda x: pd.to_datetime(x['activation_date']).dt.week,
                     wday=lambda x:pd.to_datetime(x['activation_date']).dt.dayofweek,
                     txt=lambda x:(x['title'].astype(str)+' '+x['description'].astype(str)))

del train, test, train_x
gc.collect()

tr_te["price"] = np.log(tr_te["price"]+0.001)
tr_te["price"].fillna(tr_te.price.mean(),inplace=True)

tr_te.drop(["activation_date","image"],axis=1,inplace=True)

# labelencoding with NA
lbl = preprocessing.LabelEncoder()
for col in nullP:
    toApp = tr_te[col].isnull()
    tr_te[col].fillna("Unknown",inplace = True)
    tr_te[col] = lbl.fit_transform(tr_te[col].astype(str))
    toApp *= 1
    theName = "isNA_" + col
    isNA.append(theName)
    tr_te = pd.concat([tr_te,toApp.rename(theName)],axis=1)

# labelencoding
for col in categorical:
    tr_te[col].fillna('Unknown')
    tr_te[col] = lbl.fit_transform(tr_te[col].astype(str))
	
tr_te.drop(labels=dropOr,axis=1,inplace=True)

tr_te.loc[:,'txt']=tr_te.txt.apply(lambda x:x.lower().replace("[^[:alpha:]]"," ").replace("\\s+", " "))
tr_te['txt'] = tr_te['txt'].apply(lambda x: cleanName(x))

print("Processing text")

m_tfidf, tfidf_feature = bm25(tr_te.txt,0.75,2,stopword=sw)

tr_te.drop(labels=['txt'],inplace=True,axis=1)

feature_list = tr_te.columns.values.tolist()
feature_list.extend(tfidf_feature)
categorical.extend(nullP)
categorical.extend(isNA)

data  = hstack((tr_te.values,m_tfidf)).tocsr()

del tr_te,m_tfidf
gc.collect()

dtest = data[trainIndex:]
train = data[:trainIndex]

del data
gc.collect()

Loading data
Feature engineering
Processing text


0

In [3]:
def cross_validate_lgb(params, x_train, y_train, x_test, kf, cat_cols=[],
                       verbose=True, verbose_eval=50, use_cat=True, use_rank=False):
    start_time = time.time()
    train_pred = np.zeros((ntrain))
    test_pred = np.zeros((ntest))

    if len(cat_cols)==0: use_cat=False

    # use the k-fold object to enumerate indexes for each training and validation fold
    for i, (train_index, val_index) in enumerate(kf): # folds 1, 2 ,3 ,4, 5
        # example: training from 1,2,3,4; validation from 5
        print('\nFold {}'.format(i))
        x_train_kf, x_val_kf = x_train[train_index], x_train[val_index]
        y_train_kf, y_val_kf = y_train[train_index], y_train[val_index]

        if use_cat:
            lgb_train = lgb.Dataset(x_train_kf, y_train_kf, feature_name=feature_list,categorical_feature=cat_cols)
            lgb_val = lgb.Dataset(x_val_kf, y_val_kf, reference=lgb_train, feature_name=feature_list,categorical_feature=cat_cols)
        else:
            lgb_train = lgb.Dataset(x_train_kf, y_train_kf, feature_name=feature_list)
            lgb_val = lgb.Dataset(x_val_kf, y_val_kf, reference=lgb_train, feature_name=feature_list)

        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=4000,
                        valid_sets=lgb_val,
                        early_stopping_rounds=30,
                        verbose_eval=verbose_eval)

        val_pred = gbm.predict(x_val_kf)

        if use_rank:
            train_pred[val_index] += probability_to_rank(val_pred)
            test_pred += probability_to_rank(gbm.predict(x_test))
            # test_pred += gbm.predict(x_test)
        else:
            train_pred[val_index] += val_pred
            test_pred += gbm.predict(x_test)

        # test_pred += gbm.predict(x_test)
        rms = sqrt(mean_squared_error(y_val_kf.values, val_pred))
        if verbose:
            print('fold cv {} RMSE score is {:.6f}'.format(i, rms))

    test_pred /=NFOLDS

    cv_rms = sqrt(mean_squared_error(y_train, train_pred))
    if verbose:
        print('cv RMSE score is {:.6f}'.format(cv_rms))
        end_time = time.time()
        print("it takes %.3f seconds to perform cross validation" % (end_time - start_time))
    return train_pred.reshape(-1, 1),test_pred.reshape(-1, 1)

In [7]:
kf = KFold(ntrain, n_folds=NFOLDS, random_state=SEED)
lgb_params =  {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 270,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.75,
    'bagging_freq': 2,
    'learning_rate': 0.016,
    'verbose': 0
}  
gbm_oof_train, gbm_oof_test=cross_validate_lgb(lgb_params,train, train_y, dtest, kf, cat_cols=categorical, use_cat=True, 
                            verbose_eval=False, use_rank=False)


gbm_preds = np.concatenate([gbm_oof_train, gbm_oof_test])


Fold 0




fold cv 0 RMSE score is 0.218641

Fold 1
fold cv 1 RMSE score is 0.218914

Fold 2
fold cv 2 RMSE score is 0.218556

Fold 3
fold cv 3 RMSE score is 0.218676

Fold 4
fold cv 4 RMSE score is 0.218772
cv RMSE score is 0.218712
it takes 20275.585 seconds to perform cross validation


In [13]:
subm = pd.read_csv('/home/g492652607/data/sample_submission.csv')
subm['deal_probability'] = np.clip(gbm_oof_test, 0, 1)
subm.to_csv('testbm25.csv', index=False)

In [14]:
blend = pd.read_csv('/home/g492652607/data/blend.csv') 
idx=blend['item_id']
sub = pd.DataFrame(gbm_preds,columns=['gbm_pred'])
sub1 = pd.DataFrame(idx,columns=['item_id'])
sub1=sub1.set_index(sub.index)
gb=pd.concat([sub1,sub],axis=1)
gb.to_csv('gbm_bm25.csv', index=False)

In [12]:
gb.to_csv('bm25.csv', index=False)

In [16]:
gb.shape

(2011862, 2)