In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack
import xgboost as xgb
import gc
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import os
# print(os.listdir("../input"))

In [None]:
# Russian stop-word list from NLTK; passed to the TF-IDF vectorizer further down.
sw = stopwords.words("russian")

In [None]:
# Load the training and test tables from the competition input directory.
train, test = map(pd.read_csv, ("../input/train.csv", "../input/test.csv"))

In [None]:
# Author's notes, presumably the number of distinct values per column:
#   user_type: 3
#   parent_category_name: 9
#   region: 28
#   category_name: 47
#   city: 1733
#
# check NA before adding

# Columns that are one-hot encoded.
oneHot = [
    "parent_category_name",
    "user_type",
    "region",
    "category_name",
]

# Columns that are label-encoded.
categorical = [
    "user_id",
    "city",
]

# Columns that are label-encoded and additionally get an "is NA" indicator
# column so that missing values stay identifiable.
nullP = [
    "image_top_1",
    "param_1",
    "param_2",
    "param_3",
]

# Raw columns dropped before modelling.
dropOr = [
    "item_id",
    "title",
    "description",
]

In [None]:
# Number of training rows; used later to split the stacked matrix back into
# its train and test parts by position.
trainIndex = len(train)

# Separate the target from the training features.
train_y = train["deal_probability"]
train_x = train.drop(columns="deal_probability")

# Stack train and test so that all feature engineering is applied uniformly.
# `ignore_index=True` gives the stacked frame a fresh 0..n-1 index; without it
# the row labels of `train_x` and `test` overlap, and duplicate labels make any
# later label-aligned operation (boolean `.loc`, column-wise concat) fragile.
tr_te = pd.concat([train_x, test], axis=0, ignore_index=True)

In [None]:
# Parse the activation date once instead of once per derived feature.
activation = pd.to_datetime(tr_te['activation_date'])

# `Series.dt.week` was deprecated in pandas 1.1 and removed in pandas 2.0.
# Prefer `dt.isocalendar().week` (same ISO week number) when it is available
# and fall back to the legacy accessor on older pandas.
if hasattr(activation.dt, 'isocalendar'):
    week_of_year = activation.dt.isocalendar().week
    # isocalendar() returns a nullable UInt32 column; convert it to a plain
    # numpy dtype so that `tr_te.values` stays a numeric array later on.
    week_of_year = week_of_year.astype(
        'float64' if week_of_year.isna().any() else 'int64')
else:
    week_of_year = activation.dt.week

# Calendar features plus a combined free-text column (title + description).
tr_te = tr_te.assign(mon=activation.dt.month,
                     mday=activation.dt.day,
                     week=week_of_year,
                     wday=activation.dt.dayofweek,
                     txt=tr_te['title'].astype(str) + ' ' + tr_te['description'].astype(str))

# Release the temporaries; the derived columns now live in `tr_te`.
del activation, week_of_year

In [None]:
# The raw frames are no longer needed once they have been stacked into `tr_te`;
# drop the references and ask the garbage collector to reclaim the memory.
del train_x
del test
del train
gc.collect()

In [None]:
# Log-transform the price; the small offset keeps log() finite for a price of 0.
log_price = np.log(tr_te["price"] + 0.001)

# Impute missing prices with the mean of the log-prices. The result is assigned
# back explicitly: calling `fillna(..., inplace=True)` on `tr_te["price"]` is a
# chained in-place operation that may act on a temporary object and is
# deprecated in recent pandas versions.
tr_te["price"] = log_price.fillna(log_price.mean())
del log_price

# These raw columns are not used as model inputs.
tr_te = tr_te.drop(columns=["activation_date", "image"])

In [None]:
# Label-encode the nullable columns. Missing entries are overwritten with the
# sentinel -999 (the value later passed as `missing` to xgb.DMatrix), and a
# 0/1 indicator column named "isNA_<col>" is appended for each of them.
lbl = preprocessing.LabelEncoder()
for col in nullP:
    is_missing = tr_te[col].isnull()
    codes = lbl.fit_transform(tr_te[col].astype(str))
    # Keep the encoded label where a value was present, the sentinel otherwise.
    tr_te[col] = np.where(is_missing, -999, codes)
    # bool * 1 -> integer 0/1 flag, appended as the last column.
    tr_te["isNA_" + col] = is_missing * 1

In [None]:
# labelencoding
for col in categorical:
    # `fillna` returns a new Series, so its result has to be used; the original
    # call discarded it and missing values ended up encoded as the string
    # 'nan' instead of the intended 'Unknown' category.
    filled = tr_te[col].fillna('Unknown').astype(str)
    tr_te[col] = lbl.fit_transform(filled)

In [None]:
# one-hot
# Encode all one-hot columns in a single call: each source column is removed
# and its "<col>_<value>" indicator columns are appended after the remaining
# columns, in the order given by `oneHot`.
# The dtype is pinned to uint8 because pandas >= 2.0 defaults to bool
# indicators, and mixing bool with numeric columns turns `tr_te.values` into an
# object array that cannot be stacked with the sparse TF-IDF matrix.
tr_te = pd.get_dummies(tr_te, columns=oneHot, prefix=oneHot, dtype=np.uint8)

In [None]:
# Remove the raw identifier/text columns listed in `dropOr`.
tr_te.drop(columns=dropOr, inplace=True)

In [None]:
# Normalise the combined text: lower-case it, then replace every run of
# non-word characters, decimal digits and underscores with a single space.
# Python's `str.replace` is a literal substring replacement, so the original
# regex-looking arguments ("[^[:alpha:]]", "\\s+") never matched anything and
# the text was only lower-cased. The vectorised `.str.replace(..., regex=True)`
# applies a real regular expression; because whitespace is itself a non-word
# character, one pattern also collapses repeated whitespace.
tr_te['txt'] = (tr_te['txt']
                .str.lower()
                .str.replace(r"[\W\d_]+", " ", regex=True))

In [None]:
def tokenizeL(text):
    """Return the whitespace-separated tokens of ``str(text)`` as a list."""
    tokens = str(text).split()
    return tokens
    
# Unigram TF-IDF over the combined text column, limited to 5000 terms.
vec = TfidfVectorizer(
    ngram_range=(1, 1),
    stop_words=sw,
    min_df=5,
    max_df=0.3,
    sublinear_tf=True,
    norm='l2',
    max_features=5000,
)
m_tfidf = vec.fit_transform(tr_te['txt'])

# The raw text is no longer needed once it has been vectorised.
tr_te.drop(columns=['txt'], inplace=True)

In [None]:
# Put the tabular features and the TF-IDF matrix side by side in one CSR matrix.
data = hstack([tr_te.values, m_tfidf], format="csr")

# Both inputs are now contained in `data`; release them.
del m_tfidf, tr_te
gc.collect()

In [None]:
# Rows before `trainIndex` are the training rows; the remainder is the test set.
train = data[:trainIndex]
# -999 is the sentinel written into the nullable columns earlier.
dtest = xgb.DMatrix(data=data[trainIndex:], missing=-999)

del data
gc.collect()

In [None]:
# Hold out 10% of the training rows for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    train,
    train_y,
    test_size=0.1,
    random_state=5566,
)

# The unsplit matrix and target are no longer needed.
del train_y, train
gc.collect()

In [None]:
# Wrap both splits in DMatrix objects, treating the -999 sentinel as missing.
dtrain, deval = (
    xgb.DMatrix(data=features, label=target, missing=-999)
    for features, target in ((X_train, y_train), (X_valid, y_valid))
)
# Only the validation split is monitored during training.
watchlist = [(deval, 'eval')]

del y_valid, y_train, X_valid, X_train
gc.collect()

In [None]:
# XGBoost training configuration.
# NOTE: 'nrounds' is not a booster parameter; it is kept in this dict because
# the training cell reads it for `num_boost_round`.
Dparam = {'objective' : "reg:logistic",
          'booster' : "gbtree",
          'eval_metric' : "rmse",
          'nthread' : 4,
          'eta':0.05,
          'max_depth':18,
          'min_child_weight': 11,
          'gamma' :0,
          'subsample':0.8,
          'colsample_bytree':0.7,
          # L1 regularisation. The key was previously misspelled 'aplha', which
          # XGBoost does not recognise, so the penalty was never applied.
          'alpha':2.25,
          'lambda':0,
          'nrounds' : 5000}

In [None]:
# 'nrounds' is only the boosting-round budget, not a booster parameter, so it
# is kept out of the dict handed to XGBoost (which would otherwise report it as
# an unused parameter) and passed as `num_boost_round` instead.
booster_params = {key: value for key, value in Dparam.items() if key != 'nrounds'}

# Train with early stopping on the validation split: stop once the eval metric
# has not improved for 50 rounds; log progress every 10 rounds.
xgb_clf = xgb.train(params=booster_params,
                    dtrain=dtrain,
                    num_boost_round=Dparam['nrounds'],
                    early_stopping_rounds=50,
                    evals=watchlist,
                    verbose_eval=10)

In [None]:
# Fill the sample submission with the model's test predictions and save it.
submission = pd.read_csv("../input/sample_submission.csv")
submission = submission.assign(deal_probability=xgb_clf.predict(dtest))
submission.to_csv("xgb_2.csv", index=False)