## Stage 1

In [None]:
import pandas as pd
import numpy as np
from IPython.display import display
from copy import deepcopy as cp
import gc; gc.enable()

In [None]:
train_usecols = ['region', 'city', 'parent_category_name', 'category_name', 'title', 'description', 'deal_probability']
test_usecols = cp(train_usecols)
test_usecols.remove('deal_probability')

In [None]:
train_df = pd.read_csv("data/train.csv", usecols=train_usecols)
test_df = pd.read_csv("data/test.csv", usecols=test_usecols)
print("Train file rows and columns are : ", train_df.shape)
print("Test file rows and columns are : ", test_df.shape)

In [None]:
test_df['deal_probability'] = np.zeros((test_df.shape[0],))
all_df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)

In [None]:
display(train_df.head(5))
display(test_df.head(5))

In [None]:
def print_contains_info(df, col1, col2):
    str1s = df[col1].values
    str2s = df[col2].values
    
    contains_count = 0
    for i in range(len(str1s)):
        str1 = str(str1s[i])
        str2 = str(str2s[i])
        str1 = str1.split(" ")
        str2 = str2.split(" ")
        
        for s in str1:
            if s in str2:
                contains_count += 1
                break
                
    print('{} in {} contains counts:\n'.format(col1, col2), contains_count)

In [None]:
print_contains_info(all_df, 'region', 'title')
print_contains_info(all_df, 'city', 'title')
print_contains_info(all_df, 'parent_category_name', 'title')
print_contains_info(all_df, 'category_name', 'title')
print_contains_info(all_df, 'region', 'description')
print_contains_info(all_df, 'city', 'description')
print_contains_info(all_df, 'parent_category_name', 'description')
print_contains_info(all_df, 'category_name', 'description')
print_contains_info(all_df, 'title', 'description')

In [None]:
def get_contains_feature(df, col1, col2):
    print('processing ', col1, col2)
    res_df = pd.DataFrame()
    str1s = df[col1].values
    str2s = df[col2].values
    
    is_contains = []
    contains_counts = []
    for i in range(len(str1s)):
        str1 = str(str1s[i])
        str2 = str(str2s[i])
        str1 = str1.split(" ")
        str2 = str2.split(" ")
        
        contains_count = 0
        for s in str1:
            if s in str2:
                contains_count += 1
                
        is_contains.append(1 if contains_count > 0 else 0)
        contains_counts.append(contains_count)
        
    res_df['{}_in_{}'.format(col1,col2)] = is_contains
    res_df['{}_in_{}_counts'.format(col1,col2)] = contains_counts
    del is_contains, contains_counts; gc.collect()
    return res_df 

In [None]:
all_df = pd.concat([all_df, get_contains_feature(all_df, 'region', 'title')],  axis=1)
all_df = pd.concat([all_df, get_contains_feature(all_df, 'city', 'title')],  axis=1)
all_df = pd.concat([all_df, get_contains_feature(all_df, 'parent_category_name', 'title')],  axis=1)
all_df = pd.concat([all_df, get_contains_feature(all_df, 'category_name', 'title')],  axis=1)
all_df = pd.concat([all_df, get_contains_feature(all_df, 'region', 'description')],  axis=1)
all_df = pd.concat([all_df, get_contains_feature(all_df, 'city', 'description')],  axis=1)
all_df = pd.concat([all_df, get_contains_feature(all_df, 'parent_category_name', 'description')],  axis=1)
all_df = pd.concat([all_df, get_contains_feature(all_df, 'category_name', 'description')],  axis=1)
all_df = pd.concat([all_df, get_contains_feature(all_df, 'title', 'description')],  axis=1)

In [None]:
norm_cols = [c for c in all_df.columns if 'counts' in c]
for col in norm_cols:
    all_df[col] = (all_df[col] - all_df[col].mean())/all_df[col].std() 

In [None]:
text_feature = all_df['region'] + ' ' + all_df['city'] + ' ' + \
                all_df['parent_category_name'] + ' ' + all_df['category_name'] + ' ' + \
                all_df['title'] + ' ' + all_df['description']

In [None]:
all_df = all_df.drop(test_usecols, axis=1)
gc.collect()

In [None]:
all_df.to_csv('text_other_features.csv', index=False)

In [None]:
stopwords = ['а', 'е', 'и', 'ж', 'м', 'о', 'на', 'не', 'ни', 'об', 'но', 'он', 'мне', 'мои', 'мож', 'она', 'они', 'оно', 'мной', 'много', 'многочисленное', 'многочисленная', 'многочисленные', 'многочисленный', 'мною', 'мой', 'мог', 'могут', 'можно', 'может', 'можхо', 'мор', 'моя', 'моё', 'мочь', 'над', 'нее', 'оба', 'нам', 'нем', 'нами', 'ними', 'мимо', 'немного', 'одной', 'одного', 'менее', 'однажды', 'однако', 'меня', 'нему', 'меньше', 'ней', 'наверху', 'него', 'ниже', 'мало', 'надо', 'один', 'одиннадцать', 'одиннадцатый', 'назад', 'наиболее', 'недавно', 'миллионов', 'недалеко', 'между', 'низко', 'меля', 'нельзя', 'нибудь', 'непрерывно', 'наконец', 'никогда', 'никуда', 'нас', 'наш', 'нет', 'нею', 'неё', 'них', 'мира', 'наша', 'наше', 'наши', 'ничего', 'начала', 'нередко', 'несколько', 'обычно', 'опять', 'около', 'мы', 'ну', 'нх', 'от', 'отовсюду', 'особенно', 'нужно', 'очень', 'отсюда', 'в', 'во', 'вон', 'вниз', 'внизу', 'вокруг', 'вот', 'восемнадцать', 'восемнадцатый', 'восемь', 'восьмой', 'вверх', 'вам', 'вами', 'важное', 'важная', 'важные', 'важный', 'вдали', 'везде', 'ведь', 'вас', 'ваш', 'ваша', 'ваше', 'ваши', 'впрочем', 'весь', 'вдруг', 'вы', 'все', 'второй', 'всем', 'всеми', 'времени', 'время', 'всему', 'всего', 'всегда', 'всех', 'всею', 'всю', 'вся', 'всё', 'всюду', 'г', 'год', 'говорил', 'говорит', 'года', 'году', 'где', 'да', 'ее', 'за', 'из', 'ли', 'же', 'им', 'до', 'по', 'ими', 'под', 'иногда', 'довольно', 'именно', 'долго', 'позже', 'более', 'должно', 'пожалуйста', 'значит', 'иметь', 'больше', 'пока', 'ему', 'имя', 'пор', 'пора', 'потом', 'потому', 'после', 'почему', 'почти', 'посреди', 'ей', 'два', 'две', 'двенадцать', 'двенадцатый', 'двадцать', 'двадцатый', 'двух', 'его', 'дел', 'или', 'без', 'день', 'занят', 'занята', 'занято', 'заняты', 'действительно', 'давно', 'девятнадцать', 'девятнадцатый', 'девять', 'девятый', 'даже', 'алло', 'жизнь', 'далеко', 'близко', 'здесь', 'дальше', 'для', 'лет', 'зато', 'даром', 'первый', 'перед', 'затем', 'зачем', 'лишь', 'десять', 'десятый', 'ею', 'её', 'их', 'бы', 'еще', 'при', 'был', 'про', 'процентов', 'против', 'просто', 'бывает', 'бывь', 'если', 'люди', 'была', 'были', 'было', 'будем', 'будет', 'будете', 'будешь', 'прекрасно', 'буду', 'будь', 'будто', 'будут', 'ещё', 'пятнадцать', 'пятнадцатый', 'друго', 'другое', 'другой', 'другие', 'другая', 'других', 'есть', 'пять', 'быть', 'лучше', 'пятый', 'к', 'ком', 'конечно', 'кому', 'кого', 'когда', 'которой', 'которого', 'которая', 'которые', 'который', 'которых', 'кем', 'каждое', 'каждая', 'каждые', 'каждый', 'кажется', 'как', 'какой', 'какая', 'кто', 'кроме', 'куда', 'кругом', 'с', 'т', 'у', 'я', 'та', 'те', 'уж', 'со', 'то', 'том', 'снова', 'тому', 'совсем', 'того', 'тогда', 'тоже', 'собой', 'тобой', 'собою', 'тобою', 'сначала', 'только', 'уметь', 'тот', 'тою', 'хорошо', 'хотеть', 'хочешь', 'хоть', 'хотя', 'свое', 'свои', 'твой', 'своей', 'своего', 'своих', 'свою', 'твоя', 'твоё', 'раз', 'уже', 'сам', 'там', 'тем', 'чем', 'сама', 'сами', 'теми', 'само', 'рано', 'самом', 'самому', 'самой', 'самого', 'семнадцать', 'семнадцатый', 'самим', 'самими', 'самих', 'саму', 'семь', 'чему', 'раньше', 'сейчас', 'чего', 'сегодня', 'себе', 'тебе', 'сеаой', 'человек', 'разве', 'теперь', 'себя', 'тебя', 'седьмой', 'спасибо', 'слишком', 'так', 'такое', 'такой', 'такие', 'также', 'такая', 'сих', 'тех', 'чаще', 'четвертый', 'через', 'часто', 'шестой', 'шестнадцать', 'шестнадцатый', 'шесть', 'четыре', 'четырнадцать', 'четырнадцатый', 'сколько', 'сказал', 'сказала', 'сказать', 'ту', 'ты', 'три', 'эта', 'эти', 'что', 'это', 'чтоб', 'этом', 'этому', 'этой', 'этого', 'чтобы', 'этот', 'стал', 'туда', 'этим', 'этими', 'рядом', 'тринадцать', 'тринадцатый', 'этих', 'третий', 'тут', 'эту', 'суть', 'чуть', 'тысяч']

In [None]:
def clean_text(txt):
    words = str(txt).split()
    words = [wrd for wrd in words if wrd not in stopwords]
    words = [wrd for wrd in words if len(wrd) > 1]
    txt = " ".join(words)
    return txt

In [None]:
text_feature = text_feature.apply(lambda x: clean_text(x))

In [None]:
gc.collect()

In [None]:
import pickle

In [None]:
with open('text_feature.pickle', 'wb') as handle:
    pickle.dump(text_feature, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Stage 2

In [None]:
import pickle
import gc; gc.enable()
with open('text_feature.pickle', 'rb') as handle:
    text_feature = pickle.load(handle)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
tf_idf_features = TfidfVectorizer(ngram_range=(1,2), analyzer='word', norm='l2').fit_transform(text_feature.values)
with open('tf_idf_word_features.pickle', 'wb') as handle:
    pickle.dump(tf_idf_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
tf_idf_features = (tf_idf_features > 0).astype(int)
with open('tf_idf_bin_word_features.pickle', 'wb') as handle:
    pickle.dump(tf_idf_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
del tf_idf_features; gc.collect()

In [None]:
tf_idf_char_features = TfidfVectorizer(ngram_range=(2,2), analyzer='char', norm='l2').fit_transform(text_feature.values)
with open('tf_idf_char_features.pickle', 'wb') as handle:
    pickle.dump(tf_idf_char_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
tf_idf_char_features = (tf_idf_char_features > 0).astype(int)
with open('tf_idf_char_bin_features.pickle', 'wb') as handle:
    pickle.dump(tf_idf_char_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
del tf_idf_char_features; gc.collect()

## Stage 3: Concat them all

In [None]:
from scipy.sparse import hstack, csr_matrix
import pickle
import gc; gc.enable()
from sklearn.decomposition import TruncatedSVD

In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv('text_other_features.csv')
df.drop('deal_probability', axis=1, inplace=True)
df.head(3)

In [None]:
with open('tf_idf_word_features.pickle', 'rb') as handle:
    tf_idf_word_features = pickle.load(handle)
    
with open('tf_idf_bin_word_features.pickle', 'rb') as handle:
    tf_idf_word_bin_features = pickle.load(handle)
    
with open('tf_idf_char_features.pickle', 'rb') as handle:
    tf_idf_char_features = pickle.load(handle)

In [None]:
tf_idf_features = hstack([csr_matrix(df.values), tf_idf_word_features, tf_idf_word_bin_features, 
                            tf_idf_char_features]).tocsr()

with open('text_all_features.pickle', 'wb') as handle:
    pickle.dump(tf_idf_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Stage 4: Perform truncated SVD on all text features

In [None]:
import pickle
import pandas as pd
import numpy as np
import gc; gc.enable()
from sklearn.decomposition import TruncatedSVD

In [None]:
with open('text_all_features.pickle', 'rb') as handle:
    tf_idf_features = pickle.load(handle)

tf_idf_features.shape

In [None]:
tf_idf_features = tf_idf_features[:, np.array(np.clip(tf_idf_features.getnnz(axis=0) - 5, 0, 1), dtype=bool)]; gc.collect()
tf_idf_features.shape

In [None]:
tf_idf_features = TruncatedSVD(n_components=100, random_state=719, algorithm='arpack').fit_transform(tf_idf_features)
print(tf_idf_features.shape)

In [None]:
with open('text_all_features_svd.pickle', 'wb') as handle:
    pickle.dump(tf_idf_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Stage 5: Fit Models

In [1]:
import pickle
import pandas as pd
import numpy as np
import gc; gc.enable()
from sklearn.decomposition import TruncatedSVD

In [2]:
with open('text_all_features_svd.pickle', 'rb') as handle:
    tf_idf_features = pickle.load(handle)

In [3]:
train_df = pd.read_csv("data/train.csv", usecols=['deal_probability'])
train_len = train_df.shape[0]
train_y = train_df['deal_probability'].values
del train_df
print(train_len)

1503424


In [4]:
print(len(train_y))

1503424


In [5]:
tf_idf_train_features = tf_idf_features[:train_len, :] 
tf_idf_test_features = tf_idf_features[train_len:, :] 

In [6]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, GridSearchCV, ParameterGrid, train_test_split
from sklearn.metrics import mean_squared_error, make_scorer
from copy import deepcopy as cp

In [7]:
def clip_rmse(ground_truth, predictions):
    predictions = np.clip(predictions, 0., 1.)
    return mean_squared_error(ground_truth, predictions)**.5

clip_rmse_scorer = make_scorer(clip_rmse, greater_is_better=False)

In [8]:
kfold = KFold(n_splits=3, random_state=719)

#### FM_FTRL

In [9]:
from wordbatch.models import FM_FTRL

In [10]:
def simple_train_test_eval(default_params, X, y, params):
    tr_X, val_X, tr_y, val_y = train_test_split(X, y, test_size=0.3, random_state=719)
    
    min_score = None
    best_param = None
    
    for param in list(ParameterGrid(params)):
        use_params = cp(default_params)
        use_params.update(param)
        print('Fitting params:\n', use_params)
        md = FM_FTRL(**use_params)
        md.fit(tr_X, tr_y)
        score = clip_rmse(val_y, md.predict(val_X))
        print(param, score)
        
        if min_score is None or score < min_score:
            best_param = param
            min_score = score
            
    print('Best param:', best_param, '\nscore:', min_score)

In [13]:
fmftrl_default_params = {
    'alpha': .01,
    'beta': .005,
    'L1': 0.0001,
    'L2': 0.1,
    'D': tf_idf_train_features.shape[1],
    'D_fm': 20,
    'iters': 5,
    'seed': 719,
    'threads': 4,
    'verbose': 0
}

try_params = {
    'L1': [0.1, 0.01, 0.001, 0.0001],
    'L2': [0.1, 0.01, 0.001, 0.0001]
}
simple_train_test_eval(fmftrl_default_params, tf_idf_train_features, train_y, try_params)

Fitting params:
 {'alpha': 0.01, 'beta': 0.005, 'L1': 0.1, 'L2': 0.1, 'D': 100, 'D_fm': 20, 'iters': 5, 'seed': 719, 'threads': 4, 'verbose': 0}
{'L1': 0.1, 'L2': 0.1} 0.240243186272
Fitting params:
 {'alpha': 0.01, 'beta': 0.005, 'L1': 0.1, 'L2': 0.01, 'D': 100, 'D_fm': 20, 'iters': 5, 'seed': 719, 'threads': 4, 'verbose': 0}
{'L1': 0.1, 'L2': 0.01} 0.240105686946
Fitting params:
 {'alpha': 0.01, 'beta': 0.005, 'L1': 0.1, 'L2': 0.001, 'D': 100, 'D_fm': 20, 'iters': 5, 'seed': 719, 'threads': 4, 'verbose': 0}
{'L1': 0.1, 'L2': 0.001} 0.239828209177
Fitting params:
 {'alpha': 0.01, 'beta': 0.005, 'L1': 0.1, 'L2': 0.0001, 'D': 100, 'D_fm': 20, 'iters': 5, 'seed': 719, 'threads': 4, 'verbose': 0}
{'L1': 0.1, 'L2': 0.0001} 0.239985170951
Fitting params:
 {'alpha': 0.01, 'beta': 0.005, 'L1': 0.01, 'L2': 0.1, 'D': 100, 'D_fm': 20, 'iters': 5, 'seed': 719, 'threads': 4, 'verbose': 0}
{'L1': 0.01, 'L2': 0.1} 0.239949009058
Fitting params:
 {'alpha': 0.01, 'beta': 0.005, 'L1': 0.01, 'L2': 0.01,

In [14]:
fmftrl_default_params = {
    'alpha': .01,
    'beta': .005,
    'L1': 0.0001,
    'L2': 0.0001,
    'D': tf_idf_train_features.shape[1],
    'D_fm': 20,
    'iters': 5,
    'seed': 719,
    'threads': 4,
    'verbose': 0
}

try_params = [
    {
        'alpha': [0.1],
        'beta': [0.05]
    },
    {
        'alpha': [0.01],
        'beta': [0.005]
    },
    {
        'alpha': [0.001],
        'beta': [0.0005]
    },
    {
        'alpha': [0.0001],
        'beta': [0.00005]
    }
]
simple_train_test_eval(fmftrl_default_params, tf_idf_train_features, train_y, try_params)

Fitting params:
 {'alpha': 0.1, 'beta': 0.05, 'L1': 0.0001, 'L2': 0.0001, 'D': 100, 'D_fm': 20, 'iters': 5, 'seed': 719, 'threads': 4, 'verbose': 0}
{'alpha': 0.1, 'beta': 0.05} 0.240259657599
Fitting params:
 {'alpha': 0.01, 'beta': 0.005, 'L1': 0.0001, 'L2': 0.0001, 'D': 100, 'D_fm': 20, 'iters': 5, 'seed': 719, 'threads': 4, 'verbose': 0}
{'alpha': 0.01, 'beta': 0.005} 0.239758109829
Fitting params:
 {'alpha': 0.001, 'beta': 0.0005, 'L1': 0.0001, 'L2': 0.0001, 'D': 100, 'D_fm': 20, 'iters': 5, 'seed': 719, 'threads': 4, 'verbose': 0}
{'alpha': 0.001, 'beta': 0.0005} 0.240187278081
Fitting params:
 {'alpha': 0.0001, 'beta': 5e-05, 'L1': 0.0001, 'L2': 0.0001, 'D': 100, 'D_fm': 20, 'iters': 5, 'seed': 719, 'threads': 4, 'verbose': 0}
{'alpha': 0.0001, 'beta': 5e-05} 0.240750027105
Best param: {'alpha': 0.01, 'beta': 0.005} 
score: 0.239758109829


In [15]:
fmftrl_default_params = {
    'alpha': .01,
    'beta': .005,
    'L1': 0.0001,
    'L2': 0.0001,
    'D': tf_idf_train_features.shape[1],
    'D_fm': 20,
    'iters': 5,
    'seed': 719,
    'threads': 4,
    'verbose': 0
}

try_params = {
        'D_fm': [20, 40, 60, 80, 100],
        'iters': [5, 10, 15, 20]
    }
simple_train_test_eval(fmftrl_default_params, tf_idf_train_features, train_y, try_params)

Fitting params:
 {'alpha': 0.01, 'beta': 0.005, 'L1': 0.0001, 'L2': 0.0001, 'D': 100, 'D_fm': 20, 'iters': 5, 'seed': 719, 'threads': 4, 'verbose': 0}
{'D_fm': 20, 'iters': 5} 0.239758109829
Fitting params:
 {'alpha': 0.01, 'beta': 0.005, 'L1': 0.0001, 'L2': 0.0001, 'D': 100, 'D_fm': 20, 'iters': 10, 'seed': 719, 'threads': 4, 'verbose': 0}
{'D_fm': 20, 'iters': 10} 0.239291765534
Fitting params:
 {'alpha': 0.01, 'beta': 0.005, 'L1': 0.0001, 'L2': 0.0001, 'D': 100, 'D_fm': 20, 'iters': 15, 'seed': 719, 'threads': 4, 'verbose': 0}
{'D_fm': 20, 'iters': 15} 0.239057437083
Fitting params:
 {'alpha': 0.01, 'beta': 0.005, 'L1': 0.0001, 'L2': 0.0001, 'D': 100, 'D_fm': 20, 'iters': 20, 'seed': 719, 'threads': 4, 'verbose': 0}
{'D_fm': 20, 'iters': 20} 0.238886569832
Fitting params:
 {'alpha': 0.01, 'beta': 0.005, 'L1': 0.0001, 'L2': 0.0001, 'D': 100, 'D_fm': 40, 'iters': 5, 'seed': 719, 'threads': 4, 'verbose': 0}
{'D_fm': 40, 'iters': 5} 0.240080806835
Fitting params:
 {'alpha': 0.01, 'beta'

In [None]:
fmftrl_default_params = {
    'alpha': .01,
    'beta': .005,
    'L1': 0.0001,
    'L2': 0.0001,
    'D': tf_idf_train_features.shape[1],
    'D_fm': 20,
    'iters': 5,
    'seed': 719,
    'threads': 4,
    'verbose': 0
}

try_params = {
        'D_fm': [60, 80, 100, 120],
        'iters': [20, 40, 60]
    }
simple_train_test_eval(fmftrl_default_params, tf_idf_train_features, train_y, try_params)

#### Ridge

In [9]:
md = Ridge(random_state=719, solver='sag')

params = {
    'alpha': [0.1, 1.0, 2.0],
}

gs = GridSearchCV(md, params, scoring=clip_rmse_scorer, cv=kfold, n_jobs=1, verbose=10)
gs.fit(tf_idf_train_features, train_y)
print(gs.best_params_, gs.best_score_)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] alpha=0.1 .......................................................
[CV] ............ alpha=0.1, score=-0.24091356494121902, total=  30.0s
[CV] alpha=0.1 .......................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   30.3s remaining:    0.0s


[CV] ............ alpha=0.1, score=-0.24093005881913193, total=  30.3s
[CV] alpha=0.1 .......................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.0min remaining:    0.0s


[CV] ............ alpha=0.1, score=-0.24181288314665775, total=  28.7s
[CV] alpha=1.0 .......................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.5min remaining:    0.0s


[CV] ............ alpha=1.0, score=-0.24091357123186008, total=  29.0s
[CV] alpha=1.0 .......................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.0min remaining:    0.0s


[CV] ............ alpha=1.0, score=-0.24093006272763165, total=  24.4s
[CV] alpha=1.0 .......................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.4min remaining:    0.0s


[CV] ............ alpha=1.0, score=-0.24181288861936515, total=  33.1s
[CV] alpha=2.0 .......................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  3.0min remaining:    0.0s


[CV] ............. alpha=2.0, score=-0.2409135782229551, total=  29.3s
[CV] alpha=2.0 .......................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  3.5min remaining:    0.0s


[CV] ............ alpha=2.0, score=-0.24093006707199455, total=  24.5s
[CV] alpha=2.0 .......................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  3.9min remaining:    0.0s


[CV] ............ alpha=2.0, score=-0.24181289470174847, total=  27.7s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  4.3min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  4.3min finished


{'alpha': 0.1} -0.241218835433


In [10]:
md = Ridge(random_state=719, solver='sag')

params = {
    'alpha': [.05, .1, .15, .3, .5],
}

gs = GridSearchCV(md, params, scoring=clip_rmse_scorer, cv=kfold, n_jobs=1, verbose=10)
gs.fit(tf_idf_train_features, train_y)
print(gs.best_params_, gs.best_score_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] alpha=0.05 ......................................................
[CV] ........... alpha=0.05, score=-0.24091356459176744, total=  29.9s
[CV] alpha=0.05 ......................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   30.4s remaining:    0.0s


[CV] ............ alpha=0.05, score=-0.2409300586020397, total=  25.2s
[CV] alpha=0.05 ......................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   55.9s remaining:    0.0s


[CV] ............ alpha=0.05, score=-0.2418128828426633, total=  27.8s
[CV] alpha=0.1 .......................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.4min remaining:    0.0s


[CV] ............ alpha=0.1, score=-0.24091356494121902, total=  35.0s
[CV] alpha=0.1 .......................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.0min remaining:    0.0s


[CV] ............ alpha=0.1, score=-0.24093005881913193, total=  24.8s
[CV] alpha=0.1 .......................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.4min remaining:    0.0s


[CV] ............ alpha=0.1, score=-0.24181288314665775, total=  27.7s
[CV] alpha=0.15 ......................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  2.9min remaining:    0.0s


[CV] ........... alpha=0.15, score=-0.24091356529063682, total=  29.1s
[CV] alpha=0.15 ......................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  3.4min remaining:    0.0s


[CV] ............ alpha=0.15, score=-0.2409300590362524, total=  29.6s
[CV] alpha=0.15 ......................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  3.9min remaining:    0.0s


[CV] ............. alpha=0.15, score=-0.241812883450657, total=  27.9s
[CV] alpha=0.3 .......................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  4.3min remaining:    0.0s


[CV] ............. alpha=0.3, score=-0.2409135663390319, total=  29.5s
[CV] alpha=0.3 .......................................................
[CV] ............... alpha=0.3, score=-0.24093005968757, total=  24.8s
[CV] alpha=0.3 .......................................................
[CV] ............ alpha=0.3, score=-0.24181288436271664, total=  33.3s
[CV] alpha=0.5 .......................................................
[CV] ............ alpha=0.5, score=-0.24091356773688535, total=  29.5s
[CV] alpha=0.5 .......................................................
[CV] ............ alpha=0.5, score=-0.24093006055606417, total=  24.6s
[CV] alpha=0.5 .......................................................
[CV] ............ alpha=0.5, score=-0.24181288557881492, total=  27.7s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  7.2min finished


{'alpha': 0.05} -0.241218835142


#### XGB

In [9]:
import xgboost as xgb

In [10]:
def simple_train_test_eval(model, X, y, params):
    tr_X, val_X, tr_y, val_y = train_test_split(X, y, test_size=0.3, random_state=719)
    
    min_score = None
    best_param = None
    
    for param in list(ParameterGrid(params)):
        md = cp(model)
        old_params = md.get_params()
        old_params.update(param)
        print('Fitting params:\n', old_params)
        md.set_params(**old_params)
        md.fit(tr_X, tr_y)
        score = clip_rmse(val_y, md.predict(val_X))
        print(param, score)
        
        if min_score is None or score < min_score:
            best_param = param
            min_score = score
            
    print('Best param:', best_param, '\nscore:', min_score)

In [11]:
md = xgb.XGBRegressor(random_state=719, subsample=.8, colsample_bytree=.8, n_estimators=100, 
                    max_depth=6, n_jobs=4, min_child_weight=1, gamma=0,
                    objective='reg:logistic')

params = {
    'booster': ['dart', 'gbtree']
}

simple_train_test_eval(md, tf_idf_train_features, train_y, params)

Fitting params:
 {'base_score': 0.5, 'booster': 'dart', 'colsample_bylevel': 1, 'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 6, 'min_child_weight': 1, 'missing': None, 'n_estimators': 100, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.8}
{'booster': 'dart'} 0.237842611006
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 6, 'min_child_weight': 1, 'missing': None, 'n_estimators': 100, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.8}
{'booster': 'gbtree'} 0.237784995836
Best param: {'booster': 'gbtree'} 
score: 0.237784995836


In [16]:
md = xgb.XGBRegressor(random_state=719, subsample=.8, colsample_bytree=.8, n_estimators=100, 
                    max_depth=6, n_jobs=4, min_child_weight=1, gamma=0, booster='gbtree',
                    objective='reg:logistic')

params = {
    'n_estimators': [40, 60, 80, 100, 120]
}

simple_train_test_eval(md, tf_idf_train_features, train_y, params)

Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 6, 'min_child_weight': 1, 'missing': None, 'n_estimators': 40, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.8}
{'n_estimators': 40} 0.239735984725
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 6, 'min_child_weight': 1, 'missing': None, 'n_estimators': 60, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.8}
{'n_estimators': 60} 0.238620725891
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'c

In [17]:
md = xgb.XGBRegressor(random_state=719, subsample=.8, colsample_bytree=.8, n_estimators=120, 
                    max_depth=6, n_jobs=4, min_child_weight=1, gamma=0, booster='gbtree',
                    objective='reg:logistic')

params = {
    'max_depth': [4, 6, 8, 10, 12]
}

simple_train_test_eval(md, tf_idf_train_features, train_y, params)

Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 1, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.8}
{'max_depth': 4} 0.238942151296
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 6, 'min_child_weight': 1, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.8}
{'max_depth': 6} 0.237603242063
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsamp

In [12]:
md = xgb.XGBRegressor(random_state=719, subsample=.8, colsample_bytree=.8, n_estimators=120, 
                    max_depth=10, n_jobs=4, min_child_weight=1, gamma=0, booster='gbtree',
                    objective='reg:logistic')

params = {
    'min_child_weight': [1, 2, 4, 8],
    'gamma': [0, 0.1, 0.2, 0.4]
}

simple_train_test_eval(md, tf_idf_train_features, train_y, params)

Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 1, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.8}
{'gamma': 0, 'min_child_weight': 1} 0.236363524498
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 2, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.8}
{'gamma': 0, 'min_child_weight': 2} 0.236358908135
Fitting params:
 {'base_score': 0.5, 'booster': 'g

In [13]:
md = xgb.XGBRegressor(random_state=719, subsample=.8, colsample_bytree=.8, n_estimators=120, 
                    max_depth=10, n_jobs=4, min_child_weight=8, gamma=0.2, booster='gbtree',
                    objective='reg:logistic')

params = {
    'subsample': [1., .8, .6, .4, .2],
    'colsample_bytree': [1., .8, .6, .4, .2]
}

simple_train_test_eval(md, tf_idf_train_features, train_y, params)

Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 1.0, 'gamma': 0.2, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 1.0}
{'colsample_bytree': 1.0, 'subsample': 1.0} 0.236278803474
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 1.0, 'gamma': 0.2, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.8}
{'colsample_bytree': 1.0, 'subsample': 0.8} 0.236316606787
Fitting params:
 {'base_score'

{'colsample_bytree': 0.4, 'subsample': 0.6} 0.236532930435
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.4, 'gamma': 0.2, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.4}
{'colsample_bytree': 0.4, 'subsample': 0.4} 0.236823774288
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.4, 'gamma': 0.2, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.2}
{'colsample_bytree': 0.4, 'sub

In [15]:
md = xgb.XGBRegressor(random_state=719, subsample=.8, colsample_bytree=.8, n_estimators=120, 
                    max_depth=10, n_jobs=4, min_child_weight=8, gamma=0.2, booster='gbtree',
                    objective='reg:logistic')

params = {
    'subsample': [1., .95, .9, .85],
    'colsample_bytree': [.95, .9, .85, .8, .75, .7, .65]
}

simple_train_test_eval(md, tf_idf_train_features, train_y, params)

Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.95, 'gamma': 0.2, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 1.0}
{'colsample_bytree': 0.95, 'subsample': 1.0} 0.236227971495
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.95, 'gamma': 0.2, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.95}
{'colsample_bytree': 0.95, 'subsample': 0.95} 0.23623979139
Fitting params:
 {'base_s

{'colsample_bytree': 0.75, 'subsample': 1.0} 0.236244632736
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.75, 'gamma': 0.2, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.95}
{'colsample_bytree': 0.75, 'subsample': 0.95} 0.236195506828
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.75, 'gamma': 0.2, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.9}
{'colsample_bytree': 0.7

In [16]:
md = xgb.XGBRegressor(random_state=719, subsample=.95, colsample_bytree=.65, n_estimators=120, 
                    max_depth=10, n_jobs=4, min_child_weight=8, gamma=0.2, booster='gbtree',
                    objective='reg:logistic')

params = {
    'reg_alpha': [1., .8, .6, .4, .2, .0],
    'reg_lambda': [1., .8, .6, .4, .2, .0]
}

simple_train_test_eval(md, tf_idf_train_features, train_y, params)

Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.65, 'gamma': 0.2, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 1.0, 'reg_lambda': 1.0, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.95}
{'reg_alpha': 1.0, 'reg_lambda': 1.0} 0.23624219451
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.65, 'gamma': 0.2, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 1.0, 'reg_lambda': 0.8, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.95}
{'reg_alpha': 1.0, 'reg_lambda': 0.8} 0.236246422688
Fitting params:
 {'base_score':

{'reg_alpha': 0.6, 'reg_lambda': 0.0} 0.236237412838
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.65, 'gamma': 0.2, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0.4, 'reg_lambda': 1.0, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.95}
{'reg_alpha': 0.4, 'reg_lambda': 1.0} 0.236226789748
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.65, 'gamma': 0.2, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0.4, 'reg_lambda': 0.8, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.95}
{'reg_alpha': 0.4, 'reg_lambda

{'reg_alpha': 0.0, 'reg_lambda': 0.2} 0.236334168732
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.65, 'gamma': 0.2, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.95}
{'reg_alpha': 0.0, 'reg_lambda': 0.0} 0.236286693527
Best param: {'reg_alpha': 0.6, 'reg_lambda': 1.0} 
score: 0.236200425001


In [17]:
md = xgb.XGBRegressor(random_state=719, subsample=.95, colsample_bytree=.65, n_estimators=120, 
                    max_depth=10, n_jobs=4, min_child_weight=8, gamma=0.2, booster='gbtree',
                    objective='reg:logistic')

params = {
    'reg_alpha': [.75, .7, .65, .6, .55, .5, .45],
    'reg_lambda': [1., .95, .9, .85]
}

simple_train_test_eval(md, tf_idf_train_features, train_y, params)

Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.65, 'gamma': 0.2, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0.75, 'reg_lambda': 1.0, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.95}
{'reg_alpha': 0.75, 'reg_lambda': 1.0} 0.236255114834
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.65, 'gamma': 0.2, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0.75, 'reg_lambda': 0.95, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.95}
{'reg_alpha': 0.75, 'reg_lambda': 0.95} 0.236208395349
Fitting params:
 {'base_

{'reg_alpha': 0.55, 'reg_lambda': 1.0} 0.236290117561
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.65, 'gamma': 0.2, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0.55, 'reg_lambda': 0.95, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.95}
{'reg_alpha': 0.55, 'reg_lambda': 0.95} 0.236238058874
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.65, 'gamma': 0.2, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0.55, 'reg_lambda': 0.9, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.95}
{'reg_alpha': 0.55, 'reg

In [18]:
md = xgb.XGBRegressor(random_state=719, subsample=.95, colsample_bytree=.65, n_estimators=120, 
                    max_depth=10, n_jobs=4, min_child_weight=8, gamma=0.2, booster='gbtree',
                    objective='reg:logistic', reg_alpha=.6, reg_lambda=1.)

params = {
    'n_estimators': [120, 160, 200, 300, 400],
    'learning_rate': [0.1, .05, .001]
}

simple_train_test_eval(md, tf_idf_train_features, train_y, params)

Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.65, 'gamma': 0.2, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 120, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0.6, 'reg_lambda': 1.0, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.95}
{'learning_rate': 0.1, 'n_estimators': 120} 0.236200425001
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.65, 'gamma': 0.2, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 160, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0.6, 'reg_lambda': 1.0, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.95}
{'learning_rate': 0.1, 'n_estimators': 160} 0.236083775525
Fitting params:
 {

KeyboardInterrupt: 

In [11]:
md = xgb.XGBRegressor(random_state=719, subsample=.95, colsample_bytree=.65, n_estimators=120, 
                    max_depth=10, n_jobs=4, min_child_weight=8, gamma=0.2, booster='gbtree',
                    objective='reg:logistic', reg_alpha=.6, reg_lambda=1.)

params = [
    {
        'n_estimators': [300, 400],
        'learning_rate': [.05]
    },
    {
        'n_estimators': [300, 400],
        'learning_rate': [.001]
    }
]

simple_train_test_eval(md, tf_idf_train_features, train_y, params)

Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.65, 'gamma': 0.2, 'learning_rate': 0.05, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 300, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0.6, 'reg_lambda': 1.0, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.95}
{'learning_rate': 0.05, 'n_estimators': 300} 0.235845330163
Fitting params:
 {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.65, 'gamma': 0.2, 'learning_rate': 0.05, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 8, 'missing': None, 'n_estimators': 400, 'n_jobs': 4, 'nthread': None, 'objective': 'reg:logistic', 'random_state': 719, 'reg_alpha': 0.6, 'reg_lambda': 1.0, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.95}
{'learning_rate': 0.05, 'n_estimators': 400} 0.235708395236
Fitting params

## Stage 6  Gen OOF Predictions and Top 10 TFIDF Features

In [7]:
from sklearn.linear_model import Ridge
import xgboost as xgb
from sklearn.model_selection import KFold, GridSearchCV, ParameterGrid, train_test_split
from sklearn.metrics import mean_squared_error, make_scorer
from copy import deepcopy as cp
import pickle
import gc; gc.enable()
from sklearn.decomposition import TruncatedSVD
import pandas as pd
import numpy as np

In [3]:
kf = KFold(random_state=719, n_splits=5)

In [5]:
def gen_oof_predictions(model, X, y, test_X):
    oof_pred = np.zeros((X.shape[0],))
    oof_test_pred = np.zeros((test_X.shape[0],))
    
    counter = 1
    for train_index, val_index in kf.split(X):
        md = cp(model)
        md.fit(X[train_index], y[train_index])
        oof_pred[val_index] = md.predict(X[val_index])
        oof_test_pred += md.predict(test_X)
        counter += 1
        
    oof_test_pred /= counter
    return {
        'oof_pred': oof_pred,
        'oof_test_pred': oof_test_pred
    }

In [8]:
with open('text_all_features_svd.pickle', 'rb') as handle:
    tf_idf_features = pickle.load(handle)

train_df = pd.read_csv("data/train.csv", usecols=['deal_probability'])
train_len = train_df.shape[0]
train_y = train_df['deal_probability'].values
del train_df
print(train_len)

tf_idf_train_features = tf_idf_features[:train_len,:]
tf_idf_test_features = tf_idf_features[train_len:,:]

1503424


In [10]:
model_list = [('ridge', Ridge(alpha=.05)),
              ('xgb', xgb.XGBRegressor(random_state=719, subsample=.95, colsample_bytree=.65, 
                                       learning_rate=0.05, n_estimators=120, 
                                       max_depth=10, n_jobs=4, min_child_weight=8, gamma=0.2, booster='gbtree',
                                       objective='reg:logistic', reg_alpha=.6, reg_lambda=1.))]

oof_df = pd.DataFrame()
oof_test_df = pd.DataFrame()
for model_name, model in model_list:
    res = gen_oof_predictions(model, tf_idf_train_features, train_y, tf_idf_test_features)
    oof_df[model_name + '_oof_pred'] = res['oof_pred']
    oof_test_df[model_name + '_oof_pred'] = res['oof_test_pred']
    
oof_df.to_csv('text_oof_pred.csv', index=False)
oof_test_df.to_csv('text_oof_pred_test.csv', index=False)

In [11]:
with open('text_all_features.pickle', 'rb') as handle:
    tf_idf_features = pickle.load(handle)
    
tf_idf_features = tf_idf_features[:, np.array(np.clip(tf_idf_features.getnnz(axis=0) - 5, 0, 1), dtype=bool)]; gc.collect()

top_N=10
tf_idf_features = TruncatedSVD(n_components=top_N, random_state=719, algorithm='arpack').fit_transform(tf_idf_features)


TypeError: __init__() got an unexpected keyword argument 'column'

In [12]:
feature_cols = ['f_'+str(i+1) for i in range(top_N)]
pd.DataFrame(data=tf_idf_features[:train_len,:], columns=feature_cols).to_csv('tf_idf_top_{}_train.csv'.format(top_N), index=False)
pd.DataFrame(data=tf_idf_features[train_len:,:], columns=feature_cols).to_csv('tf_idf_top_{}_test.csv'.format(top_N), index=False)