In [1]:
import numpy as np
import pandas as pd
import os
import gc
import re
import string
import time
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from scipy.sparse import hstack, csr_matrix

import nltk
from nltk.corpus import stopwords 
import wordbatch
from wordbatch.extractors import WordBag
from wordbatch.models import FM_FTRL

from myutils import reduce_mem_usage


nltk.download('stopwords')
lentrain = 1503424
#stopwords_kernel = {x: 1 for x in stopwords.words('russian')}
stopwords_kernel = list(set(stopwords.words('russian')))
non_alphanums = re.compile(u'[^A-Za-z0-9]+')
non_alphanumpunct = re.compile(u'[^A-Za-z0-9\.?!,; \(\)\[\]\'\"\$]+')
RE_PUNCTUATION = '|'.join([re.escape(x) for x in string.punctuation])
stop_words = list(set(stopwords.words('russian')))
russian_stop = set(stopwords.words('russian'))
punctuation = string.punctuation

textfeats = ['title', 'description']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\osk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def normalize_text(text):
    text = text.lower().strip()
    for s in string.punctuation:
        text = text.replace(s, ' ')
    text = text.strip().split(' ')
    return u' '.join(x for x in text if len(x) > 1 and x not in stopwords_kernel)

def rmse(predicted, actual):
    return np.sqrt(((predicted - actual) ** 2).mean())

def cleanName(text):
    try:
        textProc = text.lower()
        # textProc = " ".join(map(str.strip, re.split('(\d+)',textProc)))
        #regex = re.compile(u'[^[:alpha:]]')
        #textProc = regex.sub(" ", textProc)
        textProc = re.sub('[!@#$_“”¨«»®´·º½¾¿¡§£₤‘’]', '', textProc)
        textProc = " ".join(textProc.split())
        return textProc
    except: 
        return "name error"

In [5]:
train = pd.read_csv('../input/train.csv', usecols=['description', 'title'])
test = pd.read_csv('../input/test.csv', usecols=['description', 'title'])

print(train.shape)
print(test.shape)

(1503424, 2)
(508438, 2)


In [25]:
%%time
df = pd.concat([train, test])
print(df.shape)

(2011862, 2)
Wall time: 67.2 ms


In [26]:
for col in textfeats:
    df[col] = df[col].astype(str)
    df[col] = df[col].fillna('missing')
    df[col + '_titleword_count'] = df[col].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
    df[col + '_upper_case_word_count'] = df[col].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

    df[col] = df[col].str.lower()
    df[col + '_num_stopwords'] = df[col].apply(lambda x: len([wrd for wrd in x.split() if wrd.lower() in stop_words]))
    df[col + '_num_punctuations'] = df[col].apply(lambda x: len("".join(_ for _ in x if _ in punctuation)))
    df[col + '_num_alphabets'] = df[col].apply(lambda comment: len([c for c in comment if c.isupper()]))
    df[col + '_num_digits'] = df[col].apply(lambda comment: (comment.count('[0-9]')))
    df[col + '_num_chars'] = df[col].apply(len) # Count number of Characters
    df[col + '_num_words'] = df[col].apply(lambda comment: len(comment.split())) # Count number of Words
    df[col + '_num_unique_words'] = df[col].apply(lambda comment: len(set(w for w in comment.split())))
    df[col + '_chars_by_words'] = df[col + '_num_chars'] / (df[col + '_num_words'] + 1)
    df[col + '_words_by_uniquewords'] = df[col + '_num_unique_words'] / (df[col+'_num_words'] + 1)
    df[col + '_punctuations_by_chars'] = df[col+'_num_punctuations'] / (df[col + '_num_chars'] + 1)
    df[col + '_punctuations_by_words'] = df[col + '_num_punctuations'] / (df[col + '_num_words'] + 1)
    df[col + '_digits_by_chars'] = df[col + '_num_digits'] / (df[col + '_num_chars'] + 1)
    df[col + '_alphabets_by_chars'] = df[col + '_num_alphabets'] / (df[col + '_num_chars'] + 1)
    df[col + '_stopwords_by_words'] = df[col + '_num_stopwords'] / (df[col + '_num_words'] + 1)
    df[col + '_mean'] = df[col].apply(lambda x: 0 if len(x) == 0 else float(len(x.split())) / len(x)) * 10
print(df.columns)
df['title_description_len_ratio'] = (df['title_num_chars'].astype(np.float)) / (df['description_num_chars'].astype(np.float) + 1)
df.head()

Index(['title', 'description', 'title_titleword_count',
       'title_upper_case_word_count', 'title_num_stopwords',
       'title_num_punctuations', 'title_num_alphabets', 'title_num_digits',
       'title_num_chars', 'title_num_words', 'title_num_unique_words',
       'title_chars_by_words', 'title_words_by_uniquewords',
       'title_punctuations_by_chars', 'title_punctuations_by_words',
       'title_digits_by_chars', 'title_alphabets_by_chars',
       'title_stopwords_by_words', 'title_mean', 'description_titleword_count',
       'description_upper_case_word_count', 'description_num_stopwords',
       'description_num_punctuations', 'description_num_alphabets',
       'description_num_digits', 'description_num_chars',
       'description_num_words', 'description_num_unique_words',
       'description_chars_by_words', 'description_words_by_uniquewords',
       'description_punctuations_by_chars',
       'description_punctuations_by_words', 'description_digits_by_chars',
       'des

Unnamed: 0,title,description,title_titleword_count,title_upper_case_word_count,title_num_stopwords,title_num_punctuations,title_num_alphabets,title_num_digits,title_num_chars,title_num_words,...,description_num_unique_words,description_chars_by_words,description_words_by_uniquewords,description_punctuations_by_chars,description_punctuations_by_words,description_digits_by_chars,description_alphabets_by_chars,description_stopwords_by_words,description_mean,title_description_len_ratio
0,кокоби(кокон для сна),"кокон для сна малыша,пользовались меньше месяц...",0,0,1,2,0,0,21,3,...,7,7.25,0.875,0.033898,0.25,0.0,0.0,0.125,1.206897,0.355932
1,стойка для одежды,"стойка для одежды, под вешалки. с бутика.",2,0,1,0,0,0,17,3,...,7,5.125,0.875,0.071429,0.375,0.0,0.0,0.375,1.707317,0.404762
2,philips bluray,"в хорошем состоянии, домашний кинотеатр с blu ...",1,0,0,0,0,0,14,2,...,17,5.5,0.944444,0.05,0.277778,0.0,0.0,0.222222,1.717172,0.14
3,автокресло,продам кресло от0-25кг,1,0,0,0,0,0,10,1,...,3,5.5,0.75,0.043478,0.25,0.0,0.0,0.0,1.363636,0.434783
4,"ваз 2110, 2003",все вопросы по телефону.,0,1,0,1,0,0,14,3,...,4,4.8,0.8,0.04,0.2,0.0,0.0,0.4,1.666667,0.56


In [28]:
print(df.shape)
df = df.drop(['title', 'description'], axis=1)
df = reduce_mem_usage(df)

(2011862, 37)
Memory usage of dataframe is 592.57 MB
Memory usage after optimization is: 233.78 MB
Decreased by 60.5%


In [31]:
train = df[:lentrain]
test = df[lentrain:]
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
print(train.shape)
print(test.shape)

train.to_feather('../features/train/textfeatures_train.feather')
test.to_feather('../features/test/textfeatures_test.feather')


(1503424, 35)
(508438, 35)


In [3]:
# Wordbatch
train = pd.read_csv('../input/train.csv', usecols=['description', 'title'])
test = pd.read_csv('../input/test.csv', usecols=['description', 'title'])

for col in textfeats:
    train[col] = train[col].astype(str)
    train[col] = train[col].fillna('missing')
    train[col] = train[col].str.lower()
    train[col] = train[col].apply(lambda x: cleanName(x))
for col in textfeats:
    test[col] = test[col].astype(str)
    test[col] = test[col].fillna('missing')
    test[col] = test[col].str.lower()
    test[col] = test[col].apply(lambda x: cleanName(x))

print(train.shape)
print(test.shape)

(1503424, 2)
(508438, 2)


In [None]:
%%time
wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2,
                                                              "hash_ngrams_weights": [1.5, 1.0],
                                                              "hash_size": 2 ** 29,
                                                              "norm": None,
                                                              "tf": 'binary',
                                                              "idf": None,
                                                              }), procs=8)
wb.dictionary_freeze = True
X_name_train = wb.fit_transform(train['title'])
print(X_name_train.shape)
X_name_test = wb.transform(test['title'])
print(X_name_test.shape)
del(wb)
gc.collect()

Normalize text
Parallelization fail. Method: multiprocessing Task: <function batch_normalize_texts at 0x000001D382BA5D08>
Retrying, attempt: 1 timeout limit: 1200 seconds


In [4]:
%%time
mask = np.where(X_name_train.getnnz(axis=0) > 3)[0]
X_name_train = X_name_train[:, mask]
print(X_name_train.shape)
X_name_test = X_name_test[:, mask]
print(X_name_test.shape)

(1503424, 152827)
(508438, 152827)
Wall time: 9.17 s


In [3]:
# From pickle file
with open('./wordbatch_title_train.pickle', 'rb') as f:
    X_name_train = pickle.load(f)
with open('./wordbatch_title_test.pickle', 'rb') as f:
    X_name_test = pickle.load(f)

print(X_name_train.shape)
print(X_name_test.shape)

(1503424, 167044)
(508438, 167044)


In [4]:
y = pd.read_csv('../input/train.csv', usecols=['deal_probability'])

X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(X_name_train,
                                                              y,
                                                              test_size = 0.5,
                                                              shuffle = False)
model = Ridge(solver="sag", fit_intercept=True, random_state=42, alpha=5)
model.fit(X_train_1, y_train_1)
train_ridge = model.predict(X_name_train)
test_ridge = model.predict(X_name_test)
print(rmse(model.predict(X_train_2), y_train_2))

model = Ridge(solver="sag", fit_intercept=True, random_state=4882, alpha=5)
model.fit(X_train_2, y_train_2)
train_ridge += model.predict(X_name_train)
test_ridge += model.predict(X_name_test)
print(rmse(model.predict(X_train_1), y_train_1))

train_ridge /= 2.0
test_ridge /= 2.0

deal_probability    0.235494
dtype: float64
deal_probability    0.235297
dtype: float64


In [5]:
train_ridgedf = pd.DataFrame()
train_ridgedf['wordbach_title_ridge'] = train_ridge.flatten()
test_ridgedf = pd.DataFrame()
test_ridgedf['wordbach_title_ridge'] = test_ridge.flatten()

print(train_ridgedf.shape)
print(test_ridgedf.shape)

train_ridgedf.head()

(1503424, 1)
(508438, 1)


Unnamed: 0,wordbach_title_ridge
0,0.061688
1,0.255343
2,0.155275
3,0.417688
4,0.361951


In [6]:
train_ridgedf.reset_index(drop=True, inplace=True)
test_ridgedf.reset_index(drop=True, inplace=True)

train_ridgedf = reduce_mem_usage(train_ridgedf)
test_ridgedf = reduce_mem_usage(test_ridgedf)

print(train_ridgedf.shape)
print(test_ridgedf.shape)

train_ridgedf.to_feather('../features/train/wordbatch_title_ridge_train.feather')
test_ridgedf.to_feather('../features/test/wordbatch_title_ridge_test.feather')

Memory usage of dataframe is 11.47 MB
Memory usage after optimization is: 5.74 MB
Decreased by 50.0%
Memory usage of dataframe is 3.88 MB
Memory usage after optimization is: 1.94 MB
Decreased by 50.0%
(1503424, 1)
(508438, 1)


In [7]:
del train_ridgedf, test_ridgedf, X_train_1, X_train_2, y_train_1, y_train_2, train_ridge, test_ridge
gc.collect()

7

In [8]:
n_comp = 7
tsvd = TruncatedSVD(n_components=n_comp, algorithm='arpack')
tsvd.fit(X_name_train)

train_svd = pd.DataFrame(tsvd.transform(X_name_train))
test_svd = pd.DataFrame(tsvd.transform(X_name_test))
train_svd.columns = ['svd_wordbatch_title_'+str(i+1) for i in range(n_comp)]
test_svd.columns =  ['svd_wordbatch_title_'+str(i+1) for i in range(n_comp)]

print(train_svd.shape)
print(test_svd.shape)
train_svd.head()


(1503424, 7)
(508438, 7)


Unnamed: 0,svd_wordbatch_title_1,svd_wordbatch_title_2,svd_wordbatch_title_3,svd_wordbatch_title_4,svd_wordbatch_title_5,svd_wordbatch_title_6,svd_wordbatch_title_7
0,1.769956,-0.179334,-0.21295,0.023352,-0.000302,-0.032338,-0.048753
1,1.878044,-0.134059,-0.2423,0.022084,-0.011211,0.042688,-0.023846
2,1.772842,-0.179895,-0.212869,0.0227,-0.000354,-0.032753,-0.048281
3,1.816381,-0.18572,-0.222293,0.0239,-4.4e-05,-0.038439,-0.061262
4,0.000783,-4.1e-05,-0.000222,8.4e-05,-9.2e-05,0.000755,0.000959


In [9]:
train_svd.reset_index(drop=True, inplace=True)
test_svd.reset_index(drop=True, inplace=True)
train_svd = reduce_mem_usage(train_svd)
test_svd = reduce_mem_usage(test_svd)
print(train_svd.shape)
print(test_svd.shape)
train_svd.to_feather('../features/train/wordbatch_title_tsvd_train.feather')
test_svd.to_feather('../features/test/wordbatch_title_tsvd_test.feather')

Memory usage of dataframe is 80.29 MB
Memory usage after optimization is: 40.15 MB
Decreased by 50.0%
Memory usage of dataframe is 27.15 MB
Memory usage after optimization is: 13.58 MB
Decreased by 50.0%
(1503424, 7)
(508438, 7)


In [4]:
# Wordbatch
train = pd.read_csv('../input/train.csv', usecols=['description', 'title'])
test = pd.read_csv('../input/test.csv', usecols=['description', 'title'])

for col in textfeats:
    train[col] = train[col].astype(str)
    train[col] = train[col].fillna('missing')
    train[col] = train[col].str.lower()
    train[col] = train[col].apply(lambda x: cleanName(x))
for col in textfeats:
    test[col] = test[col].astype(str)
    test[col] = test[col].fillna('missing')
    test[col] = test[col].str.lower()
    test[col] = test[col].apply(lambda x: cleanName(x))

print(train.shape)
print(test.shape)

(1503424, 2)
(508438, 2)


In [None]:
%%time
wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2,
                                                              "hash_ngrams_weights": [1.0, 1.0],
                                                              "hash_size": 2 ** 28,
                                                              "norm": "l2",
                                                              "tf": 1.0,
                                                              "idf": None,
                                                              }), procs=8)
wb.dictionary_freeze = True
X_desc_train = wb.fit_transform(train['description'])
print(X_desc_train.shape)
X_desc_test = wb.transform(test['description'])
print(X_desc_test.shape)
del(wb)
gc.collect()

Normalize text


In [None]:
%%time
mask = np.where(X_desc_train.getnnz(axis=0) > 3)[0]
X_desc_train = X_desc_train[:, mask]
print(X_desc_train.shape)
X_desc_test = X_desc_test[:, mask]
print(X_desc_test.shape)

In [10]:
# From pickle file
with open('./wordbatch_description_train.pickle', 'rb') as f:
    X_desc_train = pickle.load(f)
with open('./wordbatch_description_test.pickle', 'rb') as f:
    X_desc_test = pickle.load(f)

print(X_desc_train.shape)
print(X_desc_test.shape)

(1503424, 1167053)
(508438, 1167053)


In [11]:
y = pd.read_csv('../input/train.csv', usecols=['deal_probability'])

X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(X_desc_train,
                                                              y,
                                                              test_size = 0.5,
                                                              shuffle = False)
model = Ridge(solver="sag", fit_intercept=True, random_state=42, alpha=5)
model.fit(X_train_1, y_train_1)
train_ridge = model.predict(X_desc_train)
test_ridge = model.predict(X_desc_test)
print(rmse(model.predict(X_train_2), y_train_2))

model = Ridge(solver="sag", fit_intercept=True, random_state=4882, alpha=5)
model.fit(X_train_2, y_train_2)
train_ridge += model.predict(X_desc_train)
test_ridge += model.predict(X_desc_test)
print(rmse(model.predict(X_train_1), y_train_1))

train_ridge /= 2.0
test_ridge /= 2.0

deal_probability    0.231224
dtype: float64
deal_probability    0.23102
dtype: float64


In [12]:
train_ridgedf = pd.DataFrame()
train_ridgedf['wordbach_description_ridge'] = train_ridge.flatten()
test_ridgedf = pd.DataFrame()
test_ridgedf['wordbach_description_ridge'] = test_ridge.flatten()

print(train_ridgedf.shape)
print(test_ridgedf.shape)

train_ridgedf.head()

(1503424, 1)
(508438, 1)


Unnamed: 0,wordbach_description_ridge
0,0.133309
1,0.137209
2,0.191281
3,0.4334
4,0.358012


In [13]:
train_ridgedf.reset_index(drop=True, inplace=True)
test_ridgedf.reset_index(drop=True, inplace=True)

train_ridgedf = reduce_mem_usage(train_ridgedf)
test_ridgedf = reduce_mem_usage(test_ridgedf)

print(train_ridgedf.shape)
print(test_ridgedf.shape)

train_ridgedf.to_feather('../features/train/wordbatch_description_ridge_train.feather')
test_ridgedf.to_feather('../features/test/wordbatch_description_ridge_test.feather')

Memory usage of dataframe is 11.47 MB
Memory usage after optimization is: 5.74 MB
Decreased by 50.0%
Memory usage of dataframe is 3.88 MB
Memory usage after optimization is: 1.94 MB
Decreased by 50.0%
(1503424, 1)
(508438, 1)


In [14]:
del train_ridgedf, test_ridgedf, X_train_1, X_train_2, y_train_1, y_train_2, train_ridge, test_ridge
gc.collect()

7

In [15]:
n_comp = 7
tsvd = TruncatedSVD(n_components=n_comp, algorithm='arpack')
tsvd.fit(X_desc_train)

train_svd = pd.DataFrame(tsvd.transform(X_desc_train))
test_svd = pd.DataFrame(tsvd.transform(X_desc_test))
train_svd.columns = ['svd_wordbatch_description_'+str(i+1) for i in range(n_comp)]
test_svd.columns =  ['svd_wordbatch_description_'+str(i+1) for i in range(n_comp)]

print(train_svd.shape)
print(test_svd.shape)
train_svd.head()


(1503424, 7)
(508438, 7)


Unnamed: 0,svd_wordbatch_description_1,svd_wordbatch_description_2,svd_wordbatch_description_3,svd_wordbatch_description_4,svd_wordbatch_description_5,svd_wordbatch_description_6,svd_wordbatch_description_7
0,0.010941,-0.004648,0.011228,0.005028,-0.009737,-0.002559,0.007509
1,0.001629,-0.000355,0.001153,0.000376,-0.001365,0.000309,0.00103
2,0.226791,-0.13434,-0.129719,-0.164746,-0.053011,-0.002977,-0.027714
3,0.125448,-0.04612,0.223455,-0.043587,0.18558,-0.065177,-0.037679
4,0.019432,-0.010822,0.013322,-0.010144,0.00083,0.023814,-0.011581


In [16]:
train_svd.reset_index(drop=True, inplace=True)
test_svd.reset_index(drop=True, inplace=True)
train_svd = reduce_mem_usage(train_svd)
test_svd = reduce_mem_usage(test_svd)
print(train_svd.shape)
print(test_svd.shape)
train_svd.to_feather('../features/train/wordbatch_description_tsvd_train.feather')
test_svd.to_feather('../features/test/wordbatch_description_tsvd_test.feather')

Memory usage of dataframe is 80.29 MB
Memory usage after optimization is: 40.15 MB
Decreased by 50.0%
Memory usage of dataframe is 27.15 MB
Memory usage after optimization is: 13.58 MB
Decreased by 50.0%
(1503424, 7)
(508438, 7)


In [None]:
with open('./wordbatch_title_train.pickle', 'rb') as f:
    X_name_train = pickle.load(f)
with open('./wordbatch_title_test.pickle', 'rb') as f:
    X_name_test = pickle.load(f)
with open('./wordbatch_description_train.pickle', 'rb') as f:
    X_desc_train = pickle.load(f)
with open('./wordbatch_description_test.pickle', 'rb') as f:
    X_desc_test = pickle.load(f)

print(X_name_train.shape)
print(X_name_test.shape)
print(X_desc_train.shape)
print(X_desc_test.shape)

In [19]:
# Emsemble
dummy_cols = ['parent_category_name', 'category_name', 'user_type',
            'region', 'city']
df_train = pd.read_csv('../input/train.csv', usecols=dummy_cols)
df_test  = pd.read_csv('../input/test.csv' , usecols=dummy_cols)
y_train = pd.read_csv('../input/train.csv', usecols=['deal_probability'])

sparse_merge_train = hstack((X_name_train, X_desc_train)).tocsr()
sparse_merge_test = hstack((X_name_test, X_desc_test)).tocsr()
print(sparse_merge_train.shape)
for col in dummy_cols:
    print(col)
    lb = LabelBinarizer(sparse_output=True)
    sparse_merge_train = hstack((sparse_merge_train, lb.fit_transform(df_train[[col]].fillna('')))).tocsr()
    print(sparse_merge_train.shape)
    sparse_merge_test = hstack((sparse_merge_test, lb.transform(df_test[[col]].fillna('')))).tocsr()

del X_desc_test, X_name_test
del X_desc_train, X_name_train, lb, df_train, df_test
gc.collect()


(1503424, 1334097)
parent_category_name
(1503424, 1334106)
category_name
(1503424, 1334153)
user_type
(1503424, 1334156)
region
(1503424, 1334184)
city
(1503424, 1335917)


514

In [20]:
X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(sparse_merge_train, y_train,
                                                              test_size = 0.5,
                                                              shuffle = False)

model = Ridge(solver="sag", fit_intercept=True, random_state=205, alpha=3.3)
model.fit(X_train_1, y_train_1)
train_ridge = model.predict(sparse_merge_train)
test_ridge = model.predict(sparse_merge_test)
print(rmse(model.predict(X_train_2), y_train_2))

model = Ridge(solver="sag", fit_intercept=True, random_state=4882, alpha=5)
model.fit(X_train_2, y_train_2)
train_ridge += model.predict(sparse_merge_train)
test_ridge += model.predict(sparse_merge_test)
print(rmse(model.predict(X_train_1), y_train_1))

train_ridge /= 2.0
test_ridge /= 2.0

deal_probability    0.234367
dtype: float64
deal_probability    0.232109
dtype: float64


In [21]:
train_ridgedf = pd.DataFrame()
train_ridgedf['wordbach_ensemble_ridge'] = train_ridge.flatten()
test_ridgedf = pd.DataFrame()
test_ridgedf['wordbach_ensemble_ridge'] = test_ridge.flatten()

print(train_ridgedf.shape)
print(test_ridgedf.shape)

train_ridgedf.head()

(1503424, 1)
(508438, 1)


Unnamed: 0,wordbach_ensemble_ridge
0,0.069136
1,0.210749
2,0.178658
3,0.422455
4,0.330381


In [22]:
train_ridgedf.reset_index(drop=True, inplace=True)
test_ridgedf.reset_index(drop=True, inplace=True)

train_ridgedf = reduce_mem_usage(train_ridgedf)
test_ridgedf = reduce_mem_usage(test_ridgedf)

print(train_ridgedf.shape)
print(test_ridgedf.shape)

train_ridgedf.to_feather('../features/train/wordbatch_ensemble_ridge_train.feather')
test_ridgedf.to_feather('../features/test/wordbatch_ensemble_ridge_test.feather')

Memory usage of dataframe is 11.47 MB
Memory usage after optimization is: 5.74 MB
Decreased by 50.0%
Memory usage of dataframe is 3.88 MB
Memory usage after optimization is: 1.94 MB
Decreased by 50.0%
(1503424, 1)
(508438, 1)


In [23]:
n_comp = 7
tsvd = TruncatedSVD(n_components=n_comp, algorithm='arpack')
tsvd.fit(sparse_merge_train)

train_svd = pd.DataFrame(tsvd.transform(sparse_merge_train))
test_svd = pd.DataFrame(tsvd.transform(sparse_merge_test))
train_svd.columns = ['svd_wordbatch_ensemble_'+str(i+1) for i in range(n_comp)]
test_svd.columns =  ['svd_wordbatch_ensemble_'+str(i+1) for i in range(n_comp)]

print(train_svd.shape)
print(test_svd.shape)
train_svd.head()


(1503424, 7)
(508438, 7)


Unnamed: 0,svd_wordbatch_ensemble_1,svd_wordbatch_ensemble_2,svd_wordbatch_ensemble_3,svd_wordbatch_ensemble_4,svd_wordbatch_ensemble_5,svd_wordbatch_ensemble_6,svd_wordbatch_ensemble_7
0,2.037161,-0.831009,-0.269284,0.22115,-0.265321,-0.072668,-0.088566
1,1.834982,-1.216466,-0.321711,0.11305,-0.007616,-0.010484,-0.269372
2,1.744941,-1.184826,-0.271241,0.082087,0.017441,0.027664,-0.302406
3,1.693178,-0.956834,-0.272879,0.111852,0.235998,0.017325,1.069096
4,0.552597,0.019953,0.086612,0.054564,-0.548707,-0.090681,-1.342931


In [24]:
train_svd.reset_index(drop=True, inplace=True)
test_svd.reset_index(drop=True, inplace=True)
train_svd = reduce_mem_usage(train_svd)
test_svd = reduce_mem_usage(test_svd)
print(train_svd.shape)
print(test_svd.shape)
train_svd.to_feather('../features/train/wordbatch_ensemble_tsvd_train.feather')
test_svd.to_feather('../features/test/wordbatch_ensemble_tsvd_test.feather')

Memory usage of dataframe is 80.29 MB
Memory usage after optimization is: 40.15 MB
Decreased by 50.0%
Memory usage of dataframe is 27.15 MB
Memory usage after optimization is: 13.58 MB
Decreased by 50.0%
(1503424, 7)
(508438, 7)
