In [1]:
import pandas as pd
import numpy as np
from scipy import sparse as ssp
from sklearn.model_selection import KFold
from sklearn.datasets import dump_svmlight_file, load_svmlight_file
from sklearn.utils import resample, shuffle
from sklearn.preprocessing import MinMaxScaler
# Seed NumPy's global RNG; `seed` is also the value referenced by the commented-out
# KFold/shuffle experiments near the end of the notebook.
seed=1024
np.random.seed(seed)
# Directory prefix prepended to the feature/data file names read and written below.
path = "../../kaggle-quora/data/"
# Training table; its `is_duplicate` column is used as the label vector further down.
train = pd.read_csv(path+"train_porter.csv")

In [2]:
# tfidf
def _read_tfidf(filename):
    """Unpickle `filename` from the data directory and return a full slice of it."""
    return pd.read_pickle(path + filename)[:]


# TF-IDF of the raw question text (question 1, then question 2).
train_question1_tfidf, test_question1_tfidf = (
    _read_tfidf('train_question1_tfidf.pkl'),
    _read_tfidf('test_question1_tfidf.pkl'),
)
train_question2_tfidf, test_question2_tfidf = (
    _read_tfidf('train_question2_tfidf.pkl'),
    _read_tfidf('test_question2_tfidf.pkl'),
)

# TF-IDF of the Porter-stemmed question text (question 1, then question 2).
train_question1_porter_tfidf, test_question1_porter_tfidf = (
    _read_tfidf('train_question1_porter_tfidf.pkl'),
    _read_tfidf('test_question1_porter_tfidf.pkl'),
)
train_question2_porter_tfidf, test_question2_porter_tfidf = (
    _read_tfidf('train_question2_porter_tfidf.pkl'),
    _read_tfidf('test_question2_porter_tfidf.pkl'),
)

In [3]:
# interaction
# Each pickle is turned into an (n, 1) column so it can be stacked next to the sparse
# feature blocks. The object is passed through np.asarray before reshaping so the
# column is built the same way whether the pickle holds an ndarray or a pandas Series
# (Series.reshape is deprecated/removed in pandas).
# NOTE(review): the pickled container type is not visible here — confirm what it holds.
train_interaction = np.asarray(pd.read_pickle(path+'train_interaction.pkl')[:]).reshape(-1,1)
test_interaction = np.asarray(pd.read_pickle(path+'test_interaction.pkl')[:]).reshape(-1,1)

# Same feature computed on the Porter-stemmed text.
train_porter_interaction = np.asarray(pd.read_pickle(path+'train_porter_interaction.pkl')[:]).reshape(-1,1)
test_porter_interaction = np.asarray(pd.read_pickle(path+'test_porter_interaction.pkl')[:]).reshape(-1,1)

  
  This is separate from the ipykernel package so we can avoid doing imports until
  """
  


In [4]:
# jaccard distance
# Each pickle is turned into an (n, 1) column so it can be stacked next to the sparse
# feature blocks. np.asarray is applied first so the reshape does not depend on the
# pickled container offering its own .reshape (pandas Series.reshape is
# deprecated/removed).
# NOTE(review): the pickled container type is not visible here — confirm what it holds.
train_jaccard = np.asarray(pd.read_pickle(path+'train_jaccard.pkl')[:]).reshape(-1,1)
test_jaccard = np.asarray(pd.read_pickle(path+'test_jaccard.pkl')[:]).reshape(-1,1)

# Same feature computed on the Porter-stemmed text.
train_porter_jaccard = np.asarray(pd.read_pickle(path+'train_porter_jaccard.pkl')[:]).reshape(-1,1)
test_porter_jaccard = np.asarray(pd.read_pickle(path+'test_porter_jaccard.pkl')[:]).reshape(-1,1)

  
  This is separate from the ipykernel package so we can avoid doing imports until
  """
  


In [5]:
# len
# Length features, min-max scaled with a scaler fitted on train and test rows together.
train_len, test_len = (
    pd.read_pickle(path + filename) for filename in ("train_len.pkl", "test_len.pkl")
)
scaler = MinMaxScaler().fit(np.vstack([train_len, test_len]))
train_len, test_len = scaler.transform(train_len), scaler.transform(test_len)

In [6]:
# jaccard + magic
# DataFrame.as_matrix() no longer exists in current pandas; .values yields the same
# NumPy array and is the accessor the rest of this notebook already uses.
train_jac_magic = pd.read_pickle(path+'train_jaccard_magic_features.pkl').values
test_jac_magic = pd.read_pickle(path+'test_jaccard_magic_features.pkl').values
# The scaler is fitted on train and test rows together, then applied to each split.
scaler = MinMaxScaler()
scaler.fit(np.vstack([train_jac_magic, test_jac_magic]))
train_jac_magic = scaler.transform(train_jac_magic)
test_jac_magic = scaler.transform(test_jac_magic)

In [7]:
%%time

def _hstack_csr(blocks):
    """Concatenate the feature blocks column-wise and return the result as CSR."""
    return ssp.hstack(blocks).tocsr()


# Training design matrix: raw TF-IDF of both questions followed by the dense
# interaction / jaccard / length / magic columns.
# NOTE(review): the *_porter_tfidf matrices loaded earlier are not part of either
# stack — confirm that this is intentional.
X = _hstack_csr([
    train_question1_tfidf,
    train_question2_tfidf,
    train_interaction,
    train_porter_interaction,
    train_jaccard,
    train_porter_jaccard,
    train_len,
    train_jac_magic,
])

# Labels, taken from the training table.
y = train['is_duplicate'].values[:]

# Test design matrix, with the blocks in the same order as for training.
X_t = _hstack_csr([
    test_question1_tfidf,
    test_question2_tfidf,
    test_interaction,
    test_porter_interaction,
    test_jaccard,
    test_porter_jaccard,
    test_len,
    test_jac_magic,
])

# One shape per line: train first, then test.
print(X.shape, X_t.shape, sep="\n")

(404290, 3073589)
(2345796, 3073589)
CPU times: user 1min 10s, sys: 21.5 s, total: 1min 32s
Wall time: 1min 39s


In [8]:
# Show the container types of the design matrix and the label vector, one per line.
print(type(X), type(y), sep="\n")

<class 'scipy.sparse.csr.csr_matrix'>
<class 'numpy.ndarray'>


In [10]:
# Persist the test design matrix in scipy's .npz format. X_t was already converted
# with .tocsr() when it was built; it is wrapped in csr_matrix again here before saving.
# NOTE(review): this writes to the current directory, while the reload cell further
# down reads '../../kaggle-quora/fm/x_test.npz' — presumably the notebook is run from
# that fm/ directory; confirm.
ssp.save_npz('./x_test.npz', ssp.csr_matrix(X_t))

In [10]:
def oversample(X_ot, y, p=0.165):
    """Repeat negative rows of ``X_ot`` to lower the share of positive rows.

    Rows with ``y == 1`` are kept once; rows with ``y == 0`` are repeated. The
    amount of repetition is driven by ``scale = (positive_rate / p) - 1``.

    Parameters
    ----------
    X_ot : scipy sparse matrix
        Feature matrix that supports boolean row-mask indexing (e.g. CSR).
    y : array-like
        Labels aligned with the rows of ``X_ot``; 1 marks positives, 0 negatives.
    p : float, default 0.165
        Positive-rate parameter used in the ``scale`` formula.

    Returns
    -------
    ot : scipy.sparse.csr_matrix
        All positive rows first, followed by the (repeated) negative rows.
    y : numpy.ndarray
        Float labels for ``ot``: 1.0 for the leading positive rows, 0.0 elsewhere.

    Notes
    -----
    * The mean of the returned labels is printed.
    * When ``scale > 1`` the loop doubles the negatives while ``scale`` only drops by
      one per pass, and the final partial copy is sized from the already enlarged
      block. The resulting rate therefore differs from ``p`` (the recorded run prints
      about 0.191 for ``p=0.165``). This is deliberately left unchanged: the notebook
      indexes the returned rows with stored fold indices, so the row count and row
      order of the output are part of the contract.
    """
    # Accept plain sequences as labels; `y == 1` must yield an element-wise mask.
    y = np.asarray(y)
    pos_ot = X_ot[y == 1]
    neg_ot = X_ot[y == 0]
    n_pos = pos_ot.shape[0]
    scale = ((n_pos * 1.0 / (n_pos + neg_ot.shape[0])) / p) - 1

    while scale > 1:
        neg_ot = ssp.vstack([neg_ot, neg_ot]).tocsr()
        scale -= 1

    # If the positive rate is already at or below p, scale is negative. Without the
    # clamp that turned into a negative slice bound (neg_ot[:-k]) and appended almost
    # every negative row a second time instead of appending nothing.
    n_extra = max(int(scale * neg_ot.shape[0]), 0)
    if n_extra:
        neg_ot = ssp.vstack([neg_ot, neg_ot[:n_extra]]).tocsr()

    ot = ssp.vstack([pos_ot, neg_ot]).tocsr()
    y = np.zeros(ot.shape[0])
    y[:n_pos] = 1.0
    print(y.mean())

    return ot, y

In [11]:
# Build the oversampled training set from the full design matrix and labels.
# oversample() prints the mean of the labels it returns (0.191... in the recorded run).
X_oversample, y_oversample = oversample(X, y, p=0.165)

0.191243661001


In [12]:
def _read_row_index(csv_path):
    """Read an index CSV and return all of its values as one flat array."""
    return pd.read_csv(csv_path).values.flatten()


# Stored row indices for the two training folds and the validation fold; they are
# used below to select rows of the oversampled matrix.
fold1_index = _read_row_index('../index/fold1_index.csv')
fold2_index = _read_row_index('../index/fold2_index.csv')
validation_index = _read_row_index('../index/validation_index.csv')

In [13]:
# Select the rows of each fold from the oversampled features ...
train_fold1, train_fold2, validation_fold = (
    X_oversample[rows] for rows in (fold1_index, fold2_index, validation_index)
)

# ... and the matching entries of the oversampled labels.
y_train_fold1, y_train_fold2, y_validation_fold = (
    y_oversample[rows] for rows in (fold1_index, fold2_index, validation_index)
)

In [14]:
%%time
########################################
### Saving sparse matrix for py2 FM ###
########################################

def _save_labels_npz(filename, labels):
    """Wrap a label vector in a CSR matrix and write it with ``save_npz``."""
    ssp.save_npz(filename, ssp.csr_matrix(labels))


# Sanity check of the container types about to be written (features, then labels).
print(type(train_fold1), type(train_fold2), sep="\n")
print(type(ssp.csr_matrix(y_train_fold1)), type(ssp.csr_matrix(y_train_fold2)), sep="\n")

# Feature matrices of the three folds.
ssp.save_npz('./x_train_fold1.npz', train_fold1)
ssp.save_npz('./x_train_fold2.npz', train_fold2)
ssp.save_npz('./x_validation_fold.npz', validation_fold)

# Label vectors of the three folds, stored as sparse matrices.
_save_labels_npz('./y_train_fold1.npz', y_train_fold1)
_save_labels_npz('./y_train_fold2.npz', y_train_fold2)
_save_labels_npz('./y_validation_fold.npz', y_validation_fold)

<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
CPU times: user 23.2 s, sys: 1.68 s, total: 24.9 s
Wall time: 25.4 s


In [15]:
# Drop the in-memory fold matrices/labels and the oversampled set; the folds were
# written to .npz files in the previous cell and are reloaded from disk below.
del train_fold1, train_fold2, validation_fold, y_train_fold1, y_train_fold2, y_validation_fold
del X_oversample, y_oversample

# Load Data Directly Instead of Processing Again.

In [15]:
##########################
### Factorized Machine ###
##########################

from scipy import sparse
from scipy.sparse import csc_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import dump_svmlight_file

# Feature matrices of the two training folds and the validation fold.
train_fold1, train_fold2, validation_fold = (
    sparse.load_npz(npz_path)
    for npz_path in (
        "../../kaggle-quora/fm/x_train_fold1.npz",
        "../../kaggle-quora/fm/x_train_fold2.npz",
        "../../kaggle-quora/fm/x_validation_fold.npz",
    )
)

# Matching label vectors, stored as sparse matrices.
y_train_fold1, y_train_fold2, y_validation_fold = (
    sparse.load_npz(npz_path)
    for npz_path in (
        "../../kaggle-quora/fm/y_train_fold1.npz",
        "../../kaggle-quora/fm/y_train_fold2.npz",
        "../../kaggle-quora/fm/y_validation_fold.npz",
    )
)

x_test = sparse.load_npz('../../kaggle-quora/fm/x_test.npz')

# Stack the three folds back into one training set: features row-wise, labels
# side by side and then flattened into a dense 1-D array.
x_train = sparse.vstack((train_fold1, train_fold2, validation_fold), format='csr')
y_train = (
    sparse.hstack((y_train_fold1, y_train_fold2, y_validation_fold), format='csr')
    .T
    .toarray()
    .flatten()
)

In [24]:
# Generate 5 folds svm data for training...
# Stratified 5-fold split over the stacked training rows; the train and validation
# parts of every fold are written as svmlight files whose names carry the fold number.

skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
# enumerate(start=1) supplies the 1-based fold number used in the log lines and file
# names, replacing a hand-maintained counter.
for folds, (train_index, val_index) in enumerate(skf.split(x_train, y_train), start=1):
    print('\nFold: ', folds)

    x_train_fold = x_train[train_index]
    y_train_fold = y_train[train_index]
    print("Train: ", x_train_fold.shape, y_train_fold.shape)

    x_val_fold = x_train[val_index]
    y_val_fold = y_train[val_index]
    print("Val: ", x_val_fold.shape, y_val_fold.shape)

    dump_svmlight_file(x_train_fold, y_train_fold, "./train_tfidf_jacad_magic_fold_{}.svm".format(folds))
    dump_svmlight_file(x_val_fold, y_val_fold, "./val_tfidf_jacad_magic_fold_{}.svm".format(folds))


Fold:  1
Train:  (624388, 3073589) (624388,)
Val:  (156098, 3073589) (156098,)

Fold:  2
Train:  (624388, 3073589) (624388,)
Val:  (156098, 3073589) (156098,)

Fold:  3
Train:  (624388, 3073589) (624388,)
Val:  (156098, 3073589) (156098,)

Fold:  4
Train:  (624390, 3073589) (624390,)
Val:  (156096, 3073589) (156096,)

Fold:  5
Train:  (624390, 3073589) (624390,)
Val:  (156096, 3073589) (156096,)


In [61]:
# skf = KFold(n_splits=5, shuffle=True, random_state=seed).split(X)
# for ind_tr, ind_te in skf:
#     X_train = X[ind_tr]
#     X_test = X[ind_te]

#     y_train = y[ind_tr]
#     y_test = y[ind_te]
#     break

# Write the full training design matrix X and its labels y in svmlight format, then
# release X.
dump_svmlight_file(X, y, path+"X_tfidf_jacad_magic.svm")
del X
# The test matrix is written with an all-zero label vector of matching length (the
# svmlight writer is given a label per row), then released as well.
dump_svmlight_file(X_t, np.zeros(X_t.shape[0]), path + "X_t_tfidf_jacad_magic.svm")
del X_t

In [63]:
# %%time

# X_train, y_train = oversample(X_train.tocsr(), y_train, p=0.165)
# X_test, y_test = oversample(X_test.tocsr(), y_test, p=0.165)

# X_train, y_train = shuffle(X_train, y_train, random_state=seed)

# dump_svmlight_file(X_train, y_train, path + "X_train_tfidf_jacad_magic.svm")
# dump_svmlight_file(X_test, y_test, path + "X_test_tfidf_jacad_magic.svm")

0.191269277687
0.191144081052
CPU times: user 3min 17s, sys: 8.93 s, total: 3min 26s
Wall time: 3min 32s
