# Utility methods

In [1]:
import numpy
import pandas
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from itertools import combinations

def load_problem_flight(large=False, convert_to_ints=False):
    '''
    Load the flight-delay dataset used in common ML benchmarks:
    https://github.com/szilard/benchm-ml

    Links to the source files:
    https://s3.amazonaws.com/benchm-ml--main/test.csv
    https://s3.amazonaws.com/benchm-ml--main/train-0.1m.csv
    https://s3.amazonaws.com/benchm-ml--main/train-1m.csv
    https://s3.amazonaws.com/benchm-ml--main/train-10m.csv

    large: read the 10m training file instead of the 1m one.
    convert_to_ints: when true, label-encode the categorical columns, bucket
        'Distance' by training-set deciles and floor-divide 'DepTime' by 100.

    Returns (trainX, testX, trainY, testY). The targets are 0/1 arrays holding
    1 where 'dep_delayed_15min' equals 'Y'; that column is removed from the
    feature frames.
    '''
    train_csv = 'data/flight/flight_train-10m.csv' if large else 'data/flight/flight_train-1m.csv'
    trainX = pandas.read_csv(train_csv)
    testX = pandas.read_csv('data/flight/flight_test.csv')

    target = 'dep_delayed_15min'
    trainY = (trainX[target].values == 'Y') * 1
    testY = (testX[target].values == 'Y') * 1
    trainX = trainX.drop(target, axis=1)
    testX = testX.drop(target, axis=1)

    if not convert_to_ints:
        return trainX, testX, trainY, testY

    label_columns = ['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest']
    bucket_columns = ['Distance']
    trainX, testX = process_categorical_features(trainX, testX, columns=label_columns)
    trainX, testX = process_continuous_features(trainX, testX, columns=bucket_columns)

    # Keep only the leading digits of DepTime (e.g. 1828 -> 18).
    for frame in (trainX, testX):
        frame['DepTime'] = frame['DepTime'] // 100

    return trainX, testX, trainY, testY

def load_problem_flight_extended(large=False):
    '''
    Flight dataset with pairwise interaction features added.

    Starts from load_problem_flight(convert_to_ints=True), adds one column per
    pair drawn from UniqueCarrier / Origin / Dest / DepTime (named
    '<first>_<second>', valued first * 10000 + second) and finally
    label-encodes every column against the training set.
    '''
    trainX, testX, trainY, testY = load_problem_flight(large=large, convert_to_ints=True)

    crossed_sources = ['UniqueCarrier', 'Origin', 'Dest', 'DepTime']
    for left, right in combinations(crossed_sources, 2):
        crossed_name = left + '_' + right
        for frame in (trainX, testX):
            frame[crossed_name] = frame[left] * 10000 + frame[right]

    trainX, testX = process_categorical_features(trainX, testX, columns=trainX.columns)
    return trainX, testX, trainY, testY

def load_problem_movielens_100k(all_features=False):
    '''Standard test dataset for recommendation systems (MovieLens 100k).
    From http://grouplens.org/datasets/movielens/

    all_features: when false only the 'user' and 'movie' ids are used; when
        true the user attributes and movie attributes (release year and genre
        flags) are merged in as well.

    Every feature column is replaced by integer codes (its position among the
    column's sorted unique values). Returns (trainX, testX, trainY, testY) from
    a 75/25 split with random_state=42; the targets are the ratings.
    '''
    folder = 'data/ml-100k'
    ratings = pandas.read_csv(
        folder + '/u.data',
        sep='\t',
        names=['user', 'movie', 'rating', 'timestamp'],
        header=None,
    )
    ratings = ratings.drop('timestamp', axis=1)

    if all_features:
        user_fields = ['user', 'age', 'gender', 'occupation', 'zip']
        movie_fields = [
            'movie', 'title', 'released', 'video_release', 'IMDb URL',
            'unknown', 'Action', 'Adventure', 'Animation', 'Children',
            'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
            'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
            'War', 'Western',
        ]
        users = pandas.read_csv(folder + '/u.user', sep='|', names=user_fields, header=None)
        movies = pandas.read_csv(folder + '/u.item', sep='|', names=movie_fields,
                                 header=None, encoding='latin-1')

        movies = movies.drop(['title', 'IMDb URL', 'video_release'], axis=1)
        # Only the year of the release date is kept as a feature.
        release_dates = pandas.to_datetime(movies['released'])
        movies['released'] = release_dates.map(lambda stamp: stamp.year)

        ratings_with_users = pandas.merge(ratings, users, on='user')
        ratings = pandas.merge(ratings_with_users, movies, on='movie')

    answers = ratings['rating'].values
    ratings = ratings.drop('rating', axis=1)

    for feature in ratings.columns:
        codes = numpy.unique(ratings[feature], return_inverse=True)[1]
        ratings[feature] = codes

    trainX, testX, trainY, testY = train_test_split(
        ratings, answers, test_size=0.25, random_state=42)
    return trainX, testX, trainY, testY


def load_problem_movielens_1m(all_features=False):
    '''
    Standard test dataset for recommendation systems (MovieLens 1M).
    From http://grouplens.org/datasets/movielens/

    all_features: when false only the 'user' and 'movie' ids are used; when
        true the user attributes and a bag-of-words encoding of each movie's
        genre string are merged in as well.

    Every feature column is replaced by integer codes (its position among the
    column's sorted unique values). Returns (trainX, testX, trainY, testY) from
    a 75/25 split with random_state=42; the targets are the ratings.
    '''
    folder = 'data/ml-1m'
    # '::' is a multi-character separator, which only pandas' python parser
    # handles; naming the engine explicitly avoids the ParserWarning that the
    # implicit fallback from the C parser emits.
    dat_options = dict(sep='::', header=None, engine='python')

    ratings = pandas.read_csv(folder + '/ratings.dat',
                              names=['user', 'movie', 'rating', 'timestamp'], **dat_options)
    ratings = ratings.drop('timestamp', axis=1)
    if all_features:
        users = pandas.read_csv(folder + '/users.dat',
                                names=['user', 'gender', 'age', 'occupation', 'zip'], **dat_options)
        # latin-1 decodes any byte sequence, so a non-UTF-8 title cannot abort
        # the load (same choice as the 100k loader); titles are dropped below.
        movies = pandas.read_csv(folder + '/movies.dat', names=['movie', 'title', 'genres'],
                                 encoding='latin-1', **dat_options)
        # Genres arrive as 'A|B|C'; turn them into one count column per token.
        genre_text = movies.genres.map(lambda x: x.replace('|', ' '))
        sparse_genres = CountVectorizer().fit_transform(genre_text)
        # toarray() gives a plain ndarray instead of the legacy numpy.matrix.
        sparse_genres = pandas.DataFrame(sparse_genres.toarray())
        movies = pandas.concat([movies[['movie']], sparse_genres], axis=1)
        ratings = pandas.merge(pandas.merge(ratings, users, on='user'), movies, on='movie')

    answers = ratings['rating'].values
    ratings = ratings.drop('rating', axis=1)

    for feature in ratings.columns:
        _, ratings[feature] = numpy.unique(ratings[feature], return_inverse=True)

    trainX, testX, trainY, testY = train_test_split(ratings, answers, test_size=0.25, random_state=42)
    return trainX, testX, trainY, testY


def preprocess_ad_problem():
    """
    Kaggle competition on CTR prediction: https://www.kaggle.com/c/avazu-ctr-prediction

    Reads '../data/ad_train.csv', replaces every column except 'hour' by
    integer codes (position among the column's sorted unique values), stores
    each column in the smallest of uint8 / uint16 / uint32 that the thresholds
    below allow, and writes the frame to '../data/ad_updated_train.hdf5' under
    the key 'data'.
    """
    av_train = pandas.read_csv('../data/ad_train.csv')

    for column in av_train.columns:
        values = av_train[column]
        if column != 'hour':
            # Leave the codes at numpy's default integer width until the
            # maximum is known: casting to uint16 first silently wraps any
            # column with more than 65536 distinct values.
            values = numpy.unique(values, return_inverse=True)[1]

        largest = numpy.max(values)
        if largest < 250:
            compact_dtype = 'uint8'
        elif largest < 65000:
            compact_dtype = 'uint16'
        else:
            compact_dtype = 'uint32'
        av_train[column] = values.astype(compact_dtype)

    # `key` is passed by keyword: positional use is deprecated in recent pandas.
    av_train.to_hdf('../data/ad_updated_train.hdf5', key='data')
            
def load_problem_ad(train_size=1000000, test_size=10000000):
    """
    Kaggle competition on CTR prediction: https://www.kaggle.com/c/avazu-ctr-prediction
    Run preprocess_ad_problem() first; this reads the HDF5 file it writes.

    The stored 'hour' value is split into 'day' ((hour // 100) % 100) and
    'hour' (hour % 100). 'click' is the target. Returns
    (trainX, testX, trainY, testY) with the targets cast to int, from a
    train_test_split with the given sizes and random_state=42.
    """
    data = pandas.read_hdf('../data/ad_updated_train.hdf5', 'data')

    packed_hour = data['hour']
    data['day'] = (packed_hour // 100) % 100
    data['hour'] = packed_hour % 100

    answers = data['click'].values
    features = data.drop('click', axis=1)

    trainX, testX, trainY, testY = train_test_split(
        features, answers,
        train_size=train_size, test_size=test_size, random_state=42,
    )
    return trainX, testX, trainY.astype('int'), testY.astype('int')

def remap(column, lookup):
    """Label-encode `column` against the sorted array `lookup`.

    A value present in `lookup` maps to its 1-based position there; any other
    value maps to 0. `lookup` must be sorted (e.g. the output of numpy.unique)
    because positions are found with a binary search.

    Returns an integer array with the same shape as `column`.
    """
    positions = numpy.searchsorted(lookup, column) + 1
    # numpy.isin keeps the input's shape (numpy.in1d is deprecated and always
    # flattens), so the mask lines up with `positions` for any dimensionality.
    known = numpy.isin(column, lookup)
    return positions * known

def process_categorical_features(trainX, testX, columns, copy=True):
    """Label-encode the given columns of both frames using the training values.

    For each column the sorted unique training values form the lookup; both
    frames are then encoded with remap(), so a test value that never occurs in
    training becomes 0 and known values become 1-based positions.

    copy: work on copies of the frames (default) instead of modifying the
        caller's frames in place.

    Returns the (train, test) pair of encoded frames.
    """
    train_frame = trainX.copy() if copy else trainX
    test_frame = testX.copy() if copy else testX

    for name in columns:
        # Hand-written label encoding, fitted on the training column only.
        known_values = numpy.unique(train_frame[name])
        train_frame[name] = remap(train_frame[name], known_values)
        test_frame[name] = remap(test_frame[name], known_values)

    return train_frame, test_frame
        
def process_continuous_features(trainX, testX, columns, copy=True,
                                percentiles=(10, 20, 30, 40, 50, 60, 70, 80, 90)):
    """Discretize continuous columns into buckets holding similar numbers of rows.

    For each column the cut points are the requested percentiles of the
    TRAINING column; both frames are then replaced by the index of the bucket
    each value falls into (0 .. len(percentiles)), stored as uint8.

    trainX, testX: frames containing the columns to bucket.
    columns: names of the columns to discretize.
    copy: work on copies (default) instead of modifying the frames in place.
    percentiles: percentile ranks (0-100) used as cut points; the default
        reproduces the original decile buckets.

    Returns the (train, test) pair of discretized frames.
    Raises ValueError when more than 255 cut points are requested, because the
    bucket index would no longer fit into uint8.
    """
    if len(percentiles) > 255:
        raise ValueError('at most 255 percentiles are supported (bucket index is uint8)')

    if copy:
        trainX = trainX.copy()
        testX = testX.copy()

    for column in columns:
        # Cut points come from the training data only, so train and test
        # share the same bucket boundaries.
        thresholds = numpy.percentile(trainX[column], percentiles)
        trainX[column] = numpy.searchsorted(thresholds, trainX[column]).astype('uint8')
        testX[column] = numpy.searchsorted(thresholds, testX[column]).astype('uint8')

    return trainX, testX







import numpy
import pandas
import pickle
from sklearn.metrics import roc_auc_score, mean_squared_error

from fastFM.mcmc import FMClassification, FMRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.datasets import dump_svmlight_file
from pyfm import pylibfm

import pdb

def fitpredict_logistic(trainX, trainY, testX, classification=True, **params):
    """Linear baseline on one-hot encoded features.

    The encoder is fitted on trainX; categories that only occur in testX are
    ignored. For classification a LogisticRegression is trained and the
    probability of the second class is returned; otherwise a Ridge regression
    is trained and its predictions are returned. `params` are forwarded to the
    chosen estimator.
    """
    encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    train_encoded = encoder.transform(trainX)
    test_encoded = encoder.transform(testX)

    if not classification:
        regressor = Ridge(**params)
        regressor.fit(train_encoded, trainY)
        return regressor.predict(test_encoded)

    classifier = LogisticRegression(**params)
    classifier.fit(train_encoded, trainY)
    return classifier.predict_proba(test_encoded)[:, 1]

def fitpredict_fastfm(trainX, trainY, testX, classification=True, rank=8, n_iter=100):
    """Factorization machine from fastFM (MCMC solver) on one-hot encoded features.

    The encoder is fitted on trainX; categories that only occur in testX are
    ignored. Returns fit_predict_proba(...) of FMClassification for
    classification and fit_predict(...) of FMRegression otherwise.
    """
    encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    train_encoded = encoder.transform(trainX)
    test_encoded = encoder.transform(testX)

    if not classification:
        regressor = FMRegression(rank=rank, n_iter=n_iter)
        return regressor.fit_predict(train_encoded, trainY, test_encoded)

    classifier = FMClassification(rank=rank, n_iter=n_iter)
    return classifier.fit_predict_proba(train_encoded, trainY, test_encoded)

def fitpredict_libfm(trainX, trainY, testX, classification=True, rank=8, n_iter=100,
                     libfm_path='/home/kai/data/resources/libfm/bin/libFM'):
    """Factorization machine via the libFM command-line tool (MCMC method).

    The features are one-hot encoded (encoder fitted on trainX, unseen test
    categories ignored), written in svmlight format to 'libfm_train.txt' and
    'libfm_test.txt' in the working directory, and libFM is run on them. Its
    predictions are read back from 'output.libfm'.

    classification: selects libFM task 'c' (true) or 'r' (false).
    rank: number of pairwise factors, passed as -dim '1,1,<rank>'.
    n_iter: number of MCMC iterations.
    libfm_path: location of the libFM executable.

    Returns a flat array with one prediction per row of testX.
    Raises subprocess.CalledProcessError when libFM exits with an error, so a
    stale 'output.libfm' from an earlier run is never returned by mistake.
    """
    import subprocess  # function-scope import keeps the file's import block untouched

    encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    trainX = encoder.transform(trainX)
    testX = encoder.transform(testX)
    train_file = 'libfm_train.txt'
    test_file = 'libfm_test.txt'
    with open(train_file, 'wb') as f:
        dump_svmlight_file(trainX, trainY, f=f)
    with open(test_file, 'wb') as f:
        # The test targets are unknown here; write zeros as placeholders.
        dump_svmlight_file(testX, numpy.zeros(testX.shape[0]), f=f)
    task = 'c' if classification else 'r'

    # Plain-Python replacement for the IPython `!` shell escape: an argument
    # list needs no shell quoting and works outside a notebook as well.
    command = [
        libfm_path,
        '-task', task,
        '-method', 'mcmc',
        '-train', train_file,
        '-test', test_file,
        '-iter', str(n_iter),
        '-dim', '1,1,{}'.format(rank),
        '-out', 'output.libfm',
    ]
    # Console output is captured (and attached to the exception on failure)
    # rather than printed.
    subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True)

    libfm_pred = pandas.read_csv('output.libfm', header=None).values.flatten()
    return libfm_pred

def fitpredict_pylibfm(trainX, trainY, testX, classification=True, rank=8, n_iter=100):
    """Factorization machine from pylibfm on normalized one-hot features.

    The encoder is fitted on trainX (unseen test categories are ignored) and
    both encoded matrices are passed through sklearn's normalize(), which
    pylibfm needs in order to work. Returns fm.predict(...) for the test rows.
    """
    from sklearn.preprocessing import normalize

    encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    train_rows = normalize(encoder.transform(trainX))
    test_rows = normalize(encoder.transform(testX))

    if classification:
        task = 'classification'
        targets = trainY
    else:
        task = 'regression'
        targets = trainY * 1.0  # regression targets are handed over as floats

    fm = pylibfm.FM(num_factors=rank, num_iter=n_iter, verbose=False, task=task)
    fm.fit(train_rows, targets)
    return fm.predict(test_rows)






from collections import OrderedDict
import time

# Benchmark results keyed by task name; restored from an earlier run when a
# saved file is present, otherwise starting empty.
all_results = OrderedDict()
try:
    # Pickle data is binary: with the file opened in text mode pickle.load
    # always fails on Python 3, and the former bare `except` hid that, so
    # saved results were never actually restored.
    # NOTE: only load a file this notebook wrote itself - unpickling untrusted
    # data can execute arbitrary code.
    with open('./saved_results.pkl', 'rb') as f:
        all_results = pickle.load(f)
except FileNotFoundError:
    # Nothing saved yet: keep the empty mapping.
    pass
except Exception as error:
    # Still best effort, but report the problem instead of hiding it.
    print('could not restore ./saved_results.pkl: {!r}'.format(error))

def test_on_dataset(trainX, testX, trainY, testY, task_name, classification=True, use_pylibfm=True):
    """Run every benchmark algorithm on one dataset and tabulate time and quality.

    The algorithms are, in order: logistic, libFM, fastFM and (only when
    use_pylibfm is true) pylibfm. Each is called as
    fit_predict(trainX, trainY, testX, classification=classification).

    Returns a DataFrame indexed by algorithm name with a 'time' column (wall
    clock seconds for fit + predict) and either 'ROC AUC' (classification) or
    'RMSE' (regression). The same frame is also stored in the module-level
    all_results under task_name.
    """
    from tqdm import tqdm

    algorithms = OrderedDict([
        ('logistic', fitpredict_logistic),
        ('libFM', fitpredict_libfm),
        ('fastFM', fitpredict_fastfm),
    ])
    if use_pylibfm:
        algorithms['pylibfm'] = fitpredict_pylibfm

    results = pandas.DataFrame()
    for name, fit_predict in tqdm(algorithms.items()):
        started = time.time()
        predictions = fit_predict(trainX, trainY, testX, classification=classification)
        results.loc[name, 'time'] = time.time() - started

        if classification:
            results.loc[name, 'ROC AUC'] = roc_auc_score(testY, predictions)
        else:
            squared_errors = (testY - predictions) ** 2
            results.loc[name, 'RMSE'] = numpy.mean(squared_errors) ** 0.5

    all_results[task_name] = results
    # NOTE: writing all_results to 'saved_results.pkl' is currently disabled.

    return results

# data with only 'user' and 'movie'

In [3]:
trainX, testX, trainY, testY = load_problem_movielens_100k(all_features=False)
trainX.shape

(75000, 2)

In [4]:
trainX.head()

Unnamed: 0,user,movie
98980,810,900
69824,803,754
9928,51,286
75599,734,180
95621,896,95


In [5]:
test1res = test_on_dataset(trainX, testX, trainY, testY, task_name='ml100k, ids', classification=False)

100%|██████████| 4/4 [01:22<00:00, 20.72s/it]


In [6]:
test1res

Unnamed: 0,time,RMSE
logistic,0.545414,0.942665
libFM,1.615051,0.940752
fastFM,1.913722,0.915184
pylibfm,78.811907,0.925223


# data with more features

In [7]:
trainX, testX, trainY, testY = load_problem_movielens_100k(all_features=True)

In [8]:
trainX.shape

(75000, 26)

In [9]:
trainX.head(2)

Unnamed: 0,user,movie,age,gender,occupation,zip,released,unknown,Action,Adventure,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
98980,692,1310,33,0,7,615,68,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69824,931,528,48,1,3,59,57,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
test2res = test_on_dataset(trainX, testX, trainY, testY, task_name='ml100k', classification=False)

100%|██████████| 4/4 [05:58<00:00, 89.54s/it]


In [12]:
test2res

Unnamed: 0,time,RMSE
logistic,2.190086,0.942356
libFM,11.497473,0.941219
fastFM,20.73458,0.896543
pylibfm,323.732238,0.935298


# Example to show how to handle categorical data

In [13]:
trainX, testX, trainY, testY = load_problem_flight()

In [14]:
# `numpy` is the name imported at the top of the file; the `np` alias is only
# introduced by a much later cell.
numpy.unique(trainY)

array([0, 1])

In [15]:
trainX.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-4,c-26,c-2,1828,XE,LEX,IAH,828
1,c-12,c-11,c-1,1212,UA,DEN,MCI,533
2,c-10,c-1,c-6,935,OH,HSV,CVG,325
3,c-11,c-26,c-6,930,OH,JFK,PNS,1028
4,c-12,c-6,c-2,1350,MQ,DFW,LBB,282


In [16]:
trainX, testX, trainY, testY = load_problem_flight(convert_to_ints=True)

In [18]:
trainX.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,7,19,2,18,21,157,133,6
1,4,3,1,12,18,79,172,4
2,2,1,6,9,15,129,72,2
3,3,19,6,9,15,147,220,7
4,4,28,2,13,13,80,155,2


In [4]:
trainX.shape

(1000000, 8)

In [None]:
test3res = test_on_dataset(trainX, testX, trainY, testY, task_name='flight', classification=True, use_pylibfm=False)

In [7]:
test3res

Unnamed: 0,time,ROC AUC
logistic,16.293235,0.724488
libFM,56.644698,0.724223
fastFM,235.460533,0.733395


# more data (increased from 100k to 1m)

In [None]:
trainX, testX, trainY, testY = load_problem_movielens_1m(all_features=True)

In [10]:
trainX.shape

(750156, 26)

In [11]:
trainX.head(1)

Unnamed: 0,user,movie,gender,age,occupation,zip,0,1,2,3,...,10,11,12,13,14,15,16,17,18,19
610738,5245,2240,0,1,0,2168,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# `numpy` is the name imported at the top of the file; the `np` alias is only
# introduced by a later cell.
numpy.unique(trainY)

array([1, 2, 3, 4, 5])

In [152]:
test4res = test_on_dataset(trainX, testX, trainY, testY, task_name='ml-1m,ids', classification=False)

In [153]:
test4res

Unnamed: 0,time,RMSE
logistic,13.917847,0.910699
libFM,16.438905,0.910286
fastFM,41.518702,0.858305
pylibfm,89.339065,0.872631


# Example showing how to run a shell command in a Jupyter notebook

In [132]:
!$LIBFM_PATH -task $task -method mcmc -train $train_file -test $test_file -iter $n_iter -dim '1,1,$rank'# -out output.libfm

----------------------------------------------------------------------------
libFM
  Version: 1.4.4
  Author:  Steffen Rendle, srendle@libfm.org
  WWW:     http://www.libfm.org/
This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt.
This is free software, and you are welcome to redistribute it under certain
conditions; for details see license.txt.
----------------------------------------------------------------------------
Loading train...	
has x = 0
has xt = 1
num_rows=750156	num_values=1500312	num_features=9716	min_target=1	max_target=5
Loading test... 	
has x = 0
has xt = 1
num_rows=250053	num_values=500072	num_features=9716	min_target=0	max_target=0
#relations: 0
Loading meta data...	
#Iter=  0	Train=1	Test=0	Test(ll)=0.799919
#Iter=  1	Train=1	Test=0	Test(ll)=0.89026
#Iter=  2	Train=1	Test=0	Test(ll)=0.962494
#Iter=  3	Train=1	Test=0	Test(ll)=1.02284
#Iter=  4	Train=1	Test=0	Test(ll)=1.07468
#Iter=  5	Train=1	Test=0	Test(ll)=1.12
#Iter=  6	Train=1	Test=0	Test

# Apply to Talkingdata (original data)

In [None]:
import pandas as pd
import gc

input_dir = '/home/kai/data/shiyi/Kaggle/talkingdata/sc/TalkingData_4th_solution/input'

dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }
#nrows=1000000
nrows=None
train_df = pd.read_csv(input_dir+"/train.csv", dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed'], nrows=nrows)
# test_df = pd.read_csv(input_dir+"/test_supplement.csv", dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'], nrows=nrows)

In [14]:
train_df.shape, test_df.shape

((184903890, 7), (57537505, 7))

In [None]:
#train = train_df.head(100000).copy()

In [15]:
val_size = 5000000
train = train_df.iloc[:-val_size].copy()
val = train_df.iloc[-val_size:].copy()
train.shape, val.shape

((179903890, 7), (5000000, 7))

In [16]:
pos = train[train.is_attributed == 1]

In [17]:
pos.shape

(447579, 7)

In [18]:
neg = train[train.is_attributed == 0]

In [19]:
neg.shape

(179456311, 7)

In [20]:
neg_sample = neg.sample(n=len(pos))

In [21]:
neg_sample.shape

(447579, 7)

In [22]:
train_balanced = pd.concat([pos, neg_sample])

In [23]:
train_balanced.shape

(895158, 7)

In [17]:
trainY = train_balanced.is_attributed.copy().astype('int8')
trainY[trainY==0] = -1 # relabel negatives from 0 to -1 so that fastfm can train

testY = val.is_attributed.copy().astype('int8')
testY[testY==0] = -1 # same relabelling for the validation targets

In [39]:
train_balanced.drop(['is_attributed','ip','click_time'], axis=1, inplace=True)
val.drop(['is_attributed','ip','click_time'], axis=1, inplace=True)

In [40]:
trainX = train_balanced
testX = val

In [41]:
trainX.shape, trainY.shape, testX.shape, testY.shape

((895158, 4), (895158,), (5000000, 4), (5000000,))

In [22]:
log_preds = fitpredict_logistic(trainX, trainY, testX)

In [23]:
roc_auc_score(testY, log_preds)

0.97475876740852552

In [24]:
libfm_preds = fitpredict_libfm(trainX, trainY, testX)

In [25]:
roc_auc_score(testY, libfm_preds)

0.97406689558211368

In [26]:
fastfm_preds = fitpredict_fastfm(trainX, trainY, testX)

In [27]:
roc_auc_score(testY, fastfm_preds)

0.97702264848048137

In [42]:
talkingdata_res = test_on_dataset(trainX, testX, trainY, testY, task_name='talking', classification=True, use_pylibfm=False)

100%|██████████| 3/3 [09:38<00:00, 192.79s/it]


In [43]:
talkingdata_res

Unnamed: 0,time,ROC AUC
logistic,11.115302,0.974666
libFM,324.737748,0.973991
fastFM,239.89073,0.976752


In [46]:
trainX.columns

Index(['app', 'device', 'os', 'channel'], dtype='object')

In [57]:
del train_df, test_df, trainX, testX; gc.collect()

304

# Apply to Talkingdata (feature engineered data)

In [2]:
import pandas as pd
import gc
import numpy as np

In [3]:
train = pd.read_feather('../Feathers/ss_alltrain_typechanged.ftr')
print('loaded train')
train.set_index('index', inplace=True)
# Clear the label inherited from the 'index' column. Assigning None works on
# every pandas version, whereas `del train.index.name` raises AttributeError
# on pandas >= 1.0.
train.index.name = None

train.drop(['matrixFact_user_ip_item_appdeviceos','matrixFact_user_ipchannel_item_appdeviceos','matrixFact_user_iposdeviceapp_item_app'], axis=1, inplace=True)

loaded train


In [4]:
embedding_features = ['app','channel','device','os','hour']

numeric_features = list(set(set(train.columns) - set(embedding_features)) - set(['is_attributed']))
print(numeric_features, len(numeric_features))

['app_device_os_mean', 'attributed_timediff', 'ip_device_os_count', 'ip_app_device_countfromfuture', 'ip_device_os_countfrompast', 'ip_app_device_countfrompast', 'ip_device_os_time2nextclick', 'ip_app_device_os_time2nextclick', 'ip_device_os_mean', 'ip_app_device_os_time2previousclick', 'ip_app_device_lasttimediff', 'ip_app_device_os_countfromfuture', 'ip_app_device_time2nextclick', 'ip_app_os_day_hour_count', 'ip_device_os_countfromfuture', 'app_day_hour_count', 'ip_day_hour_count', 'ip_app_device_mean', 'ip_app_device_os_countfrompast', 'ip_device_os_lasttimediff', 'ip_app_device_os_firsttimediff', 'ip_device_os_time2previousclick', 'ip_app_device_os_count', 'ip_device_os_firsttimediff', 'ip_app_day_hour_count', 'ip_app_device_time2previousclick', 'ip_app_device_os_lasttimediff', 'ip_os_day_hour_count', 'ip_app_device_firsttimediff', 'ip_app_device_os_mean'] 30


In [None]:
# test = pd.read_feather('Feathers/ss_test_via_alltrain_typechanged.ftr')
# print('loaded test')
# test.set_index('index', inplace=True)
# del test.index.name

# test.drop(['matrixFact_user_ip_item_appdeviceos','matrixFact_user_ipchannel_item_appdeviceos','matrixFact_user_iposdeviceapp_item_app'], axis=1, inplace=True)

In [5]:
from tqdm import tqdm
bins = np.linspace(2,98,49)
print(bins)

[  2.   4.   6.   8.  10.  12.  14.  16.  18.  20.  22.  24.  26.  28.  30.
  32.  34.  36.  38.  40.  42.  44.  46.  48.  50.  52.  54.  56.  58.  60.
  62.  64.  66.  68.  70.  72.  74.  76.  78.  80.  82.  84.  86.  88.  90.
  92.  94.  96.  98.]


In [6]:
# save to feather file if this takes very long
for column in tqdm(numeric_features):
    percentiles = np.percentile(train[column], bins)
    train[column] = np.searchsorted(percentiles, train[column]).astype('uint8')
    #test[column]  = np.searchsorted(percentiles, test[column]).astype('uint8')

100%|██████████| 30/30 [07:37<00:00, 15.24s/it]


In [7]:
val_size = 5000000
val = train[-val_size:].copy()

In [9]:
# Positives among the training rows only (the last val_size rows are the
# validation set). The mask is computed on the sliced frame itself; a mask
# built from the full frame has a different length and makes pandas warn that
# the boolean key "will be reindexed". Using a callable avoids keeping a second
# name bound to the large slice.
pos = train[:-val_size].loc[lambda part: part.is_attributed == 1].copy()
print(pos.shape)

  """Entry point for launching an IPython kernel.


(447579, 36)


In [10]:
# no need to sample from all_neg because 99.75% are negative in train
# all_neg = train[train.is_attributed == 0]
# print(all_neg.shape)
# Sample only from the rows before the validation tail: drawing from all of
# `train` would put some of the last val_size rows (the validation set) into
# the training data.
neg = train[:-val_size].sample(n=len(pos)).copy()
print(neg.shape)

(447579, 36)


In [11]:
train_balanced = pd.concat([pos, neg])
print(train_balanced.shape)

(895158, 36)


In [13]:
del train; gc.collect()

825

In [None]:
trainY = train_balanced.is_attributed.copy().astype('int8')
trainY[trainY==0] = -1 # relabel negatives from 0 to -1 so that fastfm can train

In [17]:
valY = val.is_attributed.copy().astype('int8')
valY[valY==0] = -1

In [19]:
train_balanced.drop(['is_attributed'], axis=1, inplace=True)
val.drop(['is_attributed'], axis=1, inplace=True)

In [20]:
trainX = train_balanced
valX = val
trainX.shape, trainY.shape, valX.shape, valY.shape

((895158, 35), (895158,), (5000000, 35), (5000000,))

In [21]:
log_preds = fitpredict_logistic(trainX, trainY, valX)

In [22]:
roc_auc_score(valY, log_preds)

0.98655242515871389

In [23]:
talkingdata_res2 = test_on_dataset(trainX, valX, trainY, valY, task_name='talking', classification=True, use_pylibfm=False)

100%|██████████| 3/3 [53:34<00:00, 1071.44s/it]


In [24]:
talkingdata_res2

Unnamed: 0,time,ROC AUC
logistic,107.87693,0.986552
libFM,1001.872105,0.985458
fastFM,2099.812043,0.984177
