In [226]:
import pandas as pandas
from os import walk
import pyarrow.parquet as parquet
from collections import defaultdict
import datetime

# Used to train document embeddings
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Used to train the baseline model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import numpy as numpy

# Where the downloaded data are
input_path = '/Users/tyamgin/Projects/mlbootcamp/championship19/data'
# Where to store results
output_path = '/Users/tyamgin/Projects/mlbootcamp/championship19/res/'

In [227]:
%run cv.ipynb

In [2]:
# Get the test texts
test_texts = parquet.read_table(input_path + '/texts/textsTest/',
                                columns = ['objectId','preprocessed']).to_pandas()
train_texts = parquet.read_table(input_path + '/texts/textsTrain',
                                       columns = ['objectId','preprocessed']).to_pandas()
train_texts.head()

  labels, = index.labels


Unnamed: 0,objectId,preprocessed
0,11181946,"[питкерн, очен, интересн, растен, произраста, ..."
1,12040268,"[яхт, олигарх, проститутк, секс, охотниц, разо..."
2,14050867,"[гибнет, бо, подрыв, гранат, ког, ведут, бо, в..."
3,17023591,"[отношен, происход, выбор, святосла, райк, чит..."
4,18389833,[]


In [3]:
# Build document embeddings for text documents
doc2vec = Doc2Vec([TaggedDocument(lines,'tag') for lines in test_texts.preprocessed] + 
                  [TaggedDocument(lines,'tag') for lines in train_texts.preprocessed],
                vector_size=15, window=5, min_count=1, workers=8)
#doc2vec.save(output_path + 'doc2vec_all_15_5')

In [None]:
train_texts['embedding'] = train_texts.preprocessed.apply(doc2vec.infer_vector)
train_texts = train_texts[['objectId', 'embedding']]

In [74]:
test_texts['embedding'] = test_texts.preprocessed.apply(doc2vec.infer_vector)
test_texts = test_texts[['objectId', 'embedding']]

In [244]:
train_data = parquet.read_table(input_path + '/textsTrain').to_pandas()
train_data['label'] = train_data['feedback'].apply(lambda x: 1.0 if("Liked" in x) else 0.0).values
train_data.rename(columns = {'instanceId_objectId':'objectId'}, inplace = True)
train_data = train_data.join(train_texts.set_index('objectId'), how='inner', on = 'objectId')

In [296]:
##################

In [297]:
test_data = parquet.read_table(input_path + '/textsTest').to_pandas()
test_data.rename(columns = {'instanceId_objectId':'objectId'}, inplace = True)
test_data = test_data.join(test_texts.set_index('objectId'), how='inner', on='objectId')

  labels, = index.labels


In [300]:
def predict_to_submit(X, proba):
    X = X.assign(weight=-proba)
    scores = X[['instanceId_userId', 'objectId', 'weight']] \
            .groupby(['instanceId_userId','objectId']).mean()
    result = scores.sort_values(by=['instanceId_userId', 'weight']).reset_index()
    # Collect predictions for each user
    submit = result.groupby("instanceId_userId")['objectId'].apply(list)
    return submit

class SimpleModel:
    def get_X(self, data):
        X = pd.DataFrame({
            'objectType': data['objectType'],
            'clientType': data['clientType'],
        })
        emb = numpy.stack(data['embedding'])
        for j in range(a.shape[1]):
            X['emb%d' % j] = emb[:,j]
        return X
    def fit(self, data):
        X = self.get_X(data)
        y = data['label'].values
        self.model = LogisticRegression(random_state=123, solver='lbfgs').fit(X, y)
    def predict(self, X):
        proba_result = self.model.predict_proba(self.get_X(X))
        return proba_result[:, 1]

In [302]:
def create_features(data):
    data = data.copy()
    data['objectType'] = data['instanceId_objectType'].apply(lambda x: 0 if x == 'Post' else (1 if x == 'Video' else 2))
    data['clientType'] = data['audit_clientType'].apply(lambda x: 0 if x == 'WEB' else (1 if x == 'API' else 2))
    data.drop('instanceId_objectType', 1, inplace=True)
    data.drop('audit_clientType', 1, inplace=True)
    if 'feedback' in data.columns:
        data.drop('feedback', 1, inplace=True)
    data.drop('audit_timestamp', 1, inplace=True)
    data.drop('metadata_ownerId', 1, inplace=True)
    data.drop('metadata_createdAt', 1, inplace=True)
    data.drop('date', 1, inplace=True)
    return data
    
    

In [257]:
train = create_features(train_data)

In [293]:
#SimpleModel().get_X(train.head())
train.head()

Unnamed: 0,instanceId_userId,objectId,label,embedding,objectType,clientType
0,1618,25814780,0.0,"[0.043730143, -0.08875614, 0.19550756, 0.17694...",0,0
1,2122,10027037,0.0,"[-0.2552362, 0.12500183, -0.0669179, 0.8778869...",0,1
1070,405739,10027037,0.0,"[-0.2552362, 0.12500183, -0.0669179, 0.8778869...",0,2
1759,659725,10027037,0.0,"[-0.2552362, 0.12500183, -0.0669179, 0.8778869...",0,0
4836,1748401,10027037,0.0,"[-0.2552362, 0.12500183, -0.0669179, 0.8778869...",0,1


In [295]:
cross_validation(SimpleModel, train)

KFold(n_splits=5, random_state=2707, shuffle=True)
Prepare data: 2019-02-19 22:56:46.315975
Fit: 2019-02-19 22:57:01.173705
Predict: 2019-02-19 22:58:35.841880
Auc: 2019-02-19 22:58:44.387365
 0 - 1 : 0.5674, mean=0.5674
Prepare data: 2019-02-19 23:02:05.585259
Fit: 2019-02-19 23:02:19.806556
Predict: 2019-02-19 23:03:41.087712
Auc: 2019-02-19 23:03:49.994992
 0 - 2 : 0.5677, mean=0.5675
Prepare data: 2019-02-19 23:07:26.520079
Fit: 2019-02-19 23:07:40.851166


KeyboardInterrupt: 

In [303]:
test = create_features(test_data)

In [305]:
model = SimpleModel()
model.fit(train)
pred = model.predict(test)

In [306]:
submit = predict_to_submit(test, pred)
submit.to_csv(output_path + "/textSubmit1.csv.gz", header = False, compression='gzip')

In [None]:
lens_stat = defaultdict(int)
for r in submit:
    lens_stat[len(r)] += 1
lens_stat = pandas.DataFrame([(k, v) for k, v in lens_stat.items()], columns=['len', 'count'])
lens_stat['relative_count'] = lens_stat['count'] / lens_stat['count'].sum()
lens_stat.head()

In [307]:
submit.head()

instanceId_userId
316                                 [37758420, 17997084]
631                       [38118098, 30513650, 15478935]
742             [28816291, 34685448, 24302446, 10672856]
868    [35655697, 30143153, 11640701, 29650308, 30882...
979                                  [37950972, 7996257]
Name: objectId, dtype: object

In [308]:
train_data

Unnamed: 0,instanceId_userId,instanceId_objectType,objectId,feedback,audit_clientType,audit_timestamp,metadata_ownerId,metadata_createdAt,date,label,embedding
0,1618,Post,25814780,"[Clicked, Ignored]",WEB,1517458217938,81088,1517454825000,2018-02-01,0.0,"[0.043730143, -0.08875614, 0.19550756, 0.17694..."
1,2122,Post,10027037,[Ignored],API,1517488844356,4016,1517423778000,2018-02-01,0.0,"[-0.2552362, 0.12500183, -0.0669179, 0.8778869..."
1070,405739,Post,10027037,[Ignored],MOB,1517511247948,4016,1517423778000,2018-02-01,0.0,"[-0.2552362, 0.12500183, -0.0669179, 0.8778869..."
1759,659725,Post,10027037,[Ignored],WEB,1517466559543,4016,1517423778000,2018-02-01,0.0,"[-0.2552362, 0.12500183, -0.0669179, 0.8778869..."
4836,1748401,Post,10027037,[Ignored],API,1517511622831,4016,1517423778000,2018-02-01,0.0,"[-0.2552362, 0.12500183, -0.0669179, 0.8778869..."
6826,2404081,Post,10027037,[Ignored],API,1517487632489,4016,1517423778000,2018-02-01,0.0,"[-0.2552362, 0.12500183, -0.0669179, 0.8778869..."
8062,2796664,Post,10027037,[Clicked],WEB,1517452044806,4016,1517423778000,2018-02-01,0.0,"[-0.2552362, 0.12500183, -0.0669179, 0.8778869..."
12652,4257952,Post,10027037,[Ignored],API,1517433315078,4016,1517423778000,2018-02-01,0.0,"[-0.2552362, 0.12500183, -0.0669179, 0.8778869..."
18474,6000712,Post,10027037,[Liked],MOB,1517459635654,4016,1517423778000,2018-02-01,1.0,"[-0.2552362, 0.12500183, -0.0669179, 0.8778869..."
19276,6265702,Post,10027037,[Clicked],API,1517509974847,4016,1517423778000,2018-02-01,0.0,"[-0.2552362, 0.12500183, -0.0669179, 0.8778869..."
