In [22]:
import datetime
import os
from collections import defaultdict
from os import walk

import lightgbm as lgb
import numpy as np
import pandas as pd
import pyarrow.parquet as parquet

# Used to train document embeddings
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Used to train the baseline model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

# Where the downloaded data are.
# Can be overridden with the MLBOOTCAMP_INPUT_PATH environment variable; the fallback
# is the original hard-coded location, so existing runs behave exactly as before.
input_path = os.environ.get(
    'MLBOOTCAMP_INPUT_PATH',
    '/Users/tyamgin/Projects/mlbootcamp/championship19/data')
# Where to store results (override with MLBOOTCAMP_OUTPUT_PATH).
output_path = os.environ.get(
    'MLBOOTCAMP_OUTPUT_PATH',
    '/Users/tyamgin/Projects/mlbootcamp/championship19/res')

In [126]:
# Execute the companion notebook cv.ipynb.
# MyModel and cross_validation are used below but are not defined in this notebook —
# presumably they are provided by cv.ipynb (TODO confirm).
%run cv.ipynb

0


In [3]:
def create_text_features(texts, doc2vec=None):
    """Build per-document features from a frame of raw and preprocessed texts.

    Parameters
    ----------
    texts : pandas.DataFrame
        Must provide the columns ``objectId``, ``lang``, ``text`` and
        ``preprocessed``. ``len`` is applied to every ``text`` and
        ``preprocessed`` value, and ``str.count`` to every ``text`` value.
    doc2vec : object, optional
        Anything exposing ``infer_vector``; when given, it is applied to each
        ``preprocessed`` value and the result is stored in an ``embedding``
        column.

    Returns
    -------
    pandas.DataFrame
        Columns ``objectId``, ``lang``, ``len``, ``p_len``, ``q_count``,
        ``links_count`` and, when ``doc2vec`` is supplied, ``embedding``.
    """
    result = pd.DataFrame({
        'objectId': texts.objectId,
        'lang': texts.lang,
        'len': texts.text.apply(len),
        'p_len': texts.preprocessed.apply(len),
        'q_count': texts.text.apply(lambda s: s.count('?')),
        'links_count': texts.text.apply(lambda s: s.count('http')),
    })
    # Compare against None explicitly: a model object may define __len__/__bool__,
    # and a "falsy" model must not silently disable the embedding column.
    if doc2vec is not None:
        result['embedding'] = texts.preprocessed.apply(doc2vec.infer_vector)
    return result

In [4]:
# Load the raw texts for both splits (test and train) from the parquet datasets
test_texts = parquet.read_table(input_path + '/texts/textsTest/').to_pandas()
train_texts = parquet.read_table(input_path + '/texts/textsTrain').to_pandas()
# Preview the first rows of the train texts
train_texts.head()

  labels, = index.labels


Unnamed: 0,objectId,lang,text,preprocessed
0,11181946,ru,"Питкерния\r\n\r\nОчень интересное растение, пр...","[питкерн, очен, интересн, растен, произраста, ..."
1,12040268,Unknown,"Яхты, олигархи, проститутки: секс-охотница раз...","[яхт, олигарх, проститутк, секс, охотниц, разо..."
2,14050867,ru,"Кто-то гибнет в бою, подрывая себя гранатой, а...","[гибнет, бо, подрыв, гранат, ког, ведут, бо, в..."
3,17023591,ru,Отношения: когда происходит выбор? Святослав Р...,"[отношен, происход, выбор, святосла, райк, чит..."
4,18389833,Unknown,ok.ru/group/51094392012955 ok.ru/giflive ok.ru...,[]


In [5]:
# Document embeddings for the texts.
# The commented-out lines below appear to be the code that trained the model (on the
# test + train `preprocessed` tokens, vector_size=15, window=5) and saved it; they are
# disabled, so this cell only reloads the saved model from output_path.
# NOTE(review): every document is tagged with the plain string 'tag'; gensim documents
# TaggedDocument tags as a list — confirm before re-training.
#doc2vec = Doc2Vec([TaggedDocument(lines,'tag') for lines in test_texts.preprocessed] + 
#                  [TaggedDocument(lines,'tag') for lines in train_texts.preprocessed],
#                vector_size=15, window=5, min_count=1, workers=8)
#doc2vec.save(output_path + '/doc2vec_all_15_5')
doc2vec = Doc2Vec.load(output_path + '/doc2vec_all_15_5')

In [6]:
# Replace the raw train texts with their derived features (including doc2vec embeddings).
# NOTE: this rebinds train_texts to a frame without the `text`/`preprocessed` columns that
# create_text_features reads, so the cell cannot be re-run without reloading the texts.
train_texts = create_text_features(train_texts, doc2vec)

In [7]:
# Replace the raw test texts with their derived features (including doc2vec embeddings).
# NOTE: this rebinds test_texts to a frame without the `text`/`preprocessed` columns that
# create_text_features reads, so the cell cannot be re-run without reloading the texts.
test_texts = create_text_features(test_texts, doc2vec)

In [8]:
# Load the train interactions
train_data = parquet.read_table(input_path + '/textsTrain').to_pandas()
# Binary target: 1.0 when "Liked" is contained in the row's feedback value, else 0.0
train_data['label'] = [1.0 if "Liked" in fb else 0.0 for fb in train_data['feedback']]
# Align the key name with the text-feature frame, then attach the text features;
# the inner join keeps only rows whose objectId has text features.
train_data = train_data.rename(columns={'instanceId_objectId': 'objectId'})
train_data = train_data.join(train_texts.set_index('objectId'), on='objectId', how='inner')

  labels, = index.labels


In [9]:
##################

In [10]:
# Load the test interactions and align the key name with the text-feature frame
test_data = (
    parquet.read_table(input_path + '/textsTest')
    .to_pandas()
    .rename(columns={'instanceId_objectId': 'objectId'})
)
# Attach the text features; the inner join keeps only rows whose objectId has them
test_data = test_data.join(test_texts.set_index('objectId'), on='objectId', how='inner')

In [139]:
def predict_to_submit(X, proba):
    """Turn per-row scores into a ranked list of objects for every user.

    Parameters
    ----------
    X : pandas.DataFrame
        Needs the columns ``instanceId_userId`` and ``objectId``; it is not
        modified.
    proba : array-like
        One score per row of ``X``.

    Returns
    -------
    pandas.Series
        Indexed by ``instanceId_userId``; each value is the list of that user's
        ``objectId`` values ordered from the highest to the lowest mean score.
    """
    keys = ['instanceId_userId', 'objectId']
    # Negated score, so that an ascending sort puts the best objects first
    weighted = X.assign(weight=-proba)
    # Repeated (user, object) rows are collapsed to their mean weight
    mean_weight = weighted[keys + ['weight']].groupby(keys).mean()
    ranked = mean_weight.sort_values(by=['instanceId_userId', 'weight']).reset_index()
    return ranked.groupby("instanceId_userId")['objectId'].apply(list)

class SimpleModel(MyModel):
    """Baseline classifier: an sklearn MLPClassifier over scalar features and embeddings."""

    def get_X(self, data):
        """Assemble the model input: six scalar columns plus one column per embedding component."""
        scalar_columns = ['objectType', 'clientType', 'len', 'p_len', 'q_count', 'links_count']
        X = pd.DataFrame({name: data[name] for name in scalar_columns})
        # Stack the per-row embedding vectors into a 2-D array, then spread it
        # over the columns emb0, emb1, ...
        embeddings = np.stack(data['embedding'])
        n_components = embeddings.shape[1]
        for component in range(n_components):
            X['emb%d' % component] = embeddings[:, component]
        return X

    def fit(self, data):
        """Fit the MLP on ``get_X(data)`` against the ``label`` column and keep it in ``self.model``."""
        features = self.get_X(data)
        target = data['label'].values
        # A LogisticRegression(random_state=123, solver='lbfgs') baseline was used here before.
        classifier = MLPClassifier(alpha=.0002, hidden_layer_sizes=(25,), max_iter=150,
                                   learning_rate_init=0.001, random_state=322)
        self.model = classifier.fit(features, target)

    def predict(self, X):
        """Return column 1 of ``predict_proba`` for every row of ``X``."""
        probabilities = self.model.predict_proba(self.get_X(X))
        return probabilities[:, 1]
    
class LgbModel(SimpleModel):
    """LightGBM model trained through ``lgb.train``.

    ``params`` is the LightGBM parameter dict. It may additionally carry a
    ``num_boost_round`` entry, which is removed from the dict and passed to
    ``lgb.train`` as the keyword argument of the same name.
    """

    def __init__(self, params):
        self.params = params

    def get_X(self, data):
        """Return ``data`` without the id columns and, if present, without ``label``."""
        # Use the `columns=` keyword: passing the axis positionally
        # (`drop(labels, 1)`) is no longer accepted by pandas 2.x.
        X = data.drop(columns=['objectId', 'instanceId_userId'])
        if 'label' in X.columns:
            X = X.drop(columns='label')
        return X

    def fit(self, data):
        """Train a booster on ``get_X(data)`` against ``data['label']`` and store it in ``self.model``."""
        lgb_train = lgb.Dataset(self.get_X(data), data['label'].values)
        # Printed unconditionally (a verbosity guard was left commented out).
        print('Starting train: %s' % datetime.datetime.now())
        # Work on a copy so the caller's params dict is never mutated.
        params = self.params.copy()
        # Fall back to 100 rounds (lgb.train's own default) instead of raising
        # KeyError when the caller did not specify the number of rounds.
        num_boost_round = params.pop('num_boost_round', 100)
        self.model = lgb.train(
            params,
            lgb_train,
            num_boost_round=num_boost_round
        )

    def predict(self, X):
        """Return the booster's predictions for ``get_X(X)``."""
        return self.model.predict(self.get_X(X))


In [103]:
def feat(x):
    """Return ``x.embedding`` reduced with ``np.add`` and divided by ``x.shape[0]``."""
    row_count = x.shape[0]
    total = np.add.reduce(x.embedding)
    return total / row_count

def create_features(data):
    """Build the model-ready feature frame from the joined interaction/text data.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``instanceId_userId``, ``objectId``,
        ``instanceId_objectType``, ``audit_clientType``, ``len``, ``p_len``,
        ``q_count``, ``links_count`` and ``embedding`` (one equal-length vector
        per row). A ``label`` column is optional.

    Returns
    -------
    pandas.DataFrame
        The id columns, integer-coded ``objectType`` and ``clientType``, the
        four text statistics, one ``emb<j>`` column per embedding component
        and, when ``data`` has one, the ``label`` column.
    """
    # Known categories get fixed codes; every other value falls into bucket 2.
    # TODO: https://stackoverflow.com/questions/20250771/remap-values-in-pandas-column-with-a-dict
    object_type_codes = {'Post': 0, 'Video': 1}
    client_type_codes = {'WEB': 0, 'API': 1}
    other_code = 2

    res = pd.DataFrame({
        'instanceId_userId': data['instanceId_userId'],
        'objectId': data['objectId'],
        'objectType': data['instanceId_objectType'].apply(
            lambda value: object_type_codes.get(value, other_code)),
        'clientType': data['audit_clientType'].apply(
            lambda value: client_type_codes.get(value, other_code)),
        'len': data['len'],
        'p_len': data['p_len'],
        'q_count': data['q_count'],
        'links_count': data['links_count'],
    })
    # An unfinished per-user aggregation (`groupby(...).apply(lambda y:)`) used to sit
    # here; it was a syntax error and its result was discarded, so it is removed.

    # Spread the stacked embedding matrix over the columns emb0, emb1, ...
    emb = np.stack(data['embedding'])
    for j in range(emb.shape[1]):
        res['emb%d' % j] = emb[:, j]
    if 'label' in data.columns:
        # No trailing comma: the original assigned a 1-tuple wrapping the Series.
        res['label'] = data['label']
    return res

In [62]:
# Build the feature frame (ids, coded categories, text stats, emb* columns, label) for training
train = create_features(train_data)

In [45]:
# Save the train feature frame as a pickle under output_path
train.to_pickle(output_path + '/train.pkl')
# Preview the first 20 rows
train.head(20) # TODO: reduce duplicates

Unnamed: 0,objectType,clientType,emb0,emb1,emb2,emb3,emb4,emb5,emb6,emb7,emb8,emb9,emb10,emb11,emb12,emb13,emb14
0,0,0,0.078651,-0.003994,0.059786,0.164362,0.005707,-0.103447,0.130611,-0.0922,0.320624,-0.163396,0.135197,-0.000836,-0.047103,0.057426,0.211267
1,0,1,-0.114796,0.344675,-0.161069,0.974328,-0.445904,-0.514079,1.138814,-0.121215,0.332377,-0.129256,1.577412,-0.398271,-1.567348,-0.52069,-0.076258
1070,0,2,-0.114796,0.344675,-0.161069,0.974328,-0.445904,-0.514079,1.138814,-0.121215,0.332377,-0.129256,1.577412,-0.398271,-1.567348,-0.52069,-0.076258
1759,0,0,-0.114796,0.344675,-0.161069,0.974328,-0.445904,-0.514079,1.138814,-0.121215,0.332377,-0.129256,1.577412,-0.398271,-1.567348,-0.52069,-0.076258
4836,0,1,-0.114796,0.344675,-0.161069,0.974328,-0.445904,-0.514079,1.138814,-0.121215,0.332377,-0.129256,1.577412,-0.398271,-1.567348,-0.52069,-0.076258
6826,0,1,-0.114796,0.344675,-0.161069,0.974328,-0.445904,-0.514079,1.138814,-0.121215,0.332377,-0.129256,1.577412,-0.398271,-1.567348,-0.52069,-0.076258
8062,0,0,-0.114796,0.344675,-0.161069,0.974328,-0.445904,-0.514079,1.138814,-0.121215,0.332377,-0.129256,1.577412,-0.398271,-1.567348,-0.52069,-0.076258
12652,0,1,-0.114796,0.344675,-0.161069,0.974328,-0.445904,-0.514079,1.138814,-0.121215,0.332377,-0.129256,1.577412,-0.398271,-1.567348,-0.52069,-0.076258
18474,0,2,-0.114796,0.344675,-0.161069,0.974328,-0.445904,-0.514079,1.138814,-0.121215,0.332377,-0.129256,1.577412,-0.398271,-1.567348,-0.52069,-0.076258
19276,0,1,-0.114796,0.344675,-0.161069,0.974328,-0.445904,-0.514079,1.138814,-0.121215,0.332377,-0.129256,1.577412,-0.398271,-1.567348,-0.52069,-0.076258


In [None]:
%%time
for learning_rate in (0.10,):
    for feature_fraction in (0.9,):
        for num_boost_round in (400,):
            print((learning_rate,feature_fraction,num_boost_round))
            cross_validation(LgbModel({
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'binary_logloss',
                'num_leaves': 15,
                'learning_rate': learning_rate,
                'feature_fraction': feature_fraction,
                'bagging_fraction': 0.8,
                'bagging_freq': 5,
                'num_boost_round': num_boost_round,
                'verbose': 0
            }), train, n_iters=1, verbose=2)

(0.1, 0.9, 400)
KFold(n_splits=5, random_state=2707, shuffle=True)
Prepare data: 2019-02-22 12:54:54.219526
Fit: 2019-02-22 12:55:12.573975
Starting train: 2019-02-22 12:55:21.332816
Predict: 2019-02-22 12:59:07.596016
Auc: 2019-02-22 12:59:27.120225
 0 - 1 : 0.6183, mean=0.6183
Prepare data: 2019-02-22 12:59:28.194641
Fit: 2019-02-22 12:59:51.955540
Starting train: 2019-02-22 12:59:58.660838
Predict: 2019-02-22 13:03:43.716771
Auc: 2019-02-22 13:04:00.851363
 0 - 2 : 0.6195, mean=0.6189
Prepare data: 2019-02-22 13:04:01.894542
Fit: 2019-02-22 13:04:26.172511
Starting train: 2019-02-22 13:04:32.917192
Predict: 2019-02-22 13:08:19.678872
Auc: 2019-02-22 13:08:39.566969
 0 - 3 : 0.6193, mean=0.6190
Prepare data: 2019-02-22 13:08:40.680622


In [107]:
# Build the same feature frame for the test interactions (test_data gets no label column in this notebook)
test = create_features(test_data)

In [108]:
# Train the final model on the full train features, then score the test set.
# The model used to exist only as leftover kernel state: its creation was commented
# out (and `LgbModel()` without params cannot be constructed), so this cell failed on
# a fresh kernel. The hyper-parameters mirror the cross-validation cell of this notebook.
model = LgbModel({
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 15,
    'learning_rate': 0.10,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'num_boost_round': 400,
    'verbose': 0
})
model.fit(train)
pred = model.predict(test)

In [109]:
# Rank the objects of every user by predicted score, then write the ranking
# as a gzip-compressed CSV without a header row
submit = predict_to_submit(test, pred)
submit.to_csv(output_path + "/textSubmit1.csv.gz", header = False, compression='gzip')

In [116]:
# How many users received a ranked list of each length
length_counts = defaultdict(int)
for recommended in submit:
    length_counts[len(recommended)] += 1
lens_stat = pd.DataFrame(list(length_counts.items()), columns=['len', 'count'])
# Share of users per list length
lens_stat['relative_count'] = lens_stat['count'] / lens_stat['count'].sum()
lens_stat.head()

Unnamed: 0,len,count,relative_count
0,2,66422,0.318806
1,3,43664,0.209574
2,4,27742,0.133154
3,8,6431,0.030867
4,16,913,0.004382


In [307]:
# Preview the ranked object lists of the first users
submit.head()

instanceId_userId
316                                 [37758420, 17997084]
631                       [38118098, 30513650, 15478935]
742             [28816291, 34685448, 24302446, 10672856]
868    [35655697, 30143153, 11640701, 29650308, 30882...
979                                  [37950972, 7996257]
Name: objectId, dtype: object

In [29]:
# Quick cross-validation on a subsample: rows whose instanceId_userId is below 100000.
# NOTE(review): this passes the LgbModel class itself, whereas the grid-search cell passes
# an instance built with params (and LgbModel.__init__ requires params) — this looks like
# a stale cell from an earlier version of the code; confirm before re-running.
st = train[train.instanceId_userId < 100000]
cross_validation(LgbModel, st, n_iters=1)

KFold(n_splits=5, random_state=2707, shuffle=True)
[0.09032184 0.05449515 0.05449515 0.05449515 0.11524286]
 0 - 1 : 0.6101, mean=0.6101
[0.08773912 0.08885853 0.08787986 0.08885853 0.08787986]
 0 - 2 : 0.6234, mean=0.6167
[0.18378015 0.13601413 0.19162637 0.20648389 0.19926677]
 0 - 3 : 0.6342, mean=0.6226
[0.09090399 0.08185244 0.10167038 0.07653604 0.10629639]
 0 - 4 : 0.6266, mean=0.6236
[0.1445142  0.15797199 0.05879618 0.12857328 0.06304604]
 0 - 5 : 0.6053, mean=0.6199


In [89]:
# Drop a previous `q_group` if one is still around. Popping it from globals() is a
# no-op on a fresh kernel, where the original bare `del q_group` raised NameError
# because nothing in this notebook defines that name.
globals().pop('q_group', None)
# Per-user mean of the label and number of train rows
q = train[['instanceId_userId', 'label']].groupby('instanceId_userId').agg({
    'label': ['mean', 'count']
})

In [98]:
# Number of distinct users that appear in both train and test
np.intersect1d(train.instanceId_userId.unique(), test.instanceId_userId.unique()).shape

(198778,)

In [99]:
# Number of distinct users in train
train.instanceId_userId.unique().shape

(3902235,)

In [100]:
# Number of distinct users in test
test.instanceId_userId.unique().shape

(208346,)

In [113]:
# Mean of the emb1 embedding component over all train rows
train.emb1.mean()

-0.09304651

In [114]:
# Preview the per-user label statistics (mean and count)
q.head()

Unnamed: 0_level_0,label,label
Unnamed: 0_level_1,mean,count
instanceId_userId,Unnamed: 1_level_2,Unnamed: 2_level_2
1,0.0,3
7,0.666667,3
13,0.0,7
16,0.0,1
19,0.0,1


In [121]:
# For each per-user row count, how many users have exactly that many train rows
q['label'].groupby('count').count()

Unnamed: 0_level_0,mean
count,Unnamed: 1_level_1
1,1225275
2,733083
3,417323
4,316224
5,234173
6,170371
7,125466
8,96936
9,80407
10,65860
