In [1]:
# Runs the companion notebook cv.ipynb. Names used below without a visible definition
# (pd, np, lgb, datetime, MyModel, MLPClassifier, cross_validation, output_path, ...)
# presumably come from it — TODO confirm.
%run cv.ipynb

0


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Load the pickled frame 'train_text_features3' from output_path; it is joined to the
# training rows on 'objectId' in the next cell.
train_texts_features = pd.read_pickle(output_path + '/train_text_features3')

In [85]:
# Load the training rows and attach the text features by objectId.
# The join is inner, so rows whose objectId has no text features are dropped.
train_data = (
    pd.read_pickle(output_path + '/train_data')
    .join(train_texts_features.set_index('objectId'), how='inner', on='objectId')
)

In [86]:
def predict_to_submit(X, proba):
    """Turn per-row scores into one ranked list of objects per user.

    Parameters
    ----------
    X : DataFrame with at least 'instanceId_userId' and 'objectId' columns.
    proba : array-like of scores, one per row of ``X``.

    Returns
    -------
    Series indexed by 'instanceId_userId' whose values are lists of objectId,
    ordered from the highest mean score to the lowest.
    """
    key_cols = ['instanceId_userId', 'objectId']
    # Negate the score so that an ascending sort puts the best objects first.
    weighted = X.assign(weight=-proba)
    # Average the scores of rows that share the same (user, object) pair.
    mean_weight = weighted[key_cols + ['weight']].groupby(key_cols).mean()
    ranked = mean_weight.sort_values(by=['instanceId_userId', 'weight']).reset_index()
    # Collect the ordered objects of each user into a list.
    return ranked.groupby("instanceId_userId")['objectId'].apply(list)

class SimpleModel(MyModel):
    """Baseline model: an MLPClassifier trained on the feature columns of a frame."""

    def get_X(self, data):
        """Return the feature matrix for ``data``.

        The identifier columns 'objectId' and 'instanceId_userId' are always removed
        (KeyError if either is missing); 'label' and 'emb_cosine_dist' are removed
        when present. ``data`` itself is not modified.
        """
        # Keyword `columns=` instead of a positional axis: pandas 2.0 no longer
        # accepts `drop(labels, 1)`.
        X = data.drop(columns=['objectId', 'instanceId_userId'])
        return X.drop(columns=['label', 'emb_cosine_dist'], errors='ignore')

    def fit(self, data):
        """Fit the classifier on ``data``; the target is the 'label' column."""
        X = self.get_X(data)
        y = data['label'].values
        # Earlier baseline, kept for reference:
        # LogisticRegression(random_state=123, solver='lbfgs')
        self.model = MLPClassifier(alpha=.0002, hidden_layer_sizes=(25,), max_iter=150,
                                   learning_rate_init=0.001, random_state=322).fit(X, y)

    def predict(self, X):
        """Return the second column of ``predict_proba`` for every row of ``X``."""
        proba_result = self.model.predict_proba(self.get_X(X))
        return proba_result[:, 1]
    
class LgbModel(SimpleModel):
    """LightGBM model that reuses SimpleModel's feature selection.

    ``params`` is the LightGBM parameter dict plus a mandatory 'num_boost_round'
    entry, which is split off and passed to ``lgb.train`` separately.
    """

    def __init__(self, params):
        self.params = params

    def fit(self, data):
        """Train a booster on ``data``; the target is the 'label' column."""
        features = self.get_X(data)
        target = data['label'].values
        lgb_train = lgb.Dataset(features, target)

        print('Starting train: %s' % datetime.datetime.now())

        # Work on a copy so the caller's dict keeps its 'num_boost_round' entry.
        train_params = self.params.copy()
        num_boost_round = train_params.pop('num_boost_round')
        # The metric is always forced to AUC. 'objective' is not set here, so it is
        # whatever self.params carries (or the library default when absent).
        train_params['metric'] = 'auc'
        self.model = lgb.train(train_params, lgb_train, num_boost_round=num_boost_round)

    def predict(self, X):
        """Return the booster's raw predictions for the rows of ``X``."""
        return self.model.predict(self.get_X(X))


In [87]:
def feat(x):
    """Return the element-wise mean of the 'embedding' values over the rows of ``x``."""
    embedding_sum = np.add.reduce(x.embedding.values)
    return embedding_sum / x.shape[0]

def create_features(data):
    """Build the model feature frame from a joined data/text-features frame.

    Parameters
    ----------
    data : DataFrame holding the id columns, the text statistics read below and an
        'embedding' column with one equally sized vector per row. A 'label' column
        is optional.

    Returns
    -------
    DataFrame with the same index as ``data``: id columns, scalar features, one
    'emb<j>' column per embedding dimension and, when available, 'label' last.
    """
    # Features tried and currently disabled: plen_per_len (p_len / len), is_ru (lang == 'ru').
    # NOTE(review): rows with len == 0 produce inf/NaN rates — confirm that upstream
    # guarantees len > 0.
    res = pd.DataFrame({
        'instanceId_userId': data['instanceId_userId'],
        'objectId': data['objectId'],
        'object_type': data['instanceId_objectType'],
        'client_type': data['audit_clientType'],
        'len': data['len'],
        'p_len': data['p_len'],
        'q_count': data['q_count'],
        'links_count': data['links_count'],
        'emojis_rate': data['emojis_count'] / data['len'],
        'upper_rate': data['upper_count'] / data['len'],
        # Counts are capped at 1, i.e. reduced to presence flags.
        'ok_videos_count': data['ok_videos_count'].clip(upper=1),
        'ok_groups_count': data['ok_groups_count'].clip(upper=1),
        'youtube_count': data['youtube_count'].clip(upper=1),
        'is_adv': data['is_adv'],
        'is_recipe': data['is_recipe'],
    })

    # Expand the embedding vectors into one column per dimension. All columns are
    # attached with a single concat instead of one insertion per dimension, which
    # fragments the frame. The explicit index keeps the rows aligned positionally
    # with `res`, exactly like assigning raw arrays would.
    emb = np.stack(data['embedding'].values)
    emb_columns = ['emb%d' % j for j in range(emb.shape[1])]
    emb_frame = pd.DataFrame(emb, index=res.index, columns=emb_columns)
    res = pd.concat([res, emb_frame], axis=1)

    if 'label' in data.columns:
        res['label'] = data['label']

    return res

In [88]:
%%time
# Dry run of create_features on the first 10000 rows to check it and time it;
# `train` is overwritten by the full run in the next cell.
train = create_features(train_data.head(10000))

CPU times: user 26.3 ms, sys: 78.7 ms, total: 105 ms
Wall time: 111 ms


In [89]:
%%time
# Build the features for the whole training frame.
train = create_features(train_data)
# Drop the source frame once the features exist. After this, the cell cannot be
# re-run without reloading train_data first.
del train_data

CPU times: user 32.9 s, sys: 6.82 s, total: 39.7 s
Wall time: 41.8 s


In [90]:
%%time
#################################### PARTIAL TRAIN
# Hyper-parameter sweep evaluated on the rows with instanceId_userId < 4000000.
import itertools

# One tuple of candidate values per hyper-parameter. itertools.product walks the
# combinations in the same order as nested loops would (last parameter varies fastest).
param_grid = itertools.product(
    (20,),    # min_data_in_leaf
    (0.0,),   # lambda_l2
    (0.5,),   # learning_rate
    (1,),     # feature_fraction
    (1000,),  # num_boost_round
    (24,),    # num_leaves
)
for (min_data_in_leaf, lambda_l2, learning_rate,
     feature_fraction, num_boost_round, num_leaves) in param_grid:
    print((min_data_in_leaf,lambda_l2,learning_rate,feature_fraction,num_boost_round,num_leaves))
    cross_validation(LgbModel({
        'boosting_type': 'gbdt',
        'min_data_in_leaf': min_data_in_leaf,
        'lambda_l2': lambda_l2,
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
        'feature_fraction': feature_fraction,
        'bagging_fraction': 1,
        'bagging_freq': 5,
        'num_boost_round': num_boost_round,
        'verbose': 0
    }), train[train.instanceId_userId < 4000000], n_iters=1, verbose=2)

(20, 0.0, 0.5, 1, 1000, 24)
KFold(n_splits=5, random_state=2707, shuffle=True)
Prepare data: 2019-02-24 14:02:53.759142
Fit: 2019-02-24 14:02:56.483555
Starting train: 2019-02-24 14:02:57.127578
Predict: 2019-02-24 14:04:16.175734
Auc: 2019-02-24 14:04:24.668711
 0 - 1 : 0.6415, mean=0.6415
Prepare data: 2019-02-24 14:04:24.880791
Fit: 2019-02-24 14:04:27.399534
Starting train: 2019-02-24 14:04:28.108458


KeyboardInterrupt: 

In [84]:
%%time
#################################### PARTIAL TRAIN
# Hyper-parameter sweep evaluated on the rows with instanceId_userId < 4000000.
import itertools

# One tuple of candidate values per hyper-parameter. itertools.product walks the
# combinations in the same order as nested loops would (last parameter varies fastest).
param_grid = itertools.product(
    (20,),    # min_data_in_leaf
    (0.0,),   # lambda_l2
    (0.5,),   # learning_rate
    (1,),     # feature_fraction
    (1000,),  # num_boost_round
    (24,),    # num_leaves
)
for (min_data_in_leaf, lambda_l2, learning_rate,
     feature_fraction, num_boost_round, num_leaves) in param_grid:
    print((min_data_in_leaf,lambda_l2,learning_rate,feature_fraction,num_boost_round,num_leaves))
    cross_validation(LgbModel({
        'boosting_type': 'gbdt',
        'min_data_in_leaf': min_data_in_leaf,
        'lambda_l2': lambda_l2,
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
        'feature_fraction': feature_fraction,
        'bagging_fraction': 1,
        'bagging_freq': 5,
        'num_boost_round': num_boost_round,
        'verbose': 0
    }), train[train.instanceId_userId < 4000000], n_iters=1, verbose=2)

(20, 0.0, 0.5, 1, 1000, 24)
KFold(n_splits=5, random_state=2707, shuffle=True)
Prepare data: 2019-02-24 13:47:54.297978
Fit: 2019-02-24 13:47:57.158977
Starting train: 2019-02-24 13:47:58.030261
Predict: 2019-02-24 13:49:28.884893
Auc: 2019-02-24 13:49:37.500645
 0 - 1 : 0.6442, mean=0.6442
Prepare data: 2019-02-24 13:49:37.721344
Fit: 2019-02-24 13:49:40.630414
Starting train: 2019-02-24 13:49:41.519071
Predict: 2019-02-24 13:51:12.855549
Auc: 2019-02-24 13:51:20.578160
 0 - 2 : 0.6442, mean=0.6442
Prepare data: 2019-02-24 13:51:20.765002
Fit: 2019-02-24 13:51:24.247095
Starting train: 2019-02-24 13:51:25.070531
Predict: 2019-02-24 13:52:57.522498
Auc: 2019-02-24 13:53:05.454003
 0 - 3 : 0.6460, mean=0.6448
Prepare data: 2019-02-24 13:53:05.668596
Fit: 2019-02-24 13:53:08.874558
Starting train: 2019-02-24 13:53:09.680502
Predict: 2019-02-24 13:54:53.523040
Auc: 2019-02-24 13:55:01.769279
 0 - 4 : 0.6432, mean=0.6444
Prepare data: 2019-02-24 13:55:01.983637
Fit: 2019-02-24 13:55:05.207

In [8]:
%%time
# Hyper-parameter sweep evaluated on the complete `train` frame.
import itertools

# One tuple of candidate values per hyper-parameter. itertools.product walks the
# combinations in the same order as nested loops would (last parameter varies fastest).
param_grid = itertools.product(
    (20,),    # min_data_in_leaf
    (0.0,),   # lambda_l2
    (0.5,),   # learning_rate
    (1,),     # feature_fraction
    (1000,),  # num_boost_round
    (24,),    # num_leaves
)
for (min_data_in_leaf, lambda_l2, learning_rate,
     feature_fraction, num_boost_round, num_leaves) in param_grid:
    print((min_data_in_leaf,lambda_l2,learning_rate,feature_fraction,num_boost_round,num_leaves))
    cross_validation(LgbModel({
        'boosting_type': 'gbdt',
        'min_data_in_leaf': min_data_in_leaf,
        'lambda_l2': lambda_l2,
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
        'feature_fraction': feature_fraction,
        'bagging_fraction': 1,
        'bagging_freq': 5,
        'num_boost_round': num_boost_round,
        'verbose': 0
    }), train, n_iters=1, verbose=2)

(20, 0.0, 0.5, 1, 1000, 24)
KFold(n_splits=5, random_state=2707, shuffle=True)
Prepare data: 2019-02-24 12:05:41.973077
Fit: 2019-02-24 12:06:00.707723
Starting train: 2019-02-24 12:06:09.171211
Predict: 2019-02-24 12:14:52.756500
Auc: 2019-02-24 12:15:26.120855
 0 - 1 : 0.6433, mean=0.6433
Prepare data: 2019-02-24 12:15:27.166216


KeyboardInterrupt: 

In [11]:
# Load the pickled frame 'test_text_features3' from output_path; it is joined to the
# test rows on 'objectId' in the next cell.
test_texts_features = pd.read_pickle(output_path + '/test_text_features3')

In [12]:
# Load the test rows and attach the text features by objectId.
# The join is inner, so rows whose objectId has no text features are dropped.
test_data = (
    pd.read_pickle(output_path + '/test_data')
    .join(test_texts_features.set_index('objectId'), how='inner', on='objectId')
)

  labels, = index.labels


In [13]:
# Build the feature frame for the test rows with the same function used for training.
test = create_features(test_data)

In [15]:
# Final model: the same hyper-parameter values as the single point of the sweep cells.
model = LgbModel({
    'boosting_type': 'gbdt',
    'min_data_in_leaf': 20,
    'lambda_l2': 0.0,
    'num_leaves': 24,
    'learning_rate': 0.5,
    'feature_fraction': 1,
    'bagging_fraction': 1,
    'bagging_freq': 5,
    'num_boost_round': 1000,
    'verbose': 0
})
# Train on the whole training feature frame, then score every test row.
model.fit(train)
pred = model.predict(test)

Starting train: 2019-02-23 22:09:04.655762


In [16]:
# Rank the test objects per user by predicted score.
submit = predict_to_submit(test, pred)
# Write the ranked lists as a gzip-compressed CSV without a header row.
submit.to_csv(output_path + "/textSubmit1.csv.gz", header = False, compression='gzip')

In [17]:
# List the feature columns of the test frame.
# NOTE(review): the saved output shows 'objectType'/'clientType', whereas
# create_features names these columns 'object_type'/'client_type' — the output looks
# stale; re-run to confirm.
test.columns

Index(['instanceId_userId', 'objectId', 'objectType', 'clientType', 'len',
       'p_len', 'q_count', 'links_count', 'emojis_rate', 'upper_rate',
       'ok_videos_count', 'ok_groups_count', 'youtube_count', 'is_adv',
       'is_recipe', 'emb0', 'emb1', 'emb2', 'emb3', 'emb4', 'emb5', 'emb6',
       'emb7', 'emb8', 'emb9', 'emb10', 'emb11', 'emb12', 'emb13', 'emb14',
       'emb15', 'emb16', 'emb17', 'emb18', 'emb19', 'emb20', 'emb21', 'emb22'],
      dtype='object')

In [116]:
# Distribution of the ranked-list lengths in the submission: how many users received
# a list of each length, in absolute numbers and as a share of all users.
len_counts = defaultdict(int)
for object_ids in submit:
    len_counts[len(object_ids)] += 1
lens_stat = pd.DataFrame(list(len_counts.items()), columns=['len', 'count'])
lens_stat = lens_stat.assign(relative_count=lens_stat['count'] / lens_stat['count'].sum())
lens_stat.head()

Unnamed: 0,len,count,relative_count
0,2,66422,0.318806
1,3,43664,0.209574
2,4,27742,0.133154
3,8,6431,0.030867
4,16,913,0.004382


In [307]:
# Peek at the first ranked lists (index: instanceId_userId, value: list of objectId).
submit.head()

instanceId_userId
316                                 [37758420, 17997084]
631                       [38118098, 30513650, 15478935]
742             [28816291, 34685448, 24302446, 10672856]
868    [35655697, 30143153, 11640701, 29650308, 30882...
979                                  [37950972, 7996257]
Name: objectId, dtype: object

In [29]:
# Quick cross-validation on the rows with instanceId_userId < 100000.
st = train[train.instanceId_userId < 100000]
# NOTE(review): this passes the LgbModel class itself, while every other call passes an
# instance built from a params dict (LgbModel.__init__ requires `params`). The cell
# looks like a leftover from an earlier version — confirm against cross_validation.
cross_validation(LgbModel, st, n_iters=1)

KFold(n_splits=5, random_state=2707, shuffle=True)
[0.09032184 0.05449515 0.05449515 0.05449515 0.11524286]
 0 - 1 : 0.6101, mean=0.6101
[0.08773912 0.08885853 0.08787986 0.08885853 0.08787986]
 0 - 2 : 0.6234, mean=0.6167
[0.18378015 0.13601413 0.19162637 0.20648389 0.19926677]
 0 - 3 : 0.6342, mean=0.6226
[0.09090399 0.08185244 0.10167038 0.07653604 0.10629639]
 0 - 4 : 0.6266, mean=0.6236
[0.1445142  0.15797199 0.05879618 0.12857328 0.06304604]
 0 - 5 : 0.6053, mean=0.6199


In [89]:
# `q_group` is not defined anywhere in the visible notebook, so a bare `del q_group`
# raises NameError on a fresh kernel; release it only if it exists.
globals().pop('q_group', None)
# Per-user label statistics: mean label and number of rows.
q = train[['instanceId_userId', 'label']].groupby('instanceId_userId').agg({
    'label': ['mean', 'count']
})

In [98]:
# Number of distinct users that occur in both the train and the test frame.
np.intersect1d(train.instanceId_userId.unique(), test.instanceId_userId.unique()).shape

(198778,)

In [99]:
# Number of distinct users in the train frame.
train.instanceId_userId.unique().shape

(3902235,)

In [100]:
# Number of distinct users in the test frame.
test.instanceId_userId.unique().shape

(208346,)

In [113]:
# Spot check: mean of the second embedding dimension over the train frame.
train.emb1.mean()

-0.09304651

In [114]:
# Peek at the per-user label statistics (columns: label/mean, label/count).
q.head()

Unnamed: 0_level_0,label,label
Unnamed: 0_level_1,mean,count
instanceId_userId,Unnamed: 1_level_2,Unnamed: 2_level_2
1,0.0,3
7,0.666667,3
13,0.0,7
16,0.0,1
19,0.0,1


In [121]:
# Number of users for each per-user row count.
q['label'].groupby('count').count()

Unnamed: 0_level_0,mean
count,Unnamed: 1_level_1
1,1225275
2,733083
3,417323
4,316224
5,234173
6,170371
7,125466
8,96936
9,80407
10,65860


In [339]:
# Read the test texts from the parquet data under input_path into a pandas frame.
# `parquet` and `input_path` are not defined in the visible cells — presumably they
# come from cv.ipynb; TODO confirm.
test_texts2 = parquet.read_table(input_path + '/texts/textsTest/').to_pandas()

  labels, = index.labels


In [349]:
# Show the rows labelled 3000 through 3020 of the test texts.
test_texts2.loc[3000:3020,:]

Unnamed: 0,objectId,lang,text,preprocessed
3000,9448596,Unknown,Алёна Иванцова - Зажигай это лето Организация ...,"[алён, иванцов, зажига, эт, лет, организац, ко..."
3001,36153107,ru,Этот красавец неожиданно лишился дома и теперь...,"[красавец, неожида, лиш, дом, вынужд, жит, эт,..."
3002,1089971,ru,SJ - 93 серия (нац.версия) - субтитры,"[sj, сер, нац, верс, субтитр]"
3003,25048905,ru,\nУлыбайтесь вместе с нами :)) ok.ru/group/53...,"[улыба, вмест, нам]"
3004,34695987,ru,http://iqformat.me/zagadki/kogda-beremennoj-zh...,"[удивительн, истор, случ, семейств, хьюз, бере..."
3005,8378836,be,Лейка малая №4. Размер 140x110x115 мм. Арт. 39...,"[лейк, мал, размер, x, x, мм, арт, цен, руб]"
3006,25569976,tg,Барои хушгузарони парандахоро хадаф нагиред!!!...,"[баро, хушгузарон, парандахор, хадаф, нагиред,..."
3007,19957643,ru,"Иллюзии\nХочется уйти от проблем, \nОт несправ...","[иллюз, хочет, уйт, пробл, несправедлив, мир, ..."
3008,19681599,ru,\nСделано с любовью....,"[сдела, любов]"
3009,23448361,ru,Озеро Каракуль Озеро Каракуль – самое большое ...,"[озер, каракул, озер, каракул, сам, больш, лед..."


In [343]:
# Raw text of row 1009.
test_texts2.loc[1009,:].text

'ok.ru/group/56803476373541 ok.ru/lyualih ok.ru/group/54129162977280 ok.ru/luxwedding ok.ru/neverf ok.ru/group/52788231667949 ok.ru/group/51956903903450 ok.ru/bolitdusha ok.ru/group/51831127343260 ok.ru/samirlend'

In [346]:
# Raw text of row 1003.
test_texts2.loc[1003,:].text

'Stive Morgan - White Angel 329181235953 ok.ru/video/597182714175'

In [348]:
# Raw text of row 2019.
test_texts2.loc[2019,:].text

'ok.ru/devichnik.online ok.ru/group/54243312599040 ok.ru/kinohumor.hd ok.ru/group/54309735104525 ok.ru/ideal.woman ok.ru/group/53098685726825 ok.ru/group/52841002959083 ok.ru/group52841002959083 ok.ru/sdelaysamo ok.ru/group/52156995403844'

In [350]:
# Raw text of row 3014.
test_texts2.loc[3014,:].text

'ok.ru/gif.hd ok.ru/group/54589100982286 КАК ПРАВИЛЬНО ОБРЕЗАТЬ ЧЕРЕШНЮ\nСохраните, чтобы не потерять!\nФормирование и обрезка черешни.\n\nЧерешня обладает высокой пробудимостью почек, но побегообразовательная способность у нее слабая.Поэтому у нее ярко выражена стволовость и ярусность в размещении ветвей. Формировать черешню рекомендуется по типу ярусной кроны, которая наиболее полно соответствует особенностям этой культуры.\nНаибольшее распространение получили разреженно-ярусная, вазообразная, полуплоская и веретеновидная.\n\nДля формирования разреженно-ярусной кроны в первом ярусе закладывают три-четыре скелетные ветви, во втором две-три, а в третьем одну-две ветви. Расстояние между ярусами 0,5—0,8 м.\n\nВазообразная (чашевидная) крона состоит из 4—5 основных ветвей первого порядка (центральный проводник отсутствует). Благодаря хорошей осветленности средней части кроны плодовые образования живут 12—15 лет.\nДля интенсивных насаждений со схемой посадки 4—52—2,5 м предлагается веретен