In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold

import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingClassifier

from scipy.spatial import distance

In [32]:
docs_titles = pd.read_csv('docs_titles.tsv',sep='\t')
train_groups = pd.read_csv('train_groups.csv')
train_data=pd.merge(train_groups,docs_titles,on='doc_id',how='left')

In [53]:
def transform_data(data):
    def changeOneTitle(title):
        title=str(title).lower()
        title=''.join(list(filter(lambda x: ('а'<=x and x<='я') or x==' ', title)))
        title=' '.join(filter(lambda x: len(x)>3 ,title.split()))
        return title
    data['title'] = np.asarray([changeOneTitle(title) for title in data['title']])
    return data

train_data=transform_data(train_data)

In [54]:
train_data.head(5)

Unnamed: 0,pair_id,group_id,doc_id,target,title
0,1,1,15731,0,замена подшипников ступицы нива
1,2,1,14829,0,оптом сочи сравнить цены купить потребительски...
2,3,1,15764,0,купить ступица лада калина трансмиссия переход...
3,4,1,17669,0,классика
4,5,1,14852,0,ступица нива замена подшипника своими руками


In [83]:
def get_seva_features(train_data,scaler=None):
    traingroups_titledata = {}
    for i in range(len(train_data)):
        new_doc = train_data.iloc[i]
        group_id = new_doc['group_id']
        doc_id = new_doc['doc_id']
        target = new_doc['target']
        title = new_doc['title']
        if group_id not in traingroups_titledata:
            traingroups_titledata[group_id] = []
        traingroups_titledata[group_id].append((group_id,doc_id, title, target))
    y_train = []
    X_train =pd.DataFrame(columns=['group_id','doc_id','target','title_len','sum_comW','mean_comW','std_comW','median_comW',*['comWords'+str(i+1) for i in range(15)]])
    groups_train = []
    for new_group in traingroups_titledata:
        docs = traingroups_titledata[new_group]
        for k, (group_id,doc_id, title, target_id) in enumerate(docs):
            y_train.append(target_id)
            groups_train.append(new_group)
            all_dist = []
            words = set(title.strip().split())
            for j in range(0, len(docs)):
                if k == j:
                    continue
                group_id_j,doc_id_j, title_j, target_j = docs[j]
                words_j = set(title_j.strip().split())
                all_dist.append(len(words.intersection(words_j)))
            #X_train.append(sorted(all_dist, reverse=True)[0:15])
            mas=np.array(sorted(all_dist, reverse=True)[0:15])
            if scaler is not None:
                mas=scaler.fit_transform(mas.reshape(-1,1))[:,0]
            X_train.loc[len(X_train)] = [group_id,doc_id,target_id,len(title),np.sum(mas),np.mean(mas),np.std(mas),np.median(mas),*mas]
    y_train=np.array(y_train)
    X_train=X_train.astype(int)
    return X_train,y_train,groups_train

In [91]:
def get_TfIdf_features(data,max_features=25):
    vectorizer = TfidfVectorizer(max_features=max_features)
    X=pd.DataFrame(columns=['group_id','doc_id','target','sum_tfIdf','mean_tfIdf','std_tfIdf','median_tfIdf',*['tfIdf'+str(i) for i in range(max_features)]])
    groups_train=[]
    for i, group in data.groupby('group_id'):
        for j in range(len(group)):
            groups_train.append(i)
        titles_tfidf = vectorizer.fit_transform(np.asarray(group['title'])).toarray()
        for k in range(len(titles_tfidf)):
            X.loc[len(X)] = [int(group['group_id'].values[k]),int(group['doc_id'].values[k]),int(group['target'].values[k]),np.sum(titles_tfidf[k,:]),np.mean(titles_tfidf[k,:]),np.std(titles_tfidf[k,:]),np.median(titles_tfidf[k,:]),*titles_tfidf[k,:]]
    X['group_id']=X['group_id'].astype(int)
    X['doc_id']=X['doc_id'].astype(int)
    X['target']=X['target'].astype(int)
    return X,groups_train

In [84]:
X_trainSeva,y_train,groups_train=get_seva_features(train_data)

In [85]:
X_trainSeva.head(5)

Unnamed: 0,group_id,doc_id,target,title_len,sum_comW,mean_comW,std_comW,median_comW,comWords1,comWords2,...,comWords6,comWords7,comWords8,comWords9,comWords10,comWords11,comWords12,comWords13,comWords14,comWords15
0,1,15731,0,31,30,2,0,2,4,3,...,2,2,2,2,2,1,1,1,1,1
1,1,14829,0,54,47,3,1,3,5,5,...,5,4,3,2,2,2,1,1,1,1
2,1,15764,0,76,31,2,0,2,3,2,...,2,2,2,2,2,2,2,2,2,2
3,1,17669,0,8,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,14852,0,44,26,1,0,2,3,3,...,2,2,2,1,1,1,1,1,1,1


In [56]:
def get_mean_f1score(X_train,y_train,groups_train,model=SVC,*args,**kwargs):
    group_kfold = GroupKFold(n_splits=5)
    mas=[]
    for train_index, val_index in group_kfold.split(X_train, y_train, groups_train):
        #print("TRAIN:", train_index, "TEST:", val_index)
        X_train_train, X_train_val = X_train.iloc[train_index,:], X_train.iloc[val_index,:]
        y_train_train, y_train_val = y_train[train_index], y_train[val_index]
        #print(X_train_train, X_train_test, y_train_train, y_train_test)
        #clf=XGBClassifier(*args,**kwargs) #mean f1_score=0.53
        clf=model(*args,**kwargs)
        clf.fit(X_train_train,y_train_train)
        y_pred_val=clf.predict(X_train_val)
        mas.append(f1_score(y_train_val,y_pred_val))
    return np.mean(mas)
    

In [92]:
X_trainTfIdf,groups_train=get_TfIdf_features(train_data,max_features=25)

In [93]:
X_trainTfIdf.head(5)

Unnamed: 0,group_id,doc_id,target,sum_tfIdf,mean_tfIdf,std_tfIdf,median_tfIdf,tfIdf0,tfIdf1,tfIdf2,...,tfIdf15,tfIdf16,tfIdf17,tfIdf18,tfIdf19,tfIdf20,tfIdf21,tfIdf22,tfIdf23,tfIdf24
0,1,15731,0,1.981909,0.079276,0.183617,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.427037,0.0,0.0,0.0,0.0,0.0
1,1,14829,0,2.219554,0.088782,0.179214,0.0,0.0,0.0,0.0,...,0.0,0.0,0.455972,0.0,0.0,0.48424,0.0,0.0,0.432439,0.0
2,1,15764,0,2.198619,0.087945,0.179627,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.356606,0.0,0.0,0.52908,0.0,0.0
3,1,17669,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,14852,0,2.20886,0.088354,0.179425,0.0,0.0,0.0,0.0,...,0.49529,0.49529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
X_train=pd.merge(X_trainSeva,X_trainTfIdf,on=['group_id','doc_id','target'])

In [95]:
X_train.head(5)

Unnamed: 0,group_id,doc_id,target,title_len,sum_comW,mean_comW,std_comW,median_comW,comWords1,comWords2,...,tfIdf15,tfIdf16,tfIdf17,tfIdf18,tfIdf19,tfIdf20,tfIdf21,tfIdf22,tfIdf23,tfIdf24
0,1,15731,0,31,30,2,0,2,4,3,...,0.0,0.0,0.0,0.0,0.427037,0.0,0.0,0.0,0.0,0.0
1,1,14829,0,54,47,3,1,3,5,5,...,0.0,0.0,0.455972,0.0,0.0,0.48424,0.0,0.0,0.432439,0.0
2,1,15764,0,76,31,2,0,2,3,2,...,0.0,0.0,0.0,0.0,0.356606,0.0,0.0,0.52908,0.0,0.0
3,1,17669,0,8,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,14852,0,44,26,1,0,2,3,3,...,0.49529,0.49529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
get_mean_f1score(X_train.iloc[:,3:],X_train['target'],groups_train,C=10, gamma=0.001, kernel='rbf', class_weight='balanced')

0.6682076579302091

In [97]:
get_mean_f1score(X_train.iloc[:,3:],X_train['target'],groups_train,XGBClassifier,n_estmators=81,max_depth=2)

0.6389076442780801

In [171]:
def get_cosine_dist(X_train):
    X=pd.DataFrame(columns=['group_id','doc_id','target',*['cosineDist'+str(i+1) for i in range(10)]])
    for i, group in X_train.groupby('group_id'):
        dists=distance.cdist(group.iloc[:,-25:],group.iloc[:,-25:],'cosine')
        dists.sort(axis=1)
        dists=dists[:,1:11]
        for k in range(len(dists)):
            X.loc[len(X)] = [int(group['group_id'].values[k]),int(group['doc_id'].values[k]),int(group['target'].values[k]),*dists[k,:]]
    X['group_id']=X['group_id'].astype(int)
    X['doc_id']=X['doc_id'].astype(int)
    X['target']=X['target'].astype(int)
    X.fillna(100,inplace=True)
    return X

In [174]:
X_trainCosDist=get_cosine_dist(X_train)

In [176]:
X_trainNew=pd.merge(X_train,X_trainCosDist,on=['group_id','doc_id','target'])

In [179]:
X_trainNew.head(5)

Unnamed: 0,group_id,doc_id,target,title_len,sum_comW,mean_comW,std_comW,median_comW,comWords1,comWords2,...,cosineDist1,cosineDist2,cosineDist3,cosineDist4,cosineDist5,cosineDist6,cosineDist7,cosineDist8,cosineDist9,cosineDist10
0,1,15731,0,31,30,2,0,2,4,3,...,0.0,0.099609,0.20148,0.373216,0.390364,0.413854,0.413854,0.466356,0.516676,0.564919
1,1,14829,0,54,47,3,1,3,5,5,...,0.0,0.052216,0.060583,0.062675,0.062675,0.090123,0.206651,0.508559,0.603736,0.653929
2,1,15764,0,76,31,2,0,2,3,2,...,0.396007,0.456472,0.468835,0.519781,0.55437,0.563715,0.563855,0.566753,0.595334,0.609908
3,1,17669,0,8,0,0,0,0,0,0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
4,1,14852,0,44,26,1,0,2,3,3,...,0.34832,0.399799,0.419457,0.419457,0.52319,0.583491,0.601725,0.646186,0.646186,0.675286


In [205]:
get_mean_f1score(X_trainNew.iloc[:,3:],X_trainNew['target'],groups_train,C=10, gamma=0.001, kernel='rbf', class_weight='balanced')

0.6691794017734226

In [182]:
get_mean_f1score(X_trainNew.iloc[:,3:],X_trainNew['target'],groups_train,XGBClassifier,n_estmators=81,max_depth=2)

0.6330379263890943

In [184]:
get_mean_f1score(X_trainNew.iloc[:,3:],X_trainNew['target'],groups_train,RandomForestClassifier,n_estimators=70)

0.627759491738124

In [185]:
def predictBy3models(X_train,y_train,X_test):
    clf1=SVC(C=10, gamma=0.001, kernel='rbf', class_weight='balanced')
    clf1.fit(X_train,y_train)
    y_pred1=clf1.predict(X_test)
    
    clf2=RandomForestClassifier(n_estimators=100,max_depth=5)
    clf2.fit(X_train,y_train)
    y_pred2=clf2.predict(X_test)
    
    clf3=XGBClassifier(n_estimators=75,max_depth=1)
    clf3.fit(X_train,y_train)
    y_pred3=clf3.predict(X_test)
    
    y_pred=(y_pred1+y_pred2+y_pred3)/3
    return np.round(y_pred).astype(int)

In [186]:
def get_mean_f1scoreComb3Models(X_train,y_train,groups_train,func):
    group_kfold = GroupKFold(n_splits=5)
    mas=[]
    for train_index, val_index in group_kfold.split(X_train, y_train, groups_train):
        #print("TRAIN:", train_index, "TEST:", val_index)
        X_train_train, X_train_val = X_train.iloc[train_index,:], X_train.iloc[val_index,:]
        y_train_train, y_train_val = y_train[train_index], y_train[val_index]
        #print(X_train_train, X_train_test, y_train_train, y_train_test)
        #clf=XGBClassifier(*args,**kwargs) #mean f1_score=0.53
        y_pred_val=func(X_train_train,y_train_train,X_train_val)
        mas.append(f1_score(y_train_val,y_pred_val))
    return np.mean(mas)
    

In [187]:
get_mean_f1scoreComb3Models(X_train.iloc[:,3:],y_train,groups_train,predictBy3models)

0.6586667242490465

In [None]:
Обучим модель с выбранными параметрами на ВСЕХ трейновых данных и предскажем на тестовых

In [191]:
def get_seva_featuresTest(train_data,scaler=None):
    traingroups_titledata = {}
    for i in range(len(train_data)):
        new_doc = train_data.iloc[i]
        group_id = new_doc['group_id']
        doc_id = new_doc['doc_id']
        title = new_doc['title']
        if group_id not in traingroups_titledata:
            traingroups_titledata[group_id] = []
        traingroups_titledata[group_id].append((group_id,doc_id, title))
    X_train =pd.DataFrame(columns=['group_id','doc_id','title_len','sum_comW','mean_comW','std_comW','median_comW',*['comWords'+str(i+1) for i in range(15)]])
    for new_group in traingroups_titledata:
        docs = traingroups_titledata[new_group]
        for k, (group_id,doc_id, title) in enumerate(docs):
            all_dist = []
            words = set(title.strip().split())
            for j in range(0, len(docs)):
                if k == j:
                    continue
                group_id_j,doc_id_j, title_j = docs[j]
                words_j = set(title_j.strip().split())
                all_dist.append(len(words.intersection(words_j)))
            #X_train.append(sorted(all_dist, reverse=True)[0:15])
            mas=np.array(sorted(all_dist, reverse=True)[0:15])
            if scaler is not None:
                mas=scaler.fit_transform(mas.reshape(-1,1))[:,0]
            X_train.loc[len(X_train)] = [group_id,doc_id,len(title),np.sum(mas),np.mean(mas),np.std(mas),np.median(mas),*mas]
    X_train=X_train.astype(int)
    return X_train

In [192]:
def get_TfIdf_featuresTest(data,max_features=25):
    vectorizer = TfidfVectorizer(max_features=max_features)
    X=pd.DataFrame(columns=['group_id','doc_id','sum_tfIdf','mean_tfIdf','std_tfIdf','median_tfIdf',*['tfIdf'+str(i) for i in range(max_features)]])
    for i, group in data.groupby('group_id'):
        titles_tfidf = vectorizer.fit_transform(np.asarray(group['title'])).toarray()
        for k in range(len(titles_tfidf)):
            X.loc[len(X)] = [int(group['group_id'].values[k]),int(group['doc_id'].values[k]),np.sum(titles_tfidf[k,:]),np.mean(titles_tfidf[k,:]),np.std(titles_tfidf[k,:]),np.median(titles_tfidf[k,:]),*titles_tfidf[k,:]]
    X['group_id']=X['group_id'].astype(int)
    X['doc_id']=X['doc_id'].astype(int)
    return X

In [193]:
test_groups = pd.read_csv('test_groups.csv')
test_data=pd.merge(test_groups,docs_titles,on='doc_id',how='left')
test_data=transform_data(test_data)
X_test1= get_seva_featuresTest(test_data)
X_test2= get_TfIdf_featuresTest(test_data)
X_test=pd.merge(X_test1,X_test2,on=['group_id','doc_id'])

X_test

Unnamed: 0,group_id,doc_id,title_len,sum_comW,mean_comW,std_comW,median_comW,comWords1,comWords2,comWords3,...,tfIdf15,tfIdf16,tfIdf17,tfIdf18,tfIdf19,tfIdf20,tfIdf21,tfIdf22,tfIdf23,tfIdf24
0,130,6710,28,28,1,0,2,2,2,2,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,130,4030,43,9,0,0,1,2,1,1,...,0.000000,0.692333,0.000000,0.721578,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,130,5561,58,28,1,0,2,3,2,2,...,0.000000,0.000000,0.000000,0.000000,0.736489,0.000000,0.000000,0.000000,0.000000,0.000000
3,130,4055,25,25,1,0,2,2,2,2,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,130,4247,34,16,1,0,1,2,1,1,...,0.000000,0.595615,0.000000,0.000000,0.000000,0.000000,0.471221,0.000000,0.000000,0.000000
5,130,5983,14,4,0,0,0,2,2,0,...,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,130,5784,4,3,0,0,0,1,1,1,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,130,4700,54,0,0,0,0,0,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,130,4093,64,31,2,1,2,6,6,2,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.446182,0.000000,0.000000
9,130,6487,40,0,0,0,0,0,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [194]:
clf=SVC(C=10, gamma=0.001, kernel='rbf', class_weight='balanced')
%time clf.fit(X_train.iloc[:,3:],y_train)
y_predTest=clf.predict(X_test.iloc[:,2:])

new_df=test_data.copy()
new_df['target']=y_predTest.astype(int)
new_df.drop(columns=['group_id','doc_id','title'],inplace=True)
new_df.to_csv("y_predTest.csv",index=False)

CPU times: user 12.5 s, sys: 460 ms, total: 12.9 s
Wall time: 15.4 s


In [196]:
def get_cosine_distTest(X_test):
    X=pd.DataFrame(columns=['group_id','doc_id',*['cosineDist'+str(i+1) for i in range(10)]])
    for i, group in X_test.groupby('group_id'):
        dists=distance.cdist(group.iloc[:,-25:],group.iloc[:,-25:],'cosine')
        dists.sort(axis=1)
        dists=dists[:,1:11]
        for k in range(len(dists)):
            X.loc[len(X)] = [int(group['group_id'].values[k]),int(group['doc_id'].values[k]),*dists[k,:]]
    X['group_id']=X['group_id'].astype(int)
    X['doc_id']=X['doc_id'].astype(int)
    X.fillna(100,inplace=True)
    return X

In [198]:
X_testCosDist=get_cosine_distTest(X_test)

In [199]:
X_testNew=pd.merge(X_test,X_testCosDist,on=['group_id','doc_id'])

In [204]:
y_predTestNew=predictBy3models(X_trainNew.iloc[:,3:],X_trainNew['target'],X_testNew.iloc[:,2:])

new_df=test_data.copy()
new_df['target']=y_predTestNew.astype(int)
new_df.drop(columns=['group_id','doc_id','title'],inplace=True)
new_df.to_csv("y_predTestNew.csv",index=False)

In [206]:
clf=SVC(C=10, gamma=0.001, kernel='rbf', class_weight='balanced')
%time clf.fit(X_trainNew.iloc[:,3:],X_trainNew['target'])
y_predTest=clf.predict(X_testNew.iloc[:,2:])

new_df=test_data.copy()
new_df['target']=y_predTest.astype(int)
new_df.drop(columns=['group_id','doc_id','title'],inplace=True)
new_df.to_csv("y_predTest.csv",index=False)

CPU times: user 15.9 s, sys: 326 ms, total: 16.3 s
Wall time: 16.5 s
