In [3]:
import nltk
nltk.download('stopwords')
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import xgboost

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Стемминг (SnowballStemmer) + HashingVectorizer

## предобработка

In [94]:
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [95]:
# 'id' and 'context' are not used by any later step of this experiment.
df.drop(columns=['id','context'], inplace=True)
df_test.drop(columns=['id','context'], inplace=True)

In [99]:
from nltk.stem.snowball import SnowballStemmer

In [100]:
# stemming
def stem_words(arr_words, stemmer):
    """Stem each space-separated token of ``arr_words`` and re-join with single spaces.

    ``stemmer`` is any object exposing a ``stem(word)`` method.
    """
    stems = [stemmer.stem(token) for token in arr_words.split(' ')]
    return ' '.join(stems)

In [101]:
# Stem every anchor and target phrase of the training frame.
snow_stemmer = SnowballStemmer(language='english')
for column in ('anchor', 'target'):
    df[column] = df[column].map(lambda phrase: stem_words(phrase, snow_stemmer))

In [102]:
df.head()

Unnamed: 0,anchor,target,score
0,abat,abat of pollut,0.5
1,abat,act of abat,0.75
2,abat,activ catalyst,0.25
3,abat,elimin process,0.5
4,abat,forest region,0.0


In [103]:
# Stem every anchor and target phrase of the test frame.
snow_stemmer = SnowballStemmer(language='english')
for column in ('anchor', 'target'):
    df_test[column] = df_test[column].map(lambda phrase: stem_words(phrase, snow_stemmer))

In [104]:
df_test.head()

Unnamed: 0,anchor,target
0,opc drum,inorgan photoconductor drum
1,adjust gas flow,alter gas flow
2,lower trunnion,lower locat
3,cap compon,upper portion
4,neural stimul,artifici neural network


## HashingVectorizer

In [105]:
from sklearn.feature_extraction.text import HashingVectorizer

# Hash every phrase into a fixed 256-dimensional vector, stored as one list per row.
vectorizer = HashingVectorizer(n_features=2**8,stop_words='english')
for frame in (df, df_test):
    for text_col, vector_col in (('anchor', 'anchor_vector'), ('target', 'target_vector')):
        frame[vector_col] = vectorizer.transform(frame[text_col]).toarray().tolist()

In [106]:
# Collect every distinct space-separated token from the anchor/target columns
# of both the train and the test frame.
unique_tokens = set()
for frame in (df, df_test):
    for column in ('anchor', 'target'):
        for phrase in frame[column]:
            unique_tokens.update(phrase.split(' '))

# Sorted so that the token -> index mapping is the same on every run
# (iteration order of a set of strings depends on hash randomisation).
all_words = sorted(unique_tokens)
all_words_dict = {word: position for position, word in enumerate(all_words)}

In [107]:
print('уникальных слов:', len(all_words))

уникальных слов: 6178


In [108]:
df.head()

Unnamed: 0,anchor,target,score,anchor_vector,target_vector
0,abat,abat of pollut,0.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,abat,act of abat,0.75,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,abat,activ catalyst,0.25,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,abat,elimin process,0.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.70..."
4,abat,forest region,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## функции расстояния


In [109]:
from sklearn.metrics import DistanceMetric

In [110]:
# One feature column per metric: distance between the anchor vector and the
# target vector of each training row.
metrics = ['euclidean','manhattan','chebyshev','canberra','braycurtis','jaccard']
for metric_name in metrics:
    dist = DistanceMetric.get_metric(metric_name)
    df[metric_name] = df.apply(
        lambda row: dist.pairwise([row['anchor_vector'], row['target_vector']])[0, 1], axis=1)
# Only the numeric distance features (and the score) are kept for modelling.
df.drop(columns=['anchor','target','anchor_vector','target_vector'], inplace=True)

In [111]:
# Same distance features for the test frame (reuses the `metrics` list defined in the previous cell).
for metric_name in metrics:
    dist = DistanceMetric.get_metric(metric_name)
    df_test[metric_name] = df_test.apply(
        lambda row: dist.pairwise([row['anchor_vector'], row['target_vector']])[0, 1], axis=1)
df_test.drop(columns=['anchor','target','anchor_vector','target_vector'], inplace=True)

In [112]:
df.head()

Unnamed: 0,score,euclidean,manhattan,chebyshev,canberra,braycurtis,jaccard
0,0.5,0.765367,1.0,0.707107,1.171573,0.414214,0.5
1,0.75,0.765367,1.0,0.707107,1.171573,0.414214,0.5
2,0.25,1.414214,2.414214,1.0,3.0,1.0,1.0
3,0.5,1.414214,2.414214,1.0,3.0,1.0,1.0
4,0.0,1.414214,2.414214,1.0,3.0,1.0,1.0


In [113]:
from sklearn.model_selection import KFold

In [114]:
def compute_metrics(pred, true):
    """Return the Pearson correlation coefficient between ``pred`` and ``true``."""
    correlation_matrix = np.corrcoef(pred, true)
    # Off-diagonal entry of the 2x2 matrix is the correlation between the two inputs.
    return correlation_matrix[0, 1]

In [115]:
#df_test.drop(['score'],axis=1,inplace=True)

## score

In [116]:
# 5-fold cross-validation of an XGBoost regressor on the distance features.
# The test-set prediction is the mean of the predictions of the fold models.
n_splits=5
# Fixed seed: without random_state the shuffled folds (and every score printed
# below) change on each run.
kf = KFold(n_splits, shuffle=True, random_state=35)
X = df.drop(['score'],axis=1)
y = df['score']
# Pick the feature columns by name. Once this cell has run, df_test also holds
# a 'score' column, and passing df_test.values on a re-run would feed the model
# one column too many.
X_submit = df_test[X.columns].values
# Explicit float accumulator instead of relying on how list += ndarray is coerced.
preds = np.zeros(len(df_test))
for train_index, test_index in kf.split(X):
    X_train, X_test = X.values[train_index], X.values[test_index]
    # KFold yields row positions, so index the target positionally as well.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf = xgboost.XGBRegressor(n_estimators = 10000, objective='reg:squarederror',
                            random_state=35, booster="gbtree",
                            learning_rate=0.05, max_depth = 7)#,min_child_weight=5, grow_policy='lossguide')
    # NOTE(review): the held-out fold is part of eval_set, so it also drives early
    # stopping; the 'test score' printed below is therefore somewhat optimistic.
    clf.fit(X_train, y_train, early_stopping_rounds=10,
                                eval_set=[(X_train, y_train),(X_test, y_test)], verbose=False)

    print('train score:', compute_metrics(y_train, clf.predict(X_train)), 'test score:', compute_metrics(y_test, clf.predict(X_test)))
    preds += clf.predict(X_submit)

df_test['score'] = preds/n_splits

train score: 0.5158078948171348 test score: 0.49870674779653357
train score: 0.5094691860750974 test score: 0.5242949188886792
train score: 0.5143267668935342 test score: 0.503975536481987
train score: 0.5140024379651565 test score: 0.5060353070095283
train score: 0.5121038115500277 test score: 0.5138701723610202


In [117]:
# Feature importances of `clf`, i.e. the model fitted on the last fold, most important first.
importances = clf.feature_importances_
print('Top of feature importance')
for feature_pos in np.argsort(importances)[::-1]:
    print(X.columns[feature_pos], importances[feature_pos])

Top of feature importance
braycurtis 0.8953936
euclidean 0.06983627
chebyshev 0.025797755
manhattan 0.0044141496
canberra 0.0024605396
jaccard 0.0020976523


# Лемматизация+CountVectorizer

## предобработка

In [4]:
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [5]:
# 'id' and 'context' are not used by any later step of this experiment.
df.drop(columns=['id','context'], inplace=True)
df_test.drop(columns=['id','context'], inplace=True)

In [6]:
# remove stop words
def words_of_column(s):
    """Drop English stop words from a space-separated phrase.

    A phrase consisting of a single token is returned unchanged, even when
    that token is itself a stop word.
    """
    tokens = s.split(' ')
    if len(tokens) == 1:
        return s
    # Build the lookup once per call: the original evaluated
    # stopwords.words('english') again for every token and searched it as a list.
    english_stopwords = set(stopwords.words('english'))
    return ' '.join(w for w in tokens if w not in english_stopwords)

In [7]:
from nltk.stem import WordNetLemmatizer 

In [15]:
# !sudo pip install nltk
# nltk.download('all')

## лемматизация

In [18]:
from nltk.tokenize import word_tokenize

In [19]:
# lemmatization
def lem_words(arr_words, lemmatizer):
    """Tokenize ``arr_words`` with ``word_tokenize``, lemmatize each token and join the results with single spaces.

    ``lemmatizer`` is any object exposing a ``lemmatize(word)`` method.
    """
    tokens = word_tokenize(arr_words)
    return ' '.join(map(lemmatizer.lemmatize, tokens))

In [20]:
# Training frame: strip stop words first, then lemmatize what is left.
for column in ('anchor', 'target'):
    df[column] = df[column].map(words_of_column)

lemmatizer = WordNetLemmatizer()
for column in ('anchor', 'target'):
    df[column] = df[column].map(lambda phrase: lem_words(phrase, lemmatizer))

In [21]:
df.head()

Unnamed: 0,anchor,target,score
0,abatement,abatement pollution,0.5
1,abatement,act abating,0.75
2,abatement,active catalyst,0.25
3,abatement,eliminating process,0.5
4,abatement,forest region,0.0


In [22]:
# Test frame: strip stop words first, then lemmatize what is left.
for column in ('anchor', 'target'):
    df_test[column] = df_test[column].map(words_of_column)

lemmatizer = WordNetLemmatizer()
for column in ('anchor', 'target'):
    df_test[column] = df_test[column].map(lambda phrase: lem_words(phrase, lemmatizer))

In [23]:
# Collect every distinct space-separated token from the anchor/target columns
# of both the train and the test frame.
unique_tokens = set()
for frame in (df, df_test):
    for column in ('anchor', 'target'):
        for phrase in frame[column]:
            unique_tokens.update(phrase.split(' '))

# Sorted so that the token -> index mapping is the same on every run
# (iteration order of a set of strings depends on hash randomisation).
all_words = sorted(unique_tokens)
all_words_dict = {word: position for position, word in enumerate(all_words)}

In [24]:
print('уникальных слов:', len(all_words))

уникальных слов: 7900


## CountVectorizer+функции расстояния

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the fixed vocabulary in all_words_dict, stored as one list per row.
vectorizer = CountVectorizer(vocabulary=all_words_dict)
for frame in (df, df_test):
    for text_col, vector_col in (('anchor', 'anchor_vector'), ('target', 'target_vector')):
        frame[vector_col] = vectorizer.transform(frame[text_col]).toarray().tolist()

In [26]:
df.head()

Unnamed: 0,anchor,target,score,anchor_vector,target_vector
0,abatement,abatement pollution,0.5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,abatement,act abating,0.75,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,abatement,active catalyst,0.25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,abatement,eliminating process,0.5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,abatement,forest region,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [27]:
from sklearn.metrics import DistanceMetric

In [28]:
# One feature column per metric: distance between the anchor vector and the
# target vector of each training row.
metrics = ['euclidean','manhattan','chebyshev','canberra','braycurtis','jaccard']
for metric_name in metrics:
    dist = DistanceMetric.get_metric(metric_name)
    df[metric_name] = df.apply(
        lambda row: dist.pairwise([row['anchor_vector'], row['target_vector']])[0, 1], axis=1)
# Only the numeric distance features (and the score) are kept for modelling.
df.drop(columns=['anchor','target','anchor_vector','target_vector'], inplace=True)

In [29]:
# Same distance features for the test frame (reuses the `metrics` list defined in the previous cell).
for metric_name in metrics:
    dist = DistanceMetric.get_metric(metric_name)
    df_test[metric_name] = df_test.apply(
        lambda row: dist.pairwise([row['anchor_vector'], row['target_vector']])[0, 1], axis=1)
df_test.drop(columns=['anchor','target','anchor_vector','target_vector'], inplace=True)

In [30]:
df.head()

Unnamed: 0,score,euclidean,manhattan,chebyshev,canberra,braycurtis,jaccard
0,0.5,1.0,1.0,1.0,1.0,0.333333,0.5
1,0.75,1.732051,3.0,1.0,3.0,1.0,1.0
2,0.25,1.732051,3.0,1.0,3.0,1.0,1.0
3,0.5,1.732051,3.0,1.0,3.0,1.0,1.0
4,0.0,1.732051,3.0,1.0,3.0,1.0,1.0


In [31]:
from sklearn.model_selection import KFold

In [32]:
def compute_metrics(pred, true):
    """Return the Pearson correlation coefficient between ``pred`` and ``true``."""
    correlation_matrix = np.corrcoef(pred, true)
    # Off-diagonal entry of the 2x2 matrix is the correlation between the two inputs.
    return correlation_matrix[0, 1]

In [33]:
#df_test.drop(['score'],axis=1,inplace=True)

## score

In [34]:
# 5-fold cross-validation of an XGBoost regressor on the distance features.
# The test-set prediction is the mean of the predictions of the fold models.
n_splits=5
# Fixed seed: without random_state the shuffled folds (and every score printed
# below) change on each run.
kf = KFold(n_splits, shuffle=True, random_state=35)
X = df.drop(['score'],axis=1)
y = df['score']
# Pick the feature columns by name. Once this cell has run, df_test also holds
# a 'score' column, and passing df_test.values on a re-run would feed the model
# one column too many.
X_submit = df_test[X.columns].values
# Explicit float accumulator instead of relying on how list += ndarray is coerced.
preds = np.zeros(len(df_test))
for train_index, test_index in kf.split(X):
    X_train, X_test = X.values[train_index], X.values[test_index]
    # KFold yields row positions, so index the target positionally as well.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf = xgboost.XGBRegressor(n_estimators = 10000, objective='reg:squarederror',
                            random_state=35, booster="gbtree",
                            learning_rate=0.05, max_depth = 7)#,min_child_weight=5, grow_policy='lossguide')
    # NOTE(review): the held-out fold is part of eval_set, so it also drives early
    # stopping; the 'test score' printed below is therefore somewhat optimistic.
    clf.fit(X_train, y_train, early_stopping_rounds=10,
                                eval_set=[(X_train, y_train),(X_test, y_test)], verbose=False)

    print('train score:', compute_metrics(y_train, clf.predict(X_train)), 'test score:', compute_metrics(y_test, clf.predict(X_test)))
    preds += clf.predict(X_submit)

df_test['score'] = preds/n_splits

train score: 0.4677770390588465 test score: 0.4649592694262516
train score: 0.46725370354310475 test score: 0.46898967472358616
train score: 0.4670999711759278 test score: 0.4684114265104127
train score: 0.46884851704307867 test score: 0.46149067353706114
train score: 0.4678999184308214 test score: 0.4651099981801087


In [35]:
# Feature importances of `clf`, i.e. the model fitted on the last fold, most important first.
importances = clf.feature_importances_
print('Top of feature importance')
for feature_pos in np.argsort(importances)[::-1]:
    print(X.columns[feature_pos], importances[feature_pos])

Top of feature importance
jaccard 0.91874737
euclidean 0.06374107
braycurtis 0.009525339
canberra 0.0073142513
manhattan 0.00046358054
chebyshev 0.0002083725


# Лемматизация+HashingVectorizer

## предобработка

In [36]:
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [37]:
# 'id' and 'context' are not used by any later step of this experiment.
df.drop(columns=['id','context'], inplace=True)
df_test.drop(columns=['id','context'], inplace=True)

In [38]:
from nltk.stem import WordNetLemmatizer 

In [39]:
# !sudo pip install nltk
# nltk.download('all')

## лемматизация

In [40]:
# lemmatization
# NOTE(review): same definition as the lem_words in the previous section; it
# relies on `word_tokenize`, which is imported only in that earlier section.
def lem_words(arr_words, lemmatizer):
    """Tokenize ``arr_words`` with ``word_tokenize``, lemmatize each token and join the results with single spaces.

    ``lemmatizer`` is any object exposing a ``lemmatize(word)`` method.
    """
    tokens = word_tokenize(arr_words)
    return ' '.join(map(lemmatizer.lemmatize, tokens))

In [41]:
# Lemmatize both text columns of the training frame.
lemmatizer = WordNetLemmatizer()
for column in ('anchor', 'target'):
    df[column] = df[column].map(lambda phrase: lem_words(phrase, lemmatizer))

In [42]:
# Lemmatize both text columns of the test frame.
lemmatizer = WordNetLemmatizer()
for column in ('anchor', 'target'):
    df_test[column] = df_test[column].map(lambda phrase: lem_words(phrase, lemmatizer))

## HashingVectorizer

In [43]:
from sklearn.feature_extraction.text import HashingVectorizer

# Hash every phrase into a fixed 256-dimensional vector, stored as one list per row.
vectorizer = HashingVectorizer(n_features=2**8,stop_words='english')
for frame in (df, df_test):
    for text_col, vector_col in (('anchor', 'anchor_vector'), ('target', 'target_vector')):
        frame[vector_col] = vectorizer.transform(frame[text_col]).toarray().tolist()

In [44]:
df.head()

Unnamed: 0,anchor,target,score,anchor_vector,target_vector
0,abatement,abatement of pollution,0.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,abatement,act of abating,0.75,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,abatement,active catalyst,0.25,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,abatement,eliminating process,0.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,abatement,forest region,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [45]:
# Collect every distinct space-separated token from the anchor/target columns
# of both the train and the test frame.
unique_tokens = set()
for frame in (df, df_test):
    for column in ('anchor', 'target'):
        for phrase in frame[column]:
            unique_tokens.update(phrase.split(' '))

# Sorted so that the token -> index mapping is the same on every run
# (iteration order of a set of strings depends on hash randomisation).
all_words = sorted(unique_tokens)
all_words_dict = {word: position for position, word in enumerate(all_words)}

In [47]:
print('уникальных слов:', len(all_words))

уникальных слов: 7962


## функции расстояния

In [48]:
from sklearn.metrics import DistanceMetric

In [49]:
# One feature column per metric: distance between the anchor vector and the
# target vector of each training row.
metrics = ['euclidean','manhattan','chebyshev','canberra','braycurtis','jaccard']
for metric_name in metrics:
    dist = DistanceMetric.get_metric(metric_name)
    df[metric_name] = df.apply(
        lambda row: dist.pairwise([row['anchor_vector'], row['target_vector']])[0, 1], axis=1)
# Only the numeric distance features (and the score) are kept for modelling.
df.drop(columns=['anchor','target','anchor_vector','target_vector'], inplace=True)

In [50]:
# Same distance features for the test frame (reuses the `metrics` list defined in the previous cell).
for metric_name in metrics:
    dist = DistanceMetric.get_metric(metric_name)
    df_test[metric_name] = df_test.apply(
        lambda row: dist.pairwise([row['anchor_vector'], row['target_vector']])[0, 1], axis=1)
df_test.drop(columns=['anchor','target','anchor_vector','target_vector'], inplace=True)

In [51]:
df.head()

Unnamed: 0,score,euclidean,manhattan,chebyshev,canberra,braycurtis,jaccard
0,0.5,0.765367,1.0,0.707107,1.171573,0.414214,0.5
1,0.75,1.414214,2.414214,1.0,3.0,1.0,1.0
2,0.25,1.414214,2.414214,1.0,3.0,1.0,1.0
3,0.5,1.414214,2.414214,1.0,3.0,1.0,1.0
4,0.0,1.414214,2.414214,1.0,3.0,1.0,1.0


In [52]:
from sklearn.model_selection import KFold

In [53]:
def compute_metrics(pred, true):
    """Return the Pearson correlation coefficient between ``pred`` and ``true``."""
    correlation_matrix = np.corrcoef(pred, true)
    # Off-diagonal entry of the 2x2 matrix is the correlation between the two inputs.
    return correlation_matrix[0, 1]

In [54]:
#df_test.drop(['score'],axis=1,inplace=True)

In [None]:
df_test.head()

Unnamed: 0,anchor,target
0,opc drum,inorgan photoconductor drum
1,adjust gas flow,alter gas flow
2,lower trunnion,lower locat
3,cap compon,upper portion
4,neural stimul,artifici neural network


In [None]:
# NOTE(review): in notebook order the 'anchor'/'target' columns have already been
# dropped from df and df_test by the distance-feature cells above, so this cell
# raises KeyError on a top-to-bottom run. It looks like a leftover from a separate
# stemming + CountVectorizer experiment — confirm, then move it to its own section
# (with its own data load) or remove it.

# Collect every distinct space-separated token from the anchor/target columns
# of both the train and the test frame.
unique_tokens = set()
for frame in (df, df_test):
    for column in ('anchor', 'target'):
        for phrase in frame[column]:
            unique_tokens.update(phrase.split(' '))

# Sorted so that the token -> index mapping is the same on every run
# (iteration order of a set of strings depends on hash randomisation).
all_words = sorted(unique_tokens)
all_words_dict = {word: position for position, word in enumerate(all_words)}

In [None]:
print('уникальных слов:', len(all_words))

уникальных слов: 6115


In [None]:
# from sklearn.feature_extraction.text import HashingVectorizer
# corpus = ['This is the first document.','This document is the second document.','And this is the third one.','Is this the first document?']
# vectorizer = HashingVectorizer(n_features=2**4)
# X = vectorizer.fit_transform(corpus)
# print(X.shape)
# X.toarray()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the fixed vocabulary in all_words_dict, stored as one list per row.
vectorizer = CountVectorizer(vocabulary=all_words_dict)
for frame in (df, df_test):
    for text_col, vector_col in (('anchor', 'anchor_vector'), ('target', 'target_vector')):
        frame[vector_col] = vectorizer.transform(frame[text_col]).toarray().tolist()

In [None]:
df.head()

Unnamed: 0,anchor,target,score,anchor_vector,target_vector
0,abat,abat pollut,0.5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,abat,act abat,0.75,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,abat,activ catalyst,0.25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,abat,elimin process,0.5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,abat,forest region,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
from sklearn.metrics import DistanceMetric

In [None]:
# One feature column per metric: distance between the anchor vector and the
# target vector of each training row.
metrics = ['euclidean','manhattan','chebyshev','canberra','braycurtis','jaccard']
for metric_name in metrics:
    dist = DistanceMetric.get_metric(metric_name)
    df[metric_name] = df.apply(
        lambda row: dist.pairwise([row['anchor_vector'], row['target_vector']])[0, 1], axis=1)
# Only the numeric distance features (and the score) are kept for modelling.
df.drop(columns=['anchor','target','anchor_vector','target_vector'], inplace=True)

In [None]:
# Same distance features for the test frame (reuses the `metrics` list defined in the previous cell).
for metric_name in metrics:
    dist = DistanceMetric.get_metric(metric_name)
    df_test[metric_name] = df_test.apply(
        lambda row: dist.pairwise([row['anchor_vector'], row['target_vector']])[0, 1], axis=1)
df_test.drop(columns=['anchor','target','anchor_vector','target_vector'], inplace=True)

In [None]:
df.head()

Unnamed: 0,score,euclidean,manhattan,chebyshev,canberra,braycurtis,jaccard
0,0.5,1.0,1.0,1.0,1.0,0.333333,0.5
1,0.75,1.0,1.0,1.0,1.0,0.333333,0.5
2,0.25,1.732051,3.0,1.0,3.0,1.0,1.0
3,0.5,1.732051,3.0,1.0,3.0,1.0,1.0
4,0.0,1.732051,3.0,1.0,3.0,1.0,1.0


In [None]:
from sklearn.model_selection import KFold

In [None]:
def compute_metrics(pred, true):
    """Return the Pearson correlation coefficient between ``pred`` and ``true``."""
    correlation_matrix = np.corrcoef(pred, true)
    # Off-diagonal entry of the 2x2 matrix is the correlation between the two inputs.
    return correlation_matrix[0, 1]

In [None]:
#df_test.drop(['score'],axis=1,inplace=True)

In [None]:
# 5-fold cross-validation of an XGBoost regressor on the distance features.
# The test-set prediction is the mean of the predictions of the fold models.
n_splits=5
# Fixed seed: without random_state the shuffled folds (and every score printed
# below) change on each run.
kf = KFold(n_splits, shuffle=True, random_state=35)
X = df.drop(['score'],axis=1)
y = df['score']
# Pick the feature columns by name. Once a training cell has run, df_test also
# holds a 'score' column, and passing df_test.values would feed the model one
# column too many.
X_submit = df_test[X.columns].values
# Explicit float accumulator instead of relying on how list += ndarray is coerced.
preds = np.zeros(len(df_test))
for train_index, test_index in kf.split(X):
    X_train, X_test = X.values[train_index], X.values[test_index]
    # KFold yields row positions, so index the target positionally as well.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf = xgboost.XGBRegressor(n_estimators = 10000, objective='reg:squarederror',
                            random_state=35, booster="gbtree",
                            learning_rate=0.05, max_depth = 7)#,min_child_weight=5, grow_policy='lossguide')
    # NOTE(review): the held-out fold is part of eval_set, so it also drives early
    # stopping; the 'test score' printed below is therefore somewhat optimistic.
    clf.fit(X_train, y_train, early_stopping_rounds=10,
                                eval_set=[(X_train, y_train),(X_test, y_test)], verbose=False)

    print('train score:', compute_metrics(y_train, clf.predict(X_train)), 'test score:', compute_metrics(y_test, clf.predict(X_test)))
    preds += clf.predict(X_submit)

df_test['score'] = preds/n_splits

train score: 0.5198884937104629 test score: 0.5295567636206291
train score: 0.5181117727875999 test score: 0.5367144191386951
train score: 0.5247064982288128 test score: 0.5103444514951672
train score: 0.5216979293085792 test score: 0.5228823604385127
train score: 0.5266651934154329 test score: 0.501328876101527


In [None]:
# Feature importances of `clf`, i.e. the model fitted on the last fold, most important first.
importances = clf.feature_importances_
print('Top of feature importance')
for feature_pos in np.argsort(importances)[::-1]:
    print(X.columns[feature_pos], importances[feature_pos])

Top of feature importance
braycurtis 0.69376636
euclidean 0.14843552
jaccard 0.1450956
manhattan 0.008893483
canberra 0.0037663353
chebyshev 4.266138e-05


In [None]:
df_test.head()

Unnamed: 0,anchor,target
0,opc drum,inorgan photoconductor drum
1,adjust gas flow,alter gas flow
2,lower trunnion,lower locat
3,cap compon,upper portion
4,neural stimul,artifici neural network


In [None]:
# NOTE(review): in notebook order the 'anchor'/'target' columns have already been
# dropped from df and df_test by the distance-feature cells above, so this cell
# raises KeyError on a top-to-bottom run. It repeats an earlier cell verbatim and
# looks like a pasted leftover — confirm, then remove it.

# Collect every distinct space-separated token from the anchor/target columns
# of both the train and the test frame.
unique_tokens = set()
for frame in (df, df_test):
    for column in ('anchor', 'target'):
        for phrase in frame[column]:
            unique_tokens.update(phrase.split(' '))

# Sorted so that the token -> index mapping is the same on every run
# (iteration order of a set of strings depends on hash randomisation).
all_words = sorted(unique_tokens)
all_words_dict = {word: position for position, word in enumerate(all_words)}

In [None]:
print('уникальных слов:', len(all_words))

уникальных слов: 6115


In [None]:
# from sklearn.feature_extraction.text import HashingVectorizer
# corpus = ['This is the first document.','This document is the second document.','And this is the third one.','Is this the first document?']
# vectorizer = HashingVectorizer(n_features=2**4)
# X = vectorizer.fit_transform(corpus)
# print(X.shape)
# X.toarray()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the fixed vocabulary in all_words_dict, stored as one list per row.
vectorizer = CountVectorizer(vocabulary=all_words_dict)
for frame in (df, df_test):
    for text_col, vector_col in (('anchor', 'anchor_vector'), ('target', 'target_vector')):
        frame[vector_col] = vectorizer.transform(frame[text_col]).toarray().tolist()

In [None]:
df.head()

Unnamed: 0,anchor,target,score,anchor_vector,target_vector
0,abat,abat pollut,0.5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,abat,act abat,0.75,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,abat,activ catalyst,0.25,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,abat,elimin process,0.5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,abat,forest region,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
from sklearn.metrics import DistanceMetric

In [None]:
# One feature column per metric: distance between the anchor vector and the
# target vector of each training row.
metrics = ['euclidean','manhattan','chebyshev','canberra','braycurtis','jaccard']
for metric_name in metrics:
    dist = DistanceMetric.get_metric(metric_name)
    df[metric_name] = df.apply(
        lambda row: dist.pairwise([row['anchor_vector'], row['target_vector']])[0, 1], axis=1)
# Only the numeric distance features (and the score) are kept for modelling.
df.drop(columns=['anchor','target','anchor_vector','target_vector'], inplace=True)

In [None]:
# Same distance features for the test frame (reuses the `metrics` list defined in the previous cell).
for metric_name in metrics:
    dist = DistanceMetric.get_metric(metric_name)
    df_test[metric_name] = df_test.apply(
        lambda row: dist.pairwise([row['anchor_vector'], row['target_vector']])[0, 1], axis=1)
df_test.drop(columns=['anchor','target','anchor_vector','target_vector'], inplace=True)

In [None]:
df.head()

Unnamed: 0,score,euclidean,manhattan,chebyshev,canberra,braycurtis,jaccard
0,0.5,1.0,1.0,1.0,1.0,0.333333,0.5
1,0.75,1.0,1.0,1.0,1.0,0.333333,0.5
2,0.25,1.732051,3.0,1.0,3.0,1.0,1.0
3,0.5,1.732051,3.0,1.0,3.0,1.0,1.0
4,0.0,1.732051,3.0,1.0,3.0,1.0,1.0


In [None]:
from sklearn.model_selection import KFold

In [None]:
def compute_metrics(pred, true):
    """Return the Pearson correlation coefficient between ``pred`` and ``true``."""
    correlation_matrix = np.corrcoef(pred, true)
    # Off-diagonal entry of the 2x2 matrix is the correlation between the two inputs.
    return correlation_matrix[0, 1]

In [None]:
#df_test.drop(['score'],axis=1,inplace=True)

In [None]:
# 5-fold cross-validation of an XGBoost regressor on the distance features.
# The test-set prediction is the mean of the predictions of the fold models.
n_splits=5
# Fixed seed: without random_state the shuffled folds (and every score printed
# below) change on each run.
kf = KFold(n_splits, shuffle=True, random_state=35)
X = df.drop(['score'],axis=1)
y = df['score']
# Pick the feature columns by name. Once a training cell has run, df_test also
# holds a 'score' column, and passing df_test.values would feed the model one
# column too many.
X_submit = df_test[X.columns].values
# Explicit float accumulator instead of relying on how list += ndarray is coerced.
preds = np.zeros(len(df_test))
for train_index, test_index in kf.split(X):
    X_train, X_test = X.values[train_index], X.values[test_index]
    # KFold yields row positions, so index the target positionally as well.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf = xgboost.XGBRegressor(n_estimators = 10000, objective='reg:squarederror',
                            random_state=35, booster="gbtree",
                            learning_rate=0.05, max_depth = 7)#,min_child_weight=5, grow_policy='lossguide')
    # NOTE(review): the held-out fold is part of eval_set, so it also drives early
    # stopping; the 'test score' printed below is therefore somewhat optimistic.
    clf.fit(X_train, y_train, early_stopping_rounds=10,
                                eval_set=[(X_train, y_train),(X_test, y_test)], verbose=False)

    print('train score:', compute_metrics(y_train, clf.predict(X_train)), 'test score:', compute_metrics(y_test, clf.predict(X_test)))
    preds += clf.predict(X_submit)

df_test['score'] = preds/n_splits

train score: 0.5198884937104629 test score: 0.5295567636206291
train score: 0.5181117727875999 test score: 0.5367144191386951
train score: 0.5247064982288128 test score: 0.5103444514951672
train score: 0.5216979293085792 test score: 0.5228823604385127
train score: 0.5266651934154329 test score: 0.501328876101527


In [None]:
# Feature importances of `clf`, i.e. the model fitted on the last fold, most important first.
importances = clf.feature_importances_
print('Top of feature importance')
for feature_pos in np.argsort(importances)[::-1]:
    print(X.columns[feature_pos], importances[feature_pos])

Top of feature importance
braycurtis 0.69376636
euclidean 0.14843552
jaccard 0.1450956
manhattan 0.008893483
canberra 0.0037663353
chebyshev 4.266138e-05


## score

In [55]:
# 5-fold cross-validation of an XGBoost regressor on the distance features.
# The test-set prediction is the mean of the predictions of the fold models.
n_splits=5
# Fixed seed: without random_state the shuffled folds (and every score printed
# below) change on each run.
kf = KFold(n_splits, shuffle=True, random_state=35)
X = df.drop(['score'],axis=1)
y = df['score']
# Pick the feature columns by name. Once a training cell has run, df_test also
# holds a 'score' column, and passing df_test.values would feed the model one
# column too many.
X_submit = df_test[X.columns].values
# Explicit float accumulator instead of relying on how list += ndarray is coerced.
preds = np.zeros(len(df_test))
for train_index, test_index in kf.split(X):
    X_train, X_test = X.values[train_index], X.values[test_index]
    # KFold yields row positions, so index the target positionally as well.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf = xgboost.XGBRegressor(n_estimators = 10000, objective='reg:squarederror',
                            random_state=35, booster="gbtree",
                            learning_rate=0.05, max_depth = 7)#,min_child_weight=5, grow_policy='lossguide')
    # NOTE(review): the held-out fold is part of eval_set, so it also drives early
    # stopping; the 'test score' printed below is therefore somewhat optimistic.
    clf.fit(X_train, y_train, early_stopping_rounds=10,
                                eval_set=[(X_train, y_train),(X_test, y_test)], verbose=False)

    print('train score:', compute_metrics(y_train, clf.predict(X_train)), 'test score:', compute_metrics(y_test, clf.predict(X_test)))
    preds += clf.predict(X_submit)

df_test['score'] = preds/n_splits

train score: 0.4614249163163344 test score: 0.44442972665837144
train score: 0.45811313460996367 test score: 0.4579167586862909
train score: 0.45947932423615334 test score: 0.45352506003698645
train score: 0.4587769736661343 test score: 0.4558724790539259
train score: 0.45697446731283314 test score: 0.46262158581704005


In [56]:
# Feature importances of `clf`, i.e. the model fitted on the last fold, most important first.
importances = clf.feature_importances_
print('Top of feature importance')
for feature_pos in np.argsort(importances)[::-1]:
    print(X.columns[feature_pos], importances[feature_pos])

Top of feature importance
braycurtis 0.89294845
euclidean 0.05710956
chebyshev 0.027181186
canberra 0.011528077
manhattan 0.010238926
jaccard 0.0009937695
