In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install strsimpy

In [None]:
%pylab inline
plt.style.use("bmh")

In [None]:
!pip install cyrtranslit

In [None]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from strsimpy.levenshtein import Levenshtein
from strsimpy.normalized_levenshtein import NormalizedLevenshtein
from catboost import CatBoostClassifier, Pool, cv
from tqdm import tqdm
tqdm.pandas()
import seaborn as sns
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from scipy import sparse
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pycountry
import re
import cyrtranslit

In [None]:
# Read the train and test CSV files; the first column of each becomes the index.
train, test = (
    pd.read_csv(path, index_col=0)
    for path in ('/kaggle/input/sibur20-naming-data/train.csv',
                 '/kaggle/input/sibur20-naming-data/test.csv')
)
train.shape, test.shape

# Word preprocessing

In [None]:
# Russian legal-form abbreviations to strip. This cell runs before the names
# are lower-cased, so the upper-case spellings are matched as written.
legal_entities = ['ООО', 'ОАО', 'ЗАО', 'ПАО', 'ОДО']

for entity in tqdm(legal_entities):
    # Raw f-string: in a plain string literal "\W" is an invalid escape sequence.
    # The abbreviation is removed together with any adjacent non-word characters.
    pattern = re.compile(rf"\W*{entity}\W*")
    for frame in (train, test):
        frame.replace(pattern, "", inplace=True)

In [None]:
# Lower-case both name columns in the train and test frames.
for frame in (train, test):
    for column in ("name_1", "name_2"):
        frame[column] = frame[column].str.lower()

In [None]:
# Latin legal-form suffixes, written as regex fragments (dots escaped).
# Raw strings are required: "\." and "\W" are invalid escape sequences in plain literals.
legal_entities = [r"ltd\.", r"co\.", r"inc\.", r"b\.v\.", r"s\.c\.r\.l\.", "gmbh", r"pvt\."]

for entity in tqdm(legal_entities):
    # Remove the suffix together with any adjacent non-word characters.
    pattern = re.compile(rf"\W*{entity}\W*")
    for frame in (train, test):
        frame.replace(pattern, "", inplace=True)

In [None]:
# Disabled: this step worsens the score
#shit_words = ['sa', 's a', 'de', 'cv', 'gmb h', 'g mbh', 'llc', 's pa', 'sp a', 'spa', 'ag', 'rl', 's']

#for shit_word in tqdm(shit_words):
#    train.replace(re.compile('\s+{}'.format(shit_word)), "", inplace=True)
#    test.replace(re.compile(f"\s+{shit_word}\s*"), "", inplace=True)

In [None]:
# Lower-cased country names from pycountry, to be stripped from the company names.
countries = [country.name.lower() for country in pycountry.countries]

for country in tqdm(countries):
    # Build the pattern from the loop variable. The original used `entity`, a
    # stale name left over from an earlier cell, so no country was ever removed.
    # re.escape keeps names containing regex metacharacters (e.g. ".") literal.
    pattern = re.compile(rf"\s+{re.escape(country)}\s*")
    for frame in (train, test):
        frame.replace(pattern, "", inplace=True)

In [None]:
def _to_latin(text):
    """Transliterate Russian Cyrillic text to Latin characters."""
    return cyrtranslit.to_latin(text, 'ru')


# Transliterate each frame from its OWN columns. The original assigned the
# transliterated train columns to test, overwriting the test names.
for frame in (train, test):
    for column in ("name_1", "name_2"):
        frame[column] = frame[column].progress_apply(_to_latin)

In [None]:
# Remove whitespace followed by a parenthesised fragment; `.*` is greedy, so the
# match extends to the last ")" in the string.
parenthesised = re.compile(r"\s+\(.*\)")
for frame in (train, test):
    frame.replace(parenthesised, "", inplace=True)

In [None]:
# Remove every character that is neither a word character nor whitespace.
non_word_or_space = re.compile(r"[^\w\s]")
for frame in (train, test):
    frame.replace(non_word_or_space, "", inplace=True)

In [None]:
# Show 15 randomly sampled rows labelled as duplicates.
train.loc[train["is_duplicate"] == 1].sample(15)

# Feature generation

In [None]:
levenshtein = Levenshtein()


def _levenshtein_distance(row):
    """Levenshtein distance between the two names of one row."""
    return levenshtein.distance(row.name_1, row.name_2)


# Row-wise edit distance, stored under the column name "levenstein".
train["levenstein"] = train.progress_apply(_levenshtein_distance, axis=1)
test["levenstein"] = test.progress_apply(_levenshtein_distance, axis=1)

In [None]:
normalized_levenshtein = NormalizedLevenshtein()


def _normalized_levenshtein_distance(row):
    """Normalized Levenshtein distance between the two names of one row."""
    return normalized_levenshtein.distance(row.name_1, row.name_2)


# Row-wise normalized distance, stored under the column name "norm_levenstein".
train["norm_levenstein"] = train.progress_apply(_normalized_levenshtein_distance, axis=1)
test["norm_levenstein"] = test.progress_apply(_normalized_levenshtein_distance, axis=1)

In [None]:
def get_jaccard_sim(str1, str2):
    """Jaccard similarity of the whitespace-separated token sets of two strings.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        float: size of the token intersection divided by the size of the token
        union. If neither string contains a token (both are empty or
        whitespace-only) the union is empty and 0.0 is returned instead of
        raising ZeroDivisionError.
    """
    a = set(str1.split())
    b = set(str2.split())
    union = a | b
    if not union:
        # Earlier cleaning steps can leave a name empty; avoid dividing by zero.
        return 0.0
    return len(a & b) / len(union)

In [None]:
# Token-level Jaccard similarity of the two names, computed row by row.
for frame in (train, test):
    frame["jaccard"] = frame.progress_apply(
        lambda row: get_jaccard_sim(row.name_1, row.name_2), axis=1
    )

In [None]:
vectorizer = CountVectorizer(analyzer='char', ngram_range=(1,5))


def ngramm_distance(str_1, str_2):
    """Sum of absolute differences between the character n-gram counts of two strings.

    The module-level `vectorizer` (character n-grams of length 1 to 5) is
    refitted on every call, using the two strings joined by a single space.
    """
    vectorizer.fit([str_1 + ' ' + str_2])
    counts_1 = vectorizer.transform([str_1])
    counts_2 = vectorizer.transform([str_2])
    return np.absolute(counts_1 - counts_2).sum()

In [None]:
# Character n-gram distance between the two names, computed row by row.
for frame in (train, test):
    frame["ngramms"] = frame.progress_apply(
        lambda row: ngramm_distance(row.name_1, row.name_2), axis=1
    )

# Normalized Levenshtein distance

In [None]:
# Distribution of the normalized distance, drawn separately for each class.
for flag, label in ((1, 'ones'), (0, 'zeros')):
    sns.distplot(train.loc[train['is_duplicate'] == flag, 'norm_levenstein'], label=label)
plt.legend();

# Levenshtein distance

In [None]:
# Distribution of the raw edit distance, drawn separately for each class.
for flag, label in ((1, 'ones'), (0, 'zeros')):
    sns.distplot(train.loc[train['is_duplicate'] == flag, 'levenstein'], label=label)
plt.legend();

# Jaccard

In [None]:
# Histogram (no KDE curve) of the Jaccard similarity, drawn separately for each class.
for flag, label in ((1, 'ones'), (0, 'zeros')):
    sns.distplot(train.loc[train['is_duplicate'] == flag, 'jaccard'], label=label, kde=False)
plt.legend();

# N-gram distance

In [None]:
# Distribution of the character n-gram distance, drawn separately for each class.
for flag, label in ((1, 'ones'), (0, 'zeros')):
    sns.distplot(train.loc[train['is_duplicate'] == flag, 'ngramms'], label=label)
plt.legend();

# Validation scheme

In [None]:
# Compare the first 4 characters of name_1 and name_2.
srez = 4  # prefix length ("srez" is Russian for "slice")
name_1_prefix = train['name_1'].str[:srez]
train['4_str'] = name_1_prefix
# Despite its name, `mask` is a DataFrame copy of the rows whose two names share a prefix.
mask = train[name_1_prefix == train['name_2'].str[:srez]].copy()

# Map every distinct prefix to an integer code, numbered in order of first appearance.
unique_prefixes = train['4_str'].unique()
dd = dict(zip(unique_prefixes, np.arange(len(unique_prefixes))))
train['4_str'] = train['4_str'].map(dd)
#for substr in train['4_str'].value_counts():
    

# Catboost

In [None]:
# Feature columns for CatBoost and a single stratified 80% train split.
columns = ['levenstein', 'norm_levenstein', 'jaccard', 'ngramms']
split = StratifiedShuffleSplit(n_splits=1, train_size=0.8, random_state=42)
# Only one split is generated, so take it straight from the iterator.
tridx, cvidx = next(split.split(train[columns], train["is_duplicate"]))

In [None]:
# Training part of the split, wrapped in a CatBoost Pool.
train_part = train.iloc[tridx]
X_train = train_part[columns].values
y_train = train_part['is_duplicate'].values
train_data = Pool(X_train, y_train)

In [None]:
# Validation part of the split, wrapped in a CatBoost Pool.
valid_part = train.iloc[cvidx]
X_valid = valid_part[columns].values
y_valid = valid_part['is_duplicate'].values
valid_data = Pool(X_valid, y_valid)

In [None]:
# CatBoost settings. Options left disabled in this configuration:
# depth=2, auto_class_weights='Balanced', l2_leaf_reg=1e12.
params = dict(
    iterations=3000,
    loss_function="CrossEntropy",
    verbose=False,
    eval_metric="F1",
    random_seed=42,
    learning_rate=0.8,
    use_best_model=True,
)

# Fit on the training pool, evaluating on the validation pool.
model = CatBoostClassifier(**params)
model.fit(train_data, eval_set=valid_data, plot=True)

In [None]:
# Feature importances of the fitted model, one row per feature column.
importances = model.get_feature_importance()
pd.DataFrame(importances, index=columns)

In [None]:
# F1 on the validation part using the model's default class predictions.
preds = model.predict(X_valid)
f1_score(y_true=y_valid, y_pred=preds)

In [None]:
# Sweep decision thresholds over the predicted positive-class probabilities
# and report per-threshold classification metrics on the validation part.
preds = model.predict_proba(X_valid)[:, 1]
thresholds = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.92]
metrics = []
for thr in thresholds:
    labels = (preds > thr).astype(int)
    print('-' * 10, 'THRESHOLD =', thr, '-' * 10)
    print(classification_report(y_valid, labels))
    print()
    metric = f1_score(y_valid, labels)
    metrics.append(metric)

# Plot F1 against the actual threshold values. The thresholds are not evenly
# spaced (0.9 -> 0.92), so plotting against list positions distorts the curve.
plt.plot(thresholds, metrics)
plt.xlabel('threshold')
plt.ylabel('F1');

In [None]:
# Rebuild the training pool from the whole training frame.
X_train, y_train = train[columns].values, train['is_duplicate'].values
train_data = Pool(X_train, y_train)

In [None]:
# This fit is given no eval_set, so use_best_model is switched off first.
params.update(use_best_model=False)
model = CatBoostClassifier(**params)
model.fit(train_data, plot=True)

In [None]:
# Test feature matrix with the same columns, in the same order, as used for training.
X_test = test[columns].to_numpy()

In [None]:
def submit(preds, threshold=0.5, filename='submit.csv', dataset=test):
    """Threshold predicted scores and write them out as a submission CSV.

    Args:
        preds: array of scores, compared element-wise with `threshold`.
        threshold: scores strictly greater than this become label 1, the rest 0.
        filename: path of the CSV file to write.
        dataset: frame whose index supplies the `pair_id` column. The default
            is bound to the module-level `test` object that exists when this
            function is defined.

    Prints the count and share of positive labels (message in Russian), writes
    the CSV without the frame index, then prints 'Done!'.
    """
    is_positive = preds > threshold
    result = pd.DataFrame({'pair_id': dataset.index,
                           'is_duplicate': is_positive.astype(int)})
    # Message: "Number of positive classes for threshold=...: <count> / <share>".
    print(f'Число положительных классов для threshold={threshold}: {result["is_duplicate"].sum()} / {result["is_duplicate"].mean():.2%}')
    result.to_csv(filename, index=False)
    print('Done!')

In [None]:
# Positive-class probabilities on the test set; one submission file per threshold.
preds = model.predict_proba(X_test)[:, 1]
for thr in (0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9):
    filename = f'submit_{thr}.csv'
    submit(preds, threshold=thr, filename=filename, dataset=test)

# CV

In [None]:
def preprocess(X_train, X_test, scaler, vectorizer):
    """Build sparse design matrices from name pairs plus numeric features.

    Both inputs are 2-D arrays addressed by position: column 0 and column 1
    hold the two name strings, columns 2 onward hold numeric features.

    Args:
        X_train: training array; `scaler` and `vectorizer` are fitted on it.
        X_test: array transformed with the objects fitted on `X_train`.
        scaler: object with `fit_transform` / `transform`, applied to columns 2+.
        vectorizer: object with `fit` / `transform`. It is fitted on the two
            training names joined by a space, then applied to each name column.

    Returns:
        (X_train, X_test): sparse matrices whose leading columns are the
        absolute differences between the two names' vectorizer counts and
        whose trailing columns are the scaled numeric features.

    The caller's arrays are not modified (the original wrote the scaled values
    back into them), and `np.float64` replaces the `np.float` alias, which no
    longer exists in current NumPy releases.
    """
    # Scale numeric features: fit on train only, then apply the same transform to test.
    train_numeric = np.asarray(scaler.fit_transform(X_train[:, 2:]), dtype=np.float64)
    test_numeric = np.asarray(scaler.transform(X_test[:, 2:]), dtype=np.float64)

    # Vocabulary comes from the training names only.
    vectorizer.fit(X_train[:, 0] + ' ' + X_train[:, 1])
    X_train_ngramms = abs(vectorizer.transform(X_train[:, 0]) - vectorizer.transform(X_train[:, 1]))
    X_test_ngramms = abs(vectorizer.transform(X_test[:, 0]) - vectorizer.transform(X_test[:, 1]))

    # Count-difference features first, scaled numeric features last.
    X_train = sparse.hstack([X_train_ngramms, sparse.csr_matrix(train_numeric)])
    X_test = sparse.hstack([X_test_ngramms, sparse.csr_matrix(test_numeric)])

    return X_train, X_test

In [None]:
FOLDS = 3

# Stratified shuffle splits over the name columns plus the numeric features.
sss = StratifiedShuffleSplit(n_splits=FOLDS, train_size=0.8, random_state=42)
columns = ['name_1', 'name_2', 'levenstein', 'norm_levenstein', 'jaccard', 'ngramms']
X, y = train[columns].values, train['is_duplicate'].values

vectorizer = CountVectorizer(analyzer='char', ngram_range=(1,5), max_features=5000)
scaler = StandardScaler()
# Inverse regularisation strengths to compare.
Cs = [1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 5e2]
# F1 table: one row per fold, one column per value of C.
result = pd.DataFrame()
fold = 1
for train_index, test_index in tqdm(sss.split(X, y), total=FOLDS):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # The scaler and vectorizer are refitted on the training part of each fold.
    X_train, X_test = preprocess(X_train, X_test, scaler, vectorizer)

    for c in Cs:
        lr = LogisticRegression(C=c, class_weight='balanced', random_state=42, n_jobs=-1)
        lr.fit(X_train, y_train)
        preds = lr.predict(X_test)
        score = f1_score(y_test, preds)
        result.loc[fold, c] = score
    fold += 1

result.columns.name = 'C'
result.index.name = 'Fold'
result

In [None]:
# Display the cross-validation F1 table again (rows are folds, columns are values of C).
result

# Submit

In [None]:
columns = ['name_1', 'name_2', 'levenstein', 'norm_levenstein', 'jaccard', 'ngramms']
X_train = train[columns].values
y_train = train['is_duplicate'].values

# Select the same columns, in the same order, as for train. preprocess()
# addresses features by position (0-1 are names, 2+ are numeric), so taking
# test.values as-is would silently misalign the features if the test frame
# ever carried extra or reordered columns.
X_test = test[columns].values

X_train, X_test = preprocess(X_train, X_test, scaler, vectorizer)

In [None]:
# Final logistic regression on the full preprocessed training matrix.
model = LogisticRegression(C=100, class_weight='balanced', random_state=42,
                           verbose=True, n_jobs=-1)
model = model.fit(X_train, y_train)

In [None]:
# Positive-class probabilities from the logistic regression; one file per threshold.
preds = model.predict_proba(X_test)[:, 1]
# NOTE(review): these names follow the same submit_{thr}.csv pattern as the
# CatBoost submissions written earlier in the notebook, so files for shared
# thresholds are overwritten - confirm this is intended.
for thr in (0.5, 0.6, 0.7, 0.8, 0.9):
    filename = f'submit_{thr}.csv'
    submit(preds, threshold=thr, filename=filename)