In [1]:
import codecs
import pandas as pd
import pylab as pl
import numpy as np
import re
import scipy.spatial as ss
import sklearn.cluster as sc
import sklearn.manifold as sm
import sklearn.metrics as smt

from bs4 import BeautifulSoup
from collections import Counter
from nltk.corpus import stopwords
from pymystem3 import Mystem
from sklearn.base import BaseEstimator, ClusterMixin


In [2]:
stopwords = stopwords.words('russian')
mystem = Mystem()
corpus = ['']
for i in range(1, 28027):
    filename = str(i)+'.dat'
    with codecs.open('content/' + filename, 'r', 'utf-8') as f:
        url = f.readline().strip()
        html = BeautifulSoup(f)
        corpus.append(' '.join([mystem.lemmatize(word)[0] for word in re.sub(r'\n|[^а-я]',' ', re.sub('([А-Я]{1})', r' \1', html.title.text).lower()).split()  if word not in stopwords and len(word) > 1]))
        print(f'\r{i}/{28026}', end='', flush=True)


28026/28026


In [3]:
train_data = pd.read_csv('anomaly-detection-competition-ml1-ts-spring-2020/train_groups.csv')
traingroups_data = {}
for i in range(len(train_data)):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    text = corpus[doc_id]
    if doc_group not in traingroups_titledata:
        traingroups_data[doc_group] = []
    traingroups_data[doc_group].append((doc_id, text, target))
test_data = pd.read_csv('anomaly-detection-competition-ml1-ts-spring-2020/test_groups.csv')
all_data = train_data.merge(test_data, 'outer')


In [4]:
count = Counter()
size = len(corpus)
for i, text in enumerate(corpus):
    count.update(set(text.split()))
    print(f'\r{i + 1}/{len(corpus)}', end='', flush=True)


28026/28026


In [5]:
idf = {}
for i, (k, v) in enumerate(count.items()):
    idf[k] = np.log10(size / v)
    print(f'\r{i + 1}/{len(count)}', end='', flush=True)


27346/27346


In [6]:
tf_idf = {}
for doc, text in enumerate(corpus):
    tf = Counter(text.split())
    tf_idf[doc] = {k: v*idf[k] for k, v in tf.items()}
    print(f'\r{doc}/{28026}', end='', flush=True)


28026/28026


In [7]:
vocab = set()
for i, title in enumerate(corpus):
    vocab = vocab.union(title.split())
    print(f'\r{i+1}/{len(corpus)}', end='', flush=True)


28026/28026


In [8]:
tokens = {}
for i, word in enumerate(vocab):
    tokens[word] = i
    print(f'\r{i+1}/{len(vocab)}', end='', flush=True)


27346/27346


In [9]:
docs = np.array(list(range(len(corpus)-1)))
T = np.zeros(((len(docs)+1), len(vocab)))
for doc_id, vect in enumerate(T):
    if doc_id == 0:
        continue
    for k, v in tf_idf[doc_id].items():
        vect[tokens[k]] = v
    print(f'\r{doc_id}/{len(docs)}', end='', flush=True)


28026/28026


In [10]:
class Clustering(BaseEstimator, ClusterMixin):

def __init__(self, metric='euclidean', linkage='complete', **kwargs):
    self.metric = metric
    self.linkage = linkage

def fit(self, x):
    D = ss.distance.squareform(ss.distance.pdist(x, metric=self.metric))
    D_max = np.ceil(D.max())
    np.fill_diagonal(D, D_max)
    c_s = np.ones(D.shape[0], dtype='int64')
    path = []
    link = []
    history = []
    new_cl = np.arange(len(x))
    for k in range(len(x) - 1):
        ind = np.unravel_index(D.argmin(), D.shape)
        c_1, c_2 = ind
        if self.linkage == 'single':
            a_i, a_j, b, c = 1/2, 1/2, 0, -1/2
        elif self.linkage == 'complete':
            a_i, a_j, b, c = 1/2, 1/2, 0, 1/2
        elif self.linkage == 'average':
            a_i, a_j, b, c = c_s[c_1] / (c_s[c_1] + c_s[c_2]), c_s[c_2] / (c_s[c_1] + c_s[c_2]), 0, 0
        new_d = a_i * D[c_1,:] + a_j * D[c_2,:] + b * D[c_1, c_2] + c * np.abs(D[c_1,:] - D[c_2,:])
        D[c_1], D[:, c_1] = new_d, new_d
        D[c_2], D[:, c_2], D[c_1, c_1] = D_max, D_max, D_max
        c_s[c_1] += c_s[c_2].copy()
        c_s[c_2] = 0
        path.append([c_1, c_2])
        link.append([min(new_cl[c_1], new_cl[c_2]), max(new_cl[c_1], new_cl[c_2]), D[ind], c_s[c_1]])
        history.append([c_1, c_2, dist, c_s[c_1]])
        new_cl[[c_1,c_2]] = len(x) + k
    self.path = np.array(path)
    self.link = np.array(link)
    self.history = np.array(history)


In [11]:
groups = all_data.group_id.unique()
size = len(groups)
story = {}
preds = []
for g, gr in enumerate(groups):
    docs = all_data.doc_id[all_data.group_id == gr]
    X = T[docs]
    norm = np.linalg.norm(X, axis=1)
    mask = np.isclose(norm, 0)
    norm[mask] = 1
    X = X / norm.reshape(-1,1)
    X[mask] = 0
    kmeans = KMeans(n_clusters=2)
    result = kmeans.fit_predict(X)
    ind = np.argmax(np.bincount(result))
    y_pred = np.zeros(len(X))
    y_pred[result == ind] = 1
    preds += list(y_pred)
    print(f'{(g+1)}/{size} Complete', end='', flush=True)


309/309


In [12]:
preds = np.array(preds)
target = all_data.target[all_data.group_id<130]
print(f1_score(target, 1-preds[:11690]))


0.42169408897014016


In [13]:
output = pd.DataFrame({'pair_id': test_data.pair_id, 'target': 1-preds[11690:]})
output.to_csv('submission.csv', index=False)
print('Your submission was successfully saved!')


Your submission was successfully saved!
