In [1]:
import datetime, itertools, sys
import pandas as ps
from subprocess import Popen, PIPE
from multiprocessing import Pool
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report
from scipy.cluster.hierarchy import ward, fcluster
import matplotlib.pyplot as plt
from pymystem3 import Mystem
import nltk

In [2]:
data = ps.read_csv("data/spelled-cl.csv", sep=';', header=None,
                   index_col=0,names=['id','title','text','cluster','date','publisher'])
data = data[~data["cluster"].isin(["-", "S", "Standard "])]
print("Число записей в таблице:", len(data))

Число записей в таблице: 32317


## Подбор оптимального порога для каждого дня

In [2]:
def do_stem(df):
    mystem = Mystem()
    messages = ["".join(mystem.lemmatize(row["title"] + ". " + row["text"])) for _, row in df.iterrows()]
    nouns_sets = [get_message_nouns(msg, mystem) for msg in messages]
    clusters = [row["cluster"] for _, row in df.iterrows()]
    mystem.close()
    return messages, nouns_sets, clusters

In [3]:
stop_words = set(nltk.corpus.stopwords.words('russian'))

def get_message_nouns(msg, mystem):
    nouns = set()
    words = mystem.analyze(msg)
    for w in words:
        if "analysis" not in w:
            continue
        if w["analysis"] and "S" in w["analysis"][0]["gr"]:
            lex = w["analysis"][0]["lex"]
            if lex and len(lex) > 2 and lex not in stop_words:
                nouns.add(lex)
        elif not w["analysis"] and len(w["text"]) > 2:
            nouns.add(w["text"].lower())
    return nouns

In [4]:
def get_optimal_threshold(data_1d, clsize=None):
    if type(data_1d) is tuple:
        data_1d, clsize = data_1d
    messages, noun_sets, clusters = do_stem(data_1d)
    tfidf_vectorizer = TfidfVectorizer(min_df=2, max_features=10000,
        stop_words=nltk.corpus.stopwords.words('russian'))
    tfidf_matrix = tfidf_vectorizer.fit_transform(messages)
    dist_matrix = 1 - cosine_similarity(tfidf_matrix)
    linkage_matrix = ward(dist_matrix)
    th, res = optimise_threshold(linkage_matrix, noun_sets, clusters, clsize)
    return th, res

In [5]:
def optimise_threshold(linkage_matrix, noun_sets, clusters, clsize=None):
    scores = {}
    for i in range(10, 21):
        th = i/10
        labels = get_labels(linkage_matrix, noun_sets, clsize, th)
        res = marks_to_pairwise(clusters, labels)
        _, _, fscore = get_prec_recall_f(res, 1)
        fscore = int(fscore*10000)/10000
        scores[th] = fscore
    fscore_m, th_m = max((f, th) for th, f in scores.items())
    max_fscores = sorted([(f, th) for th, f in scores.items() if f == fscore_m])
    th_m = max_fscores[len(max_fscores)//2][1]
    for i in range(int(th_m*100) - 9, int(th_m*100) + 10):
        th = i/100
        labels = get_labels(linkage_matrix, noun_sets, clsize, th)
        res = marks_to_pairwise(clusters, labels)
        _, _, fscore = get_prec_recall_f(res, 1)
        fscore = int(fscore*10000)/10000
        scores[th] = fscore
    fscore_m, th_m = max((f, th) for th, f in scores.items())
    max_fscores = sorted([(f, th) for th, f in scores.items() if f == fscore_m])
    th_m = max_fscores[len(max_fscores)//2][1]
    return th_m, marks_to_pairwise(clusters, get_labels(linkage_matrix, noun_sets, clsize, th_m))

In [6]:
def marks_to_pairwise(y_cls, p_cls):
    assert len(y_cls) == len(p_cls)
    res = {"ids": [], "y": [], "p": []}
    for i1, i2 in itertools.combinations(range(len(y_cls)), 2):
        res["ids"].append(sorted((i1, i2)))
        res["y"].append(bool(y_cls[i1] == y_cls[i2]))
        res["p"].append(bool(p_cls[i1] == p_cls[i2]))
    return ps.DataFrame(res, index=None)

In [7]:
def get_prec_recall_f(res, cls):
    pred = res[res["p"] == cls]
    if len(pred) == 0:
        return 0, 0, 0
    prec = len(pred[pred["y"] == cls]) / len(pred)
    act = res[res["y"] == cls]
    if len(act) == 0:
        return 0, 0, 0
    rec = len(act[act["p"] == cls]) / len(act)
    return prec, rec, 2/(1/prec + 1/rec)

In [8]:
def get_labels(linkage_matrix, noun_sets, max_cls_size=None, cutoff=None):
    n_samples = len(noun_sets)
    clusters = {i: [i] for i in range(n_samples)}
    for i, row in enumerate(linkage_matrix):
        c1, c2, dist, cls_size = row
        c1, c2 = int(c1), int(c2)
        if cutoff and dist > cutoff:
            break
        if not clusters[c1] or not clusters[c2]:
            clusters[n_samples+i] = []
            continue
        
        all_indexes = clusters[c1] + clusters[c2]
        all_nouns = [noun_sets[i] for i in all_indexes]
        common_nouns = all_nouns[0]
        for nouns in all_nouns[1:]:
            common_nouns &= nouns
        if not common_nouns:
            clusters[n_samples+i] = []
            continue
        
        if not (max_cls_size and cls_size > max_cls_size):
            clusters[n_samples+i] = clusters[c1] + clusters[c2]
            del clusters[c1]
            del clusters[c2]
        else:
            clusters[n_samples+i] = []
    
    labels = [None] * n_samples
    for cls_num, objects in clusters.items():
        for o in objects:
            labels[o] = cls_num
    return labels

In [9]:
def cross_class_report(res):
    classes = res["y"].unique()
    table = ps.DataFrame(index=classes, columns=classes)
    for true_cls in classes:
        tmp = res[res["y"] == true_cls]
        for pred_cls in classes:
            table[pred_cls][true_cls] = len(tmp[tmp["p"] == pred_cls])
    return table

In [10]:
def multiprocess_optimise_th_for_days(data, days, clsize=None):
    pool = Pool()
    day_data = [data[data["date"] == day] for day in days]
    th_res = pool.map(get_optimal_threshold, [(d, clsize) for d in day_data])
    pool.close()
    results = {day: (th, res) for (th, res), day in zip(th_res, days)}
    return results

In [12]:
day_msg_count = {day: len(df) for day, df in data.groupby("date")}
all_days = [day for day, cnt in day_msg_count.items() if 10 < cnt < 1000]
print("Число дней:", len(all_days))

Число дней: 41


In [13]:
results = multiprocess_optimise_th_for_days(data, all_days)
for day in sorted(results.keys(), key=lambda x: day_msg_count[x]):
    th, res = results[day]
    print("День: {} ({} сообщений)".format(day, day_msg_count[day]))
    print("Порог:", th)
    print(classification_report(res["y"], res["p"]))
    print(cross_class_report(res), "\n\n")

День: 2015-10-29 (11 сообщений)
Порог: 1.2
             precision    recall  f1-score   support

      False       0.94      1.00      0.97        48
       True       1.00      0.57      0.73         7

avg / total       0.95      0.95      0.94        55

      False True 
False    48     0
True      3     4 


День: 2015-11-29 (15 сообщений)
Порог: 1.0
             precision    recall  f1-score   support

      False       0.97      1.00      0.98        96
       True       1.00      0.67      0.80         9

avg / total       0.97      0.97      0.97       105

      False True 
False    96     0
True      3     6 


День: 2015-01-23 (17 сообщений)
Порог: 1.2
             precision    recall  f1-score   support

      False       1.00      0.98      0.99       132
       True       0.67      1.00      0.80         4

avg / total       0.99      0.99      0.99       136

      False True 
False   130     2
True      0     4 


День: 2015-08-28 (17 сообщений)
Порог: 1.5
            

In [11]:
data_cl = ps.read_csv("data/day-1432-cl.csv", sep=';', header=None,
                   index_col=0,names=['id','title','text','cluster','date','publisher'])
th, res = get_optimal_threshold(data_cl)
print("Порог:", th)
print(classification_report(res["y"], res["p"]))
print(cross_class_report(res), "\n\n")

Порог: 1.76
             precision    recall  f1-score   support

      False       1.00      1.00      1.00   1014749
       True       0.63      0.77      0.69      1276

avg / total       1.00      1.00      1.00   1016025

      True     False
True    981      295
False   578  1014171 




In [14]:
res = get_clusterization_results(data_cl, 1.5)
print(classification_report(res["y"], res["p"]))
print(cross_class_report(res), "\n\n")

             precision    recall  f1-score   support

      False       1.00      1.00      1.00   1014749
       True       0.76      0.61      0.67      1276

avg / total       1.00      1.00      1.00   1016025

      True     False
True    777      499
False   250  1014499 




## Многопроцессная обработка с фиксированным порогом

In [11]:
def multiprocess_apply_th(data, days, th=None, clsize=None):
    pool = Pool()
    day_data = [data[data["date"] == day] for day in days]
    results = pool.map(get_clusterization_results, [(d, th, clsize) for d in day_data])
    pool.close()
    return {day: res for day, res in zip(days, results)}

In [13]:
def get_clusterization_results(data_1d, th=None, clsize=None):
    if type(data_1d) is tuple:
        data_1d, th, clsize = data_1d
    messages, noun_sets, clusters = do_stem(data_1d)
    tfidf_vectorizer = TfidfVectorizer(min_df=2, max_features=10000,
        stop_words=nltk.corpus.stopwords.words('russian'))
    tfidf_matrix = tfidf_vectorizer.fit_transform(messages)
    dist_matrix = 1 - cosine_similarity(tfidf_matrix)
    linkage_matrix = ward(dist_matrix)
    labels = get_labels(linkage_matrix, noun_sets, clsize, th)
    res = marks_to_pairwise(clusters, labels)
    return res

In [13]:
results = multiprocess_apply_th(data, all_days, th=2.0)
for day in sorted(results.keys(), key=lambda x: day_msg_count[x]):
    res = results[day]
    print("День: {} ({} сообщений)".format(day, day_msg_count[day]))
    print(classification_report(res["y"], res["p"]))
    print(cross_class_report(res), "\n\n")

NameError: name 'data' is not defined

## Выбор дней из полной выборки

In [14]:
data_all = ps.read_csv("data/120k-utf8.csv", sep=';', header=None,
                   index_col=0,names=['id','title','text','cluster','date','publisher'])
data_all = data_all[~data_all["cluster"].isin(["S", "Standard "])]
print("Число записей в таблице:", len(data_all))

Число записей в таблице: 117751


In [15]:
no_cluster_index = 0
for i, ind in enumerate(data_all[data_all["cluster"] == "-"].index):
    data_all.loc[ind]["cluster"] = "NONE_{}".format(i)

In [16]:
days_cnt = {}
for day in data_all["date"].unique():
    cnt = (data_all["date"] == day).value_counts()[True]
    if 1000 < cnt < 5000:
        days_cnt[day] = cnt
for day, cnt in sorted(days_cnt.items(), key=lambda x: x[1]):
    print(day, cnt)

2015-06-25 1120
2015-03-25 1127
2015-05-27 1137
2015-03-29 1145
2015-02-25 1158
2015-02-26 1159
2015-05-25 1164
2015-05-29 1183
2015-02-27 1191
2015-05-26 1212
2015-02-28 1432
2015-05-30 1465
2016-01-28 4146


In [18]:
days = ["2015-02-28", "2016-01-28"]
results = multiprocess_apply_th(data_all, days, th=2.0)
for day in sorted(results.keys(), key=lambda x: days_cnt[x]):
    res = results[day]
    print("День: {} ({} сообщений)".format(day, days_cnt[day]))
    print(classification_report(res["y"], res["p"]))
    print(cross_class_report(res), "\n\n")

Process ForkPoolWorker-7:
Process ForkPoolWorker-5:
Process ForkPoolWorker-8:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/queues.py", line 343, in get
    res = self._reader.recv_bytes()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._ar

KeyboardInterrupt: 

  File "/usr/lib/python3.5/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt
KeyboardInterrupt


In [17]:
days = ["2015-02-28", "2016-01-28"]
results = multiprocess_apply_th(data_all, days, th=3.0)
for day in sorted(results.keys(), key=lambda x: days_cnt[x]):
    res = results[day]
    print("День: {} ({} сообщений)".format(day, days_cnt[day]))
    print(classification_report(res["y"], res["p"]))
    print(cross_class_report(res), "\n\n")

День: 2015-02-28 (1432 сообщений)
             precision    recall  f1-score   support

      False       1.00      1.00      1.00   1024181
       True       0.10      0.87      0.18       415

avg / total       1.00      1.00      1.00   1024596

         False True 
False  1020935  3246
True        56   359 


День: 2016-01-28 (4146 сообщений)
             precision    recall  f1-score   support

      False       1.00      1.00      1.00   8591695
       True       0.07      0.83      0.14       890

avg / total       1.00      1.00      1.00   8592585

         False True 
False  8582598  9097
True       154   736 




## Анализ ошибок кластеризации

In [18]:
def export_clusterization_report(data_1d, cutoff, filename):
    messages, noun_sets, clusters = do_stem(data_1d)
    tfidf_vectorizer = TfidfVectorizer(min_df=2, max_features=10000,
        stop_words=nltk.corpus.stopwords.words('russian'))
    tfidf_matrix = tfidf_vectorizer.fit_transform(messages)
    dist_matrix = 1 - cosine_similarity(tfidf_matrix)
    linkage_matrix = ward(dist_matrix)
    labels = get_labels(linkage_matrix, noun_sets, cutoff=cutoff)
    
    res = marks_to_pairwise(clusters, labels)
    print("ДЕНЬ: {}\nСообщений: {}".format(data_1d.iloc[0]["date"], len(data_1d)))
    print(classification_report(res["y"], res["p"]))
    print(cross_class_report(res), "\n\n")
    
    texts = [row["title"] + ". " + row["text"] for _, row in data_1d.iterrows()]
    results = ps.DataFrame({"message": texts, "y": clusters, "p": labels}, index=None)
    with open(filename, "w") as f:
        for _, df in results.groupby("p"):
            if len(df["y"].unique()) == 1:
                continue
            df = df.sort_values("y")
            cluster_letter = {c: l for c, l in zip(sorted(df["y"].unique()), "абвгдежзиклмно")}
            f.write("Участвующие кластеры:\n")
            for c, l in sorted(cluster_letter.items(), key=lambda x: x[0]):
                f.write("  {}) '{}'\n".format(l, c))
            f.write("Сообщения:\n")
            for _, row in df.iterrows():
                f.write("  ({}) {}\n".format(cluster_letter[row["y"]], row["message"]))
            f.write("\n")

In [20]:
days = ["2015-02-28", "2016-01-28"]
for day in days:
    export_clusterization_report(data_all[data_all["date"] == day], 1.75,
                                 "reports-full/report-full-{}.txt".format(day))

ДЕНЬ: 2015-02-28
Сообщений: 1432
             precision    recall  f1-score   support

      False       1.00      1.00      1.00   1024181
       True       0.20      0.77      0.32       415

avg / total       1.00      1.00      1.00   1024596

         False True 
False  1022937  1244
True        97   318 


ДЕНЬ: 2016-01-28
Сообщений: 4146
             precision    recall  f1-score   support

      False       1.00      1.00      1.00   8591695
       True       0.24      0.68      0.36       890

avg / total       1.00      1.00      1.00   8592585

         False True 
False  8589801  1894
True       287   603 




In [23]:
for day in days:
    data_1d = data_all[data_all["date"] == day].copy()
    data_1d.sort_values("cluster", inplace=True)
    data_1d.to_csv("data/day-{}.csv".format(len(data_1d)), sep=';', header=None)

In [19]:
for filename in ["day-1432.csv", "day-4146.csv"]:
    data_1d = ps.read_csv("data/" + filename, sep=';', header=None,
                   index_col=0,names=['id','title','text','cluster','date','publisher'])
    export_clusterization_report(data_1d, 1.75,
                                 "reports-full/report-{}.txt".format(filename.split(".")[0]))

ДЕНЬ: 2015-02-28
Сообщений: 1432
             precision    recall  f1-score   support

      False       1.00      1.00      1.00   1024181
       True       0.20      0.77      0.32       415

avg / total       1.00      1.00      1.00   1024596

      True     False
True    318       97
False  1244  1022937 


ДЕНЬ: 2016-01-28
Сообщений: 4146
             precision    recall  f1-score   support

      False       1.00      1.00      1.00   8591695
       True       0.24      0.68      0.36       890

avg / total       1.00      1.00      1.00   8592585

         False True 
False  8589801  1894
True       287   603 


