In [1]:
import datetime, itertools, sys
from functools import partial
import pandas as ps
from subprocess import Popen, PIPE
from multiprocessing import Pool
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report
from scipy.cluster.hierarchy import ward, fcluster
import matplotlib.pyplot as plt
from pymystem3 import Mystem
import nltk

In [2]:
# Read the semicolon-separated CSV; it has no header row, so column names are
# supplied explicitly and the first column ('id') becomes the index.
data = ps.read_csv(
    "data/spelled-cl.csv",
    sep=';',
    header=None,
    index_col=0,
    names=['id', 'title', 'text', 'cluster', 'date', 'publisher'],
)
# Exclude rows whose cluster value is "-", "S" or "Standard ".
data = data.loc[~data["cluster"].isin(["-", "S", "Standard "])]
print("Число записей в таблице:", data.shape[0])

Число записей в таблице: 32317


In [3]:
# Select the rows dated 2015-05-30 and show how many there are.
data_1d = data.loc[data["date"] == "2015-05-30"]
print(data_1d.shape[0])

523


## Подбор оптимального порога для каждого дня

In [2]:
def do_stem(df):
    """Lemmatize every message of ``df`` and collect its cluster marks.

    Each message is built as ``title + ". " + text`` and passed through
    ``Mystem.lemmatize``; the returned pieces are joined back into one string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "title", "text" and "cluster" columns.

    Returns
    -------
    (list, list)
        The lemmatized messages and the "cluster" values, both in row order.
    """
    mystem = Mystem()
    try:
        messages = [
            "".join(mystem.lemmatize(row["title"] + ". " + row["text"]))
            for _, row in df.iterrows()
        ]
    finally:
        # Always release the analyzer, even if lemmatization fails part-way.
        mystem.close()
    clusters = df["cluster"].tolist()
    return messages, clusters

In [3]:
def get_optimal_threshold(data_1d, clsize=None):
    """Cluster one frame of messages and pick the best cut-off threshold.

    The messages are lemmatized, vectorized with TF-IDF, turned into a
    cosine-distance matrix and linked with Ward's method; the threshold search
    itself is delegated to ``optimise_threshold``.

    Parameters
    ----------
    data_1d : pandas.DataFrame or tuple
        The messages to cluster, or a ``(frame, clsize)`` tuple so the function
        can be called with a single packed argument.
    clsize : int, optional
        Forwarded to ``optimise_threshold``; replaced by the tuple's second
        element when ``data_1d`` is a tuple.

    Returns
    -------
    tuple
        Whatever ``optimise_threshold`` returns (threshold and pairwise result).
    """
    if type(data_1d) is tuple:
        data_1d, clsize = data_1d

    messages, clusters = do_stem(data_1d)

    # Frames with more than 100 rows get a max_df of 0.1; smaller ones use 1.0.
    max_df = 0.1 if len(data_1d) > 100 else 1.0

    russian_stop_words = nltk.corpus.stopwords.words('russian')
    vectorizer = TfidfVectorizer(
        min_df=2,
        max_features=10000,
        max_df=max_df,
        stop_words=russian_stop_words,
    )
    distances = 1 - cosine_similarity(vectorizer.fit_transform(messages))
    return optimise_threshold(ward(distances), clusters, clsize)

In [4]:
def optimise_threshold(linkage_matrix, clusters, clsize=None):
    """Search for the cut-off threshold with the best pairwise F-score.

    A coarse scan tries thresholds 1.0 .. 2.0 in steps of 0.1; a fine scan then
    tries steps of 0.01 within +/-0.09 of the coarse winner. Scores are
    truncated to four decimal places before comparison, and when several
    thresholds share the best score the middle one (by threshold value) wins.

    Parameters
    ----------
    linkage_matrix : sequence
        Linkage rows passed through to ``get_labels``.
    clusters : sequence
        Reference cluster marks, one per sample.
    clsize : int, optional
        Maximum cluster size passed through to ``get_labels``.

    Returns
    -------
    (float, pandas.DataFrame)
        The chosen threshold and the pairwise table for that threshold.
    """
    n_samples = len(clusters)
    scores = {}

    def pairwise_for(th):
        # Pairwise table for the labelling obtained at threshold ``th``.
        return marks_to_pairwise(clusters, get_labels(linkage_matrix, n_samples, clsize, th))

    def evaluate(th):
        # Record the truncated F-score of the positive class for ``th``.
        _, _, fscore = get_prec_recall_f(pairwise_for(th), 1)
        scores[th] = int(fscore * 10000) / 10000

    def median_of_best():
        # Middle threshold among all thresholds that reach the top score.
        top = max(scores.values())
        tied = sorted(th for th, f in scores.items() if f == top)
        return tied[len(tied) // 2]

    # Coarse scan.
    for step in range(10, 21):
        evaluate(step / 10)
    th_m = median_of_best()

    # Fine scan around the coarse winner; earlier scores stay in the pool.
    centre = int(th_m * 100)
    for step in range(centre - 9, centre + 10):
        evaluate(step / 100)
    th_m = median_of_best()

    return th_m, pairwise_for(th_m)

In [5]:
def marks_to_pairwise(y_cls, p_cls):
    """Convert two per-sample labellings into a per-pair agreement table.

    For every unordered pair of sample positions the table records whether the
    two samples share a label in ``y_cls`` and whether they share one in
    ``p_cls``.

    Parameters
    ----------
    y_cls, p_cls : sequence
        Labels indexed by position; both must have the same length.

    Returns
    -------
    pandas.DataFrame
        Columns "ids" (``[i, j]`` with ``i < j``), "y" and "p" (booleans), one
        row per pair.
    """
    assert len(y_cls) == len(p_cls)
    pair_ids, same_true, same_pred = [], [], []
    for first, second in itertools.combinations(range(len(y_cls)), 2):
        pair_ids.append(sorted((first, second)))
        same_true.append(bool(y_cls[first] == y_cls[second]))
        same_pred.append(bool(p_cls[first] == p_cls[second]))
    table = {"ids": pair_ids, "y": same_true, "p": same_pred}
    return ps.DataFrame(table, index=None)

In [6]:
def get_prec_recall_f(res, cls):
    """Compute precision, recall and F1 of class ``cls`` in a pairwise table.

    Parameters
    ----------
    res : pandas.DataFrame
        Table with a "y" (reference) and a "p" (predicted) column.
    cls : object
        The class value to score; rows are selected with ``== cls``.

    Returns
    -------
    (number, number, number)
        ``(precision, recall, f1)``. ``(0, 0, 0)`` is returned when nothing is
        predicted as ``cls``, when nothing actually is ``cls``, or when the two
        sets do not overlap (the harmonic mean would otherwise divide by zero).
    """
    pred = res[res["p"] == cls]
    act = res[res["y"] == cls]
    if len(pred) == 0 or len(act) == 0:
        return 0, 0, 0

    # Rows that are both predicted and actually ``cls``.
    hits = len(pred[pred["y"] == cls])
    if hits == 0:
        # Precision and recall are both zero here; 1/prec would raise.
        return 0, 0, 0

    prec = hits / len(pred)
    rec = hits / len(act)
    return prec, rec, 2 / (1 / prec + 1 / rec)

In [7]:
def get_labels(linkage_matrix, n_samples, max_cls_size=None, cutoff=None):
    """Turn a linkage matrix into flat cluster labels.

    Rows are replayed in order; row ``i`` merges clusters ``c1`` and ``c2``
    into a new cluster with id ``n_samples + i``.

    Parameters
    ----------
    linkage_matrix : iterable
        Rows of ``(c1, c2, dist, cls_size)``.
    n_samples : int
        Number of original samples; they start as singleton clusters
        ``0 .. n_samples - 1``.
    max_cls_size : int, optional
        Merges whose ``cls_size`` exceeds this value are skipped. ``None``
        (or 0) means no size limit.
    cutoff : float, optional
        Replay stops at the first row whose ``dist`` exceeds this value.
        ``None`` disables the cut-off; ``0`` is a valid cut-off that only
        allows zero-distance merges.

    Returns
    -------
    list
        ``labels[i]`` is the id of the cluster that sample ``i`` ends up in.
    """
    clusters = {i: [i] for i in range(n_samples)}
    for step, (left, right, dist, cls_size) in enumerate(linkage_matrix):
        # Explicit None test so that cutoff=0 is honoured instead of ignored.
        if cutoff is not None and dist > cutoff:
            # Assumes rows are ordered by non-decreasing distance — TODO confirm.
            break
        if max_cls_size and cls_size > max_cls_size:
            continue
        # Linkage rows may carry ids as floats; use plain int keys.
        left, right = int(left), int(right)
        clusters[n_samples + step] = clusters.pop(left) + clusters.pop(right)

    labels = [None] * n_samples
    for cls_num, members in clusters.items():
        for member in members:
            labels[member] = cls_num
    return labels

In [8]:
def cross_class_report(res):
    """Build a table of counts of reference class versus predicted class.

    Parameters
    ----------
    res : pandas.DataFrame
        Table with a "y" (reference) and a "p" (predicted) column.

    Returns
    -------
    pandas.DataFrame
        Object-dtype frame whose rows are reference classes and whose columns
        are predicted classes; each cell holds the number of matching rows.
        Both axes use the distinct values of the "y" column only, in order of
        first appearance.
    """
    classes = res["y"].unique()
    rows = []
    for true_cls in classes:
        predicted = res.loc[res["y"] == true_cls, "p"]
        rows.append([int((predicted == pred_cls).sum()) for pred_cls in classes])
    # Construct the frame in one go rather than filling it cell by cell through
    # chained indexing, which pandas does not guarantee to write through.
    return ps.DataFrame(rows, index=classes, columns=classes, dtype=object)

In [9]:
def multiprocess_optimise_th_for_days(days, clsize=None):
    """Run ``get_optimal_threshold`` for several days in a process pool.

    Reads the module-level ``data`` frame and selects each day's rows by its
    "date" column.

    Parameters
    ----------
    days : iterable
        Date values to process.
    clsize : int, optional
        Passed to ``get_optimal_threshold`` together with each day's frame.

    Returns
    -------
    dict
        Maps each day to the ``(threshold, pairwise result)`` returned for it.
    """
    # Materialise once: ``days`` is iterated twice below.
    days = list(days)
    tasks = [(data[data["date"] == day], clsize) for day in days]
    # The context manager shuts the pool down even if a worker raises.
    with Pool() as pool:
        th_res = pool.map(get_optimal_threshold, tasks)
    return {day: (th, res) for (th, res), day in zip(th_res, days)}

In [11]:
# Number of messages per date, then the dates with more than 10 and fewer
# than 1000 messages.
day_msg_count = dict((day, len(df)) for day, df in data.groupby("date"))
all_days = [day for day in day_msg_count if 10 < day_msg_count[day] < 1000]
print("Число дней:", len(all_days))

Число дней: 41


In [12]:
# Optimise the threshold for every selected day with a maximum cluster size of 3,
# then report the days in order of increasing message count.
results = multiprocess_optimise_th_for_days(all_days, 3)
for day, (th, res) in sorted(results.items(), key=lambda item: day_msg_count[item[0]]):
    print("День: {} ({} сообщений)".format(day, day_msg_count[day]))
    print("Порог:", th)
    print(classification_report(res["y"], res["p"]))
    print(cross_class_report(res), "\n\n")

День: 2015-10-29 (11 сообщений)
Порог: 1.3
             precision    recall  f1-score   support

      False       0.96      1.00      0.98        48
       True       1.00      0.71      0.83         7

avg / total       0.97      0.96      0.96        55

      False True 
False    48     0
True      2     5 


День: 2015-11-29 (15 сообщений)
Порог: 1.06
             precision    recall  f1-score   support

      False       0.98      0.99      0.98        96
       True       0.88      0.78      0.82         9

avg / total       0.97      0.97      0.97       105

      False True 
False    95     1
True      2     7 


День: 2015-08-28 (17 сообщений)
Порог: 1.5
             precision    recall  f1-score   support

      False       1.00      1.00      1.00       123
       True       1.00      1.00      1.00        13

avg / total       1.00      1.00      1.00       136

      False True 
False   123     0
True      0    13 


День: 2015-01-23 (17 сообщений)
Порог: 1.1
           

In [19]:
# Same report as above but without a cluster-size limit; days are listed in
# order of increasing message count.
results = multiprocess_optimise_th_for_days(all_days)
for day, (th, res) in sorted(results.items(), key=lambda item: day_msg_count[item[0]]):
    print("День: {} ({} сообщений)".format(day, day_msg_count[day]))
    print("Порог:", th)
    print(classification_report(res["y"], res["p"]))
    print(cross_class_report(res), "\n\n")

День: 2015-10-29 (11 сообщений)
Порог: 1.3
             precision    recall  f1-score   support

      False       0.96      1.00      0.98        48
       True       1.00      0.71      0.83         7

avg / total       0.97      0.96      0.96        55

      False True 
False    48     0
True      2     5 


День: 2015-11-29 (15 сообщений)
Порог: 1.06
             precision    recall  f1-score   support

      False       0.98      0.99      0.98        96
       True       0.88      0.78      0.82         9

avg / total       0.97      0.97      0.97       105

      False True 
False    95     1
True      2     7 


День: 2015-08-28 (17 сообщений)
Порог: 1.5
             precision    recall  f1-score   support

      False       1.00      1.00      1.00       123
       True       1.00      1.00      1.00        13

avg / total       1.00      1.00      1.00       136

      False True 
False   123     0
True      0    13 


День: 2015-01-23 (17 сообщений)
Порог: 1.1
           

In [10]:
# Evaluate a single separately stored file with the same column layout.
data_cl = ps.read_csv(
    "data/day-1432-cl.csv",
    sep=';',
    header=None,
    index_col=0,
    names=['id', 'title', 'text', 'cluster', 'date', 'publisher'],
)
th, res = get_optimal_threshold(data_cl)
print("Порог:", th)
print(classification_report(res["y"], res["p"]))
print(cross_class_report(res), "\n\n")

Порог: 1.44
             precision    recall  f1-score   support

      False       1.00      1.00      1.00   1014749
       True       0.82      0.59      0.69      1276

avg / total       1.00      1.00      1.00   1016025

      True     False
True    758      518
False   164  1014585 




## Анализ ошибок кластеризации

In [9]:
def export_clusterization_report(data_1d, cutoff, filename, day=None):
    """Cluster one frame of messages and write the mixed clusters to a file.

    The messages are lemmatized, vectorized with TF-IDF, linked with Ward's
    method on cosine distances and cut at ``cutoff``. Pairwise quality metrics
    are printed. Every predicted cluster that contains more than one reference
    cluster is then written to ``filename``, each message prefixed by a short
    marker identifying its reference cluster.

    Parameters
    ----------
    data_1d : pandas.DataFrame
        Frame with "title", "text" and "cluster" columns.
    cutoff : float
        Distance cut-off passed to ``get_labels``.
    filename : str
        Path of the text report to write (UTF-8).
    day : object, optional
        Label printed in the header. When omitted it is taken from the distinct
        values of the frame's "date" column, so the function no longer depends
        on a global ``day`` variable being defined by the calling cell.
    """
    if day is None:
        dates = data_1d["date"].unique() if "date" in data_1d else []
        day = ", ".join(str(d) for d in dates)

    messages, clusters = do_stem(data_1d)
    tfidf_vectorizer = TfidfVectorizer(min_df=2, max_features=10000,
        stop_words=nltk.corpus.stopwords.words('russian'))
    tfidf_matrix = tfidf_vectorizer.fit_transform(messages)
    dist_matrix = 1 - cosine_similarity(tfidf_matrix)
    linkage_matrix = ward(dist_matrix)
    labels = get_labels(linkage_matrix, len(clusters), cutoff=cutoff)

    res = marks_to_pairwise(clusters, labels)
    print("ДЕНЬ: {}\nСообщений: {}".format(day, len(data_1d)))
    print(classification_report(res["y"], res["p"]))
    print(cross_class_report(res), "\n\n")

    alphabet = "абвгдежзиклмно"

    def marker(position):
        # Plain letter for the first len(alphabet) clusters, then letter plus a
        # round number, so larger groups no longer fail with KeyError.
        rounds, offset = divmod(position, len(alphabet))
        if rounds == 0:
            return alphabet[offset]
        return "{}{}".format(alphabet[offset], rounds)

    texts = [row["title"] + ". " + row["text"] for _, row in data_1d.iterrows()]
    results = ps.DataFrame({"message": texts, "y": clusters, "p": labels}, index=None)
    # Explicit encoding: the report contains Cyrillic text.
    with open(filename, "w", encoding="utf-8") as f:
        for _, df in results.groupby("p"):
            # Predicted clusters that match a single reference cluster are fine.
            if len(df["y"].unique()) == 1:
                continue
            df = df.sort_values("y")
            cluster_letter = {c: marker(i) for i, c in enumerate(sorted(df["y"].unique()))}
            f.write("Участвующие кластеры:\n")
            for c, l in sorted(cluster_letter.items(), key=lambda x: x[0]):
                f.write("  {}) '{}'\n".format(l, c))
            f.write("Сообщения:\n")
            for _, row in df.iterrows():
                f.write("  ({}) {}\n".format(cluster_letter[row["y"]], row["message"]))
            f.write("\n")

In [17]:
# Write a report for every date; dates with 4000 or more messages are skipped.
for day in data["date"].unique():
    data_1d = data[data["date"] == day]
    if len(data_1d) >= 4000:
        continue
    export_clusterization_report(data_1d, 1.75, "reports/report-{}.txt".format(day))

ДЕНЬ: 2015-01-22
Сообщений: 2
             precision    recall  f1-score   support

       True       1.00      1.00      1.00         1

avg / total       1.00      1.00      1.00         1

     True
True    1 


ДЕНЬ: 2015-01-23
Сообщений: 17
             precision    recall  f1-score   support

      False       1.00      0.86      0.93       132
       True       0.18      1.00      0.31         4

avg / total       0.98      0.87      0.91       136

      False True 
False   114    18
True      0     4 


ДЕНЬ: 2015-01-24
Сообщений: 22
             precision    recall  f1-score   support

      False       1.00      0.93      0.96       224
       True       0.30      1.00      0.47         7

avg / total       0.98      0.93      0.95       231

      False True 
False   208    16
True      0     7 


ДЕНЬ: 2015-02-25
Сообщений: 279
             precision    recall  f1-score   support

      False       1.00      1.00      1.00     38577
       True       0.84      0.96      0.

In [None]:
# Read each listed per-day CSV from data/ with the same column layout as the
# other loads in this notebook.
# NOTE(review): the loaded frame is not used in the visible part of this cell;
# the cell looks unfinished — confirm before running.
for filename in ["day-1432.csv", "day-4146.csv"]:
    data_1d = ps.read_csv("data/" + filename, sep=';', header=None,
                   index_col=0,names=['id','title','text','cluster','date','publisher'])
    