In [1]:
import datetime, itertools, sys
import pandas as ps
from subprocess import Popen, PIPE
from multiprocessing import Pool
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from scipy.cluster.hierarchy import ward, fcluster
import matplotlib.pyplot as plt
from pymystem3 import Mystem
import nltk

In [2]:
# Load the corpus: a ';'-separated file without a header row, so the column names are
# supplied explicitly and the first column ("id") is used as the index.
data = ps.read_csv(
    "data/spelled-f.csv",
    sep=';',
    header=None,
    names=['id', 'title', 'text', 'cluster', 'date', 'publisher'],
    index_col=0,
)
# Keep only the rows whose cluster label is not one of the excluded values.
data = data.loc[~data["cluster"].isin(["-", "S", "Standard "])]
print("Число записей в таблице:", len(data))

Число записей в таблице: 32317


## LSA для одного дня

In [3]:
def do_stem(df):
    """Lemmatize every message of `df` with Mystem and collect the cluster labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have "title", "text" and "cluster" columns.

    Returns
    -------
    messages : list of str
        One string per row: the tokens returned by `Mystem.lemmatize` for
        "<title>. <text>", joined back together.
    clusters : list
        The "cluster" value of each row, in the same order as `messages`.
    """
    mystem = Mystem()
    try:
        messages = [
            "".join(mystem.lemmatize(title + ". " + text))
            for title, text in zip(df["title"], df["text"])
        ]
    finally:
        # Close the Mystem instance even if lemmatization fails part-way through.
        mystem.close()
    clusters = df["cluster"].tolist()
    return messages, clusters

In [4]:
def get_lsa_matrix(data_1d, n_features=1500):
    """Vectorize the messages of one day with TF-IDF and, if large enough, reduce them with SVD.

    Parameters
    ----------
    data_1d : pandas.DataFrame
        Frame passed to `do_stem` to obtain the lemmatized messages and cluster labels.
    n_features : int
        Passed to `TruncatedSVD` as its first argument. When the TF-IDF vocabulary has no
        more than this many columns, the reduction is skipped.

    Returns
    -------
    matrix
        The TF-IDF matrix itself, or the output of the SVD + `Normalizer` pipeline.
    clusters : list
        Cluster labels as returned by `do_stem`.
    """
    messages, clusters = do_stem(data_1d)

    russian_stop_words = nltk.corpus.stopwords.words('russian')
    vectorizer = TfidfVectorizer(min_df=2, max_features=10000, stop_words=russian_stop_words)
    tfidf_matrix = vectorizer.fit_transform(messages)

    vocabulary_size = tfidf_matrix.shape[1]
    if vocabulary_size > n_features:
        # Enough terms for the requested number of components: project and re-normalize.
        reducer = make_pipeline(TruncatedSVD(n_features), Normalizer(copy=False))
        return reducer.fit_transform(tfidf_matrix), clusters

    # Too few terms to reduce: hand back the TF-IDF matrix unchanged.
    return tfidf_matrix, clusters

In [5]:
def marks_to_pairwise(y_cls, p_cls):
    """Convert two labelings of the same items into per-pair "same cluster" flags.

    Parameters
    ----------
    y_cls, p_cls : sequences of equal length
        Reference and predicted cluster labels, indexed by item position.

    Returns
    -------
    pandas.DataFrame
        One row per unordered pair of positions, with columns
        "ids" (the two positions as a sorted list),
        "y" (True when the reference labels of the pair are equal) and
        "p" (True when the predicted labels of the pair are equal).
    """
    n_items = len(y_cls)
    assert n_items == len(p_cls)

    pair_ids, same_in_reference, same_in_prediction = [], [], []
    # A single pass keeps memory bounded by the three output columns.
    for first, second in itertools.combinations(range(n_items), 2):
        pair_ids.append(sorted((first, second)))
        same_in_reference.append(bool(y_cls[first] == y_cls[second]))
        same_in_prediction.append(bool(p_cls[first] == p_cls[second]))

    columns = {"ids": pair_ids, "y": same_in_reference, "p": same_in_prediction}
    return ps.DataFrame(columns, index=None)

In [6]:
def cross_class_report(res):
    """Build a contingency table of true versus predicted classes.

    Parameters
    ----------
    res : pandas.DataFrame
        Must have a "y" column (true class) and a "p" column (predicted class).

    Returns
    -------
    pandas.DataFrame
        Object-dtype table whose rows are true classes and whose columns are predicted
        classes; both axes use the distinct values of `res["y"]`. Each cell holds the
        number of rows of `res` with that (true, predicted) combination. Predicted values
        that never occur in "y" are not represented.
    """
    classes = res["y"].unique()
    rows = []
    for true_cls in classes:
        predicted = res.loc[res["y"] == true_cls, "p"]
        rows.append([int((predicted == pred_cls).sum()) for pred_cls in classes])
    # Build the frame in one go instead of filling it through chained indexing
    # (`table[col][row] = ...`), which pandas does not guarantee to write through.
    return ps.DataFrame(rows, index=classes, columns=classes, dtype=object)

In [14]:
# Single-day experiment: take the messages dated 2016-02-05 and build their feature matrix
# (1500 is passed explicitly as n_features).
data_1d = data[data["date"] == "2016-02-05"]
lsa_matrix, clusters = get_lsa_matrix(data_1d, 1500)
# Pairwise cosine distance between messages (1 - cosine similarity), as a square matrix.
dist_matrix = 1 - cosine_similarity(lsa_matrix)
# NOTE(review): scipy documents ward()'s argument as either a condensed distance matrix or
# an m-by-n array of observation vectors; a square matrix such as dist_matrix appears to
# fall into the second case (rows used as feature vectors) -- confirm this is intended.
linkage_matrix = ward(dist_matrix)

In [16]:
# Cut the hierarchy at distance 2 to obtain flat cluster labels, then compare them with the
# reference clusters on the level of message pairs.
labels = fcluster(linkage_matrix, 2, criterion='distance')
res = marks_to_pairwise(clusters, labels)
print(classification_report(res["y"], res["p"]))
# Pair counts: rows are the true "same cluster" flag, columns the predicted one.
print(cross_class_report(res), "\n\n")

             precision    recall  f1-score   support

      False       1.00      1.00      1.00   6307362
       True       0.70      0.72      0.71      2766

avg / total       1.00      1.00      1.00   6310128

         False True 
False  6306512   850
True       772  1994 




## Подбор оптимального порога для каждого дня

In [7]:
def multiprocess_optimise_th_for_days(days, clsize=None):
    """Find the optimal clustering threshold for several days in parallel.

    Parameters
    ----------
    days : iterable
        Values of the "date" column of the global `data` frame to process.
    clsize : int or None
        Forwarded to `get_optimal_threshold` for every day.

    Returns
    -------
    dict
        Maps each day to the `(threshold, pairwise_result)` tuple returned by
        `get_optimal_threshold` for that day's messages.
    """
    # Materialise once: `days` is iterated twice below, which would silently produce an
    # empty result for a one-shot iterator.
    days = list(days)
    # Pool.map passes a single argument per task, so frame and clsize travel as a tuple.
    tasks = [(data[data["date"] == day], clsize) for day in days]
    # The context manager tears the pool down even if a worker raises.
    with Pool() as pool:
        th_res = pool.map(get_optimal_threshold, tasks)
    return {day: (th, res) for day, (th, res) in zip(days, th_res)}

In [8]:
def get_optimal_threshold(data_1d, clsize=None):
    """Build the hierarchical clustering for one day and tune its cut-off threshold.

    Parameters
    ----------
    data_1d : pandas.DataFrame or tuple
        The messages of a single day, or a `(data_1d, clsize)` tuple. The tuple form lets
        the function be called through `Pool.map`, which passes one argument per task.
    clsize : int or None
        Forwarded to `optimise_threshold`; overridden when a tuple is passed.

    Returns
    -------
    (th, res)
        The threshold and pairwise result returned by `optimise_threshold`.
    """
    if isinstance(data_1d, tuple):
        data_1d, clsize = data_1d

    lsa_matrix, clusters = get_lsa_matrix(data_1d)

    # Pairwise cosine distance between messages, as a square matrix.
    dist_matrix = 1 - cosine_similarity(lsa_matrix)
    # NOTE(review): scipy documents ward()'s argument as either a condensed distance matrix
    # or an m-by-n array of observation vectors; a square matrix appears to fall into the
    # second case (rows used as feature vectors) -- confirm this is intended.
    linkage_matrix = ward(dist_matrix)
    return optimise_threshold(linkage_matrix, clusters, clsize)

In [9]:
def _truncated_fscore(linkage_matrix, clusters, clsize, th):
    """Return the pairwise F-score of the positive class at cutoff `th`, truncated to 4 decimals."""
    labels = get_labels(linkage_matrix, len(clusters), clsize, th)
    res = marks_to_pairwise(clusters, labels)
    _, _, fscore = get_prec_recall_f(res, 1)
    return int(fscore*10000)/10000


def _median_best_threshold(scores):
    """Among the thresholds sharing the highest score, return the middle one by value."""
    best_score = max(scores.values())
    tied = sorted(th for th, score in scores.items() if score == best_score)
    return tied[len(tied)//2]


def optimise_threshold(linkage_matrix, clusters, clsize=None):
    """Search for the distance cutoff that maximises the pairwise F-score.

    The search runs in two passes over a shared score table: a coarse pass over the
    thresholds 1.0, 1.1, ..., 2.0, then a fine pass in steps of 0.01 over the 19 values
    centred on the coarse winner. Scores are truncated to four decimals, and when several
    thresholds tie for the best score the middle one is chosen.

    Parameters
    ----------
    linkage_matrix
        Linkage matrix passed to `get_labels`.
    clusters : sequence
        Reference cluster label of every item.
    clsize : int or None
        Maximum cluster size forwarded to `get_labels`.

    Returns
    -------
    th_m : float
        The selected threshold.
    res : pandas.DataFrame
        Output of `marks_to_pairwise` for the labels obtained at `th_m`.
    """
    scores = {}

    # Coarse pass.
    for i in range(10, 21):
        th = i/10
        scores[th] = _truncated_fscore(linkage_matrix, clusters, clsize, th)
    th_m = _median_best_threshold(scores)

    # Fine pass around the coarse winner. round() rather than int(): th_m*100 is a float
    # product and must not be truncated one step low by representation error.
    centre = round(th_m*100)
    for i in range(centre - 9, centre + 10):
        th = i/100
        scores[th] = _truncated_fscore(linkage_matrix, clusters, clsize, th)
    th_m = _median_best_threshold(scores)

    best_labels = get_labels(linkage_matrix, len(clusters), clsize, th_m)
    return th_m, marks_to_pairwise(clusters, best_labels)

In [10]:
def get_labels(linkage_matrix, n_samples, max_cls_size=None, cutoff=None):
    """Turn a linkage matrix into flat cluster labels by replaying its merges.

    Parameters
    ----------
    linkage_matrix : iterable of 4-item rows
        Each row is `(c1, c2, dist, cls_size)`: the ids of the two clusters being merged,
        the merge distance and the size of the merged cluster. Row `i` creates cluster id
        `n_samples + i`.
    n_samples : int
        Number of original observations; they start as singleton clusters
        `0 .. n_samples - 1`.
    max_cls_size : number or None
        If given, merges whose resulting size exceeds it are skipped.
    cutoff : number or None
        If given, the replay stops at the first merge whose distance exceeds it.

    Returns
    -------
    list
        `labels[o]` is the id of the cluster that observation `o` ends up in.
    """
    clusters = {i: [i] for i in range(n_samples)}
    for i, (c1, c2, dist, cls_size) in enumerate(linkage_matrix):
        # `is not None` so that a cutoff / size limit of 0 is honoured instead of being
        # treated as "not given".
        if cutoff is not None and dist > cutoff:
            break
        if max_cls_size is not None and cls_size > max_cls_size:
            continue
        # Rows taken from a float array carry the ids as floats (presumably the scipy
        # case -- verify against the caller); use real ints as dictionary keys.
        c1, c2 = int(c1), int(c2)
        clusters[n_samples + i] = clusters.pop(c1) + clusters.pop(c2)

    labels = [None] * n_samples
    for cls_num, objects in clusters.items():
        for o in objects:
            labels[o] = cls_num
    return labels

In [11]:
def get_prec_recall_f(res, cls):
    """Compute precision, recall and F1 of class `cls` from a true/predicted table.

    Parameters
    ----------
    res : pandas.DataFrame
        Must have a "y" column (true value) and a "p" column (predicted value).
    cls
        The class value to score; rows are matched with `==`.

    Returns
    -------
    (prec, rec, f)
        Precision, recall and their harmonic mean. `(0, 0, 0)` is returned when nothing
        was predicted as `cls`, when nothing actually is `cls`, or when no prediction of
        `cls` is correct.
    """
    predicted = res["p"] == cls
    actual = res["y"] == cls
    n_predicted = int(predicted.sum())
    n_actual = int(actual.sum())
    n_hits = int((predicted & actual).sum())
    # With no correct positive, precision and recall are both 0 and the harmonic mean
    # below would divide by zero.
    if n_predicted == 0 or n_actual == 0 or n_hits == 0:
        return 0, 0, 0
    prec = n_hits / n_predicted
    rec = n_hits / n_actual
    return prec, rec, 2/(1/prec + 1/rec)

In [13]:
# Number of messages per value of the "date" column.
day_msg_count = {
    date: len(day_frame)
    for date, day_frame in data.groupby("date")
}
# Only days with more than 10 and fewer than 4000 messages are analysed.
all_days = [
    date
    for date, n_messages in day_msg_count.items()
    if n_messages > 10 and n_messages < 4000
]
print("Число дней:", len(all_days))

Число дней: 44


In [15]:
# Tune the threshold for every selected day (in parallel); `results` maps each day to its
# (threshold, pairwise result) tuple.
results = multiprocess_optimise_th_for_days(all_days)
# Report the days in order of increasing message count.
for day in sorted(results.keys(), key=lambda x: day_msg_count[x]):
    th, res = results[day]
    print("День: {} ({} сообщений)".format(day, day_msg_count[day]))
    print("Порог:", th)
    print(classification_report(res["y"], res["p"]))
    # Pair counts: rows are the true "same cluster" flag, columns the predicted one.
    print(cross_class_report(res), "\n\n")

День: 2015-10-29 (11 сообщений)
Порог: 1.3
             precision    recall  f1-score   support

      False       0.96      1.00      0.98        48
       True       1.00      0.71      0.83         7

avg / total       0.97      0.96      0.96        55

      False True 
False    48     0
True      2     5 


День: 2015-11-29 (15 сообщений)
Порог: 1.06
             precision    recall  f1-score   support

      False       0.98      0.99      0.98        96
       True       0.88      0.78      0.82         9

avg / total       0.97      0.97      0.97       105

      False True 
False    95     1
True      2     7 


День: 2015-01-23 (17 сообщений)
Порог: 1.1
             precision    recall  f1-score   support

      False       1.00      0.98      0.99       132
       True       0.57      1.00      0.73         4

avg / total       0.99      0.98      0.98       136

      False True 
False   129     3
True      0     4 


День: 2015-08-28 (17 сообщений)
Порог: 1.5
           