**Preprocessing**

In [1]:
import os, spacy

In [13]:
nlp = spacy.load("ru_core_news_lg")
nlp.max_length = 4000000

In [16]:
def lemmatize(filename, genre):

    with open('../datasets/' + genre + '/' + filename, 'r', encoding='utf-8') as f:
        lines = f.read()

    lemmas = []
    doc = nlp(lines, disable = ['ner', 'parser'])
    for token in doc:
        if token.lemma_.isalpha():
            lemmas.append(token.lemma_)

    path = '../datasets/lemmatized/' + genre
    if not os.path.exists(path):
        os.makedirs(path)

    with open(path + '/' + filename, 'w', encoding='utf-8') as f:
        f.write(' '.join(lemmas))

    return doc

In [None]:
genres = ['ballads', 'elegies', 'songs', 'novels']

for genre in genres:

    filenames = os.listdir('../datasets/' + genre)

    for file in filenames:
        if file != '.DS_Store':
            spacy_doc = lemmatize(file, genre)

**TF-IDF**
using https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer.fit_transform
https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#sphx-glr-auto-examples-text-plot-document-clustering-py

In [9]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from time import time

In [10]:
genres = ['ballads', 'elegies', 'songs', 'novels']
all_filenames_paths = []

for genre in genres:
    filenames = os.listdir('../datasets/lemmatized/' + genre)
    for file in filenames:
        all_filenames_paths.append('../datasets/lemmatized/' + genre + '/' + file)


vectorizer = TfidfVectorizer(input='filename', max_df=0.5, min_df=5)
X_tfidf = vectorizer.fit_transform(all_filenames_paths)

In [11]:
vectorizer.get_feature_names_out()

array(['ab', 'aber', 'absolument', ..., 'ён', 'ёрзать', 'ёрш'],
      dtype=object)

In [12]:
print(f"n_samples: {X.shape[0]}, n_features: {X.shape[1]}")

n_samples: 1828, n_features: 63333


In [13]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer


lsa = make_pipeline(TruncatedSVD(n_components=100), Normalizer(copy=False))
t0 = time()
X_lsa = lsa.fit_transform(X_tfidf)
explained_variance = lsa[0].explained_variance_ratio_.sum()

print(f"LSA done in {time() - t0:.3f} s")
print(f"Explained variance of the SVD step: {explained_variance * 100:.1f}%")

LSA done in 13.578 s
Explained variance of the SVD step: 29.6%


In [14]:
labels = [fname.split('/')[3] for fname in all_filenames_paths]

In [16]:
unique_labels, category_sizes = np.unique(labels, return_counts=True)
true_k = unique_labels.shape[0]

In [17]:
true_k

4

In [18]:
from collections import defaultdict
from sklearn import metrics

evaluations = []
evaluations_std = []


def fit_and_evaluate(km, X, name=None, n_runs=5):
    name = km.__class__.__name__ if name is None else name

    train_times = []
    scores = defaultdict(list)
    for seed in range(n_runs):
        km.set_params(random_state=seed)
        t0 = time()
        km.fit(X)
        train_times.append(time() - t0)
        scores["Homogeneity"].append(metrics.homogeneity_score(labels, km.labels_))
        scores["Completeness"].append(metrics.completeness_score(labels, km.labels_))
        scores["V-measure"].append(metrics.v_measure_score(labels, km.labels_))
        scores["Adjusted Rand-Index"].append(
            metrics.adjusted_rand_score(labels, km.labels_)
        )
        scores["Silhouette Coefficient"].append(
            metrics.silhouette_score(X, km.labels_, sample_size=2000)
        )
    train_times = np.asarray(train_times)

    print(f"clustering done in {train_times.mean():.2f} ± {train_times.std():.2f} s ")
    evaluation = {
        "estimator": name,
        "train_time": train_times.mean(),
    }
    evaluation_std = {
        "estimator": name,
        "train_time": train_times.std(),
    }
    for score_name, score_values in scores.items():
        mean_score, std_score = np.mean(score_values), np.std(score_values)
        print(f"{score_name}: {mean_score:.3f} ± {std_score:.3f}")
        evaluation[score_name] = mean_score
        evaluation_std[score_name] = std_score
    evaluations.append(evaluation)
    evaluations_std.append(evaluation_std)

In [20]:
from sklearn.cluster import KMeans

kmeans = KMeans(
    n_clusters=true_k,
    max_iter=100,
    n_init=1,
)

fit_and_evaluate(kmeans, X_lsa, name="KMeans\nwith LSA on tf-idf vectors")

clustering done in 0.06 ± 0.04 s 
Homogeneity: 0.758 ± 0.005
Completeness: 0.727 ± 0.010
V-measure: 0.742 ± 0.008
Adjusted Rand-Index: 0.759 ± 0.022
Silhouette Coefficient: 0.141 ± 0.001


In [29]:
kmeans.labels_

array([0, 2, 2, ..., 1, 1, 1], dtype=int32)

In [32]:
import pandas as pd

files = [fname.split('/')[4] for fname in all_filenames_paths]
data = {'file_names': files, 'genres': labels, 'lsa_tf_idf_labels': kmeans.labels_}
df = pd.DataFrame(data)

In [33]:
df

Unnamed: 0,file_names,genres,lsa_tf_idf_labels
0,175_1842_E.txt,ballads,0
1,47_1842_Lyubimov.txt,ballads,2
2,183_1844_Zhadovskaya.txt,ballads,2
3,157_1847_Nekrasov.txt,ballads,2
4,206_1841_Tolstoy.txt,ballads,2
...,...,...,...
1823,sukhonin.na_rubezhe_stoletiy.txt,novels,1
1824,boborykin.dolgo_li.txt,novels,1
1825,merder.vorotyntsevy.txt,novels,1
1826,dal.pavel_alekseyevich_igrivyy.txt,novels,1


In [34]:
df.to_csv('out.csv')