In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.pipeline import FeatureUnion
import pandas as pd
import numpy as np
import itertools
from natasha import (
    Segmenter,
    MorphVocab,
    
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    
    PER,
    NamesExtractor,

    Doc
)
from tqdm import tqdm
import string
import re

# Register tqdm's `progress_apply` on pandas objects; the lemmatization cell relies on it.
tqdm.pandas()

# Preprocess data: lemmatize and remove punctuation

In [None]:
def rename_group(name):
    """Map a transliterated group code to its Russian name.

    Raises ``KeyError`` when ``name`` is not one of the ten known codes.
    """
    russian_names = dict(
        Arkhangelskie='Архангельские',
        Desninskie='Верхне-Деснинские',
        Donskie='Донские',
        Kostromskie='Костромские',
        Mezhzonalnie='Межзональная группа Б',
        Novgorodskie='Новгородские',
        Povolzkie='Владимирско-Поволжские',
        Pskovskie='Псковские',
        Ryazanskie='Рязанские',
        Seligerskie='Селигеро-Торжковские',
    )
    return russian_names[name]

# Load the manifest and add a column holding the Russian name of each row's group.
df = pd.read_excel('manifest_balanced.xlsx')
df['variant_rus'] = df['Variant'].map(rename_group)

In [None]:
# Natasha components, built once at module level; `preprocessing` uses
# `segmenter`, `morph_tagger` and `morph_vocab`.
segmenter = Segmenter()
morph_vocab = MorphVocab()
# The embedding is shared by the morphology tagger and the syntax parser.
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
# NOTE(review): `syntax_parser` is not referenced by any visible cell — confirm it is still needed.
syntax_parser = NewsSyntaxParser(emb)

In [None]:
# Punctuation characters to strip from the texts; the hyphen is deliberately kept.
punct = string.punctuation.replace('-', '')

In [None]:
def preprocessing(text):
    """Lowercase ``text``, strip punctuation from it, and lemmatize it.

    Characters listed in ``punct`` are removed, the remainder is segmented and
    morphologically tagged with Natasha, and the token lemmas are joined with
    single spaces.

    Returns the lemmatized string, or ``None`` when ``text`` is not a string
    or when processing fails, so the caller can drop such rows afterwards.
    """
    # Non-string cells (e.g. missing values) cannot be processed.
    if not isinstance(text, str):
        return None
    try:
        cleaned = text.lower().translate(str.maketrans('', '', punct))
        doc = Doc(cleaned)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        lemmas = []
        for token in doc.tokens:
            token.lemmatize(morph_vocab)
            if token.pos != 'PUNCT' and token.text != '=':
                lemmas.append(token.lemma)
        return ' '.join(lemmas)
    except Exception:
        # Best-effort: a failing document yields None instead of aborting the
        # whole run. Catching `Exception` rather than using a bare `except`
        # lets KeyboardInterrupt and SystemExit propagate, so a long run can
        # still be interrupted.
        return None

In [None]:
# Lemmatize every text (with a progress bar), then discard incomplete rows.
df['lemmas'] = df['text'].progress_apply(preprocessing)
# NOTE(review): dropna() removes rows with a missing value in ANY column, not only `lemmas`.
df = df.dropna()

# GridSearch

Split into train, validation, and test sets

In [None]:
# Split every group separately (roughly 70% train, 15% validation, 15% test)
# so that each group is represented in all three sets.
# NOTE(review): `valid` is not used by any visible cell; the grid searches
# cross-validate on `train` instead.
train_parts, valid_parts, test_parts = [], [], []
for variant in df['Variant'].unique():
    variant_df = df[df['Variant'] == variant]
    train_df, valid_test_df = train_test_split(variant_df, test_size=0.3, random_state=55)
    valid_df, test_df = train_test_split(valid_test_df, test_size=0.5, random_state=55)
    train_parts.append(train_df)
    valid_parts.append(valid_df)
    test_parts.append(test_df)

# Concatenate once at the end instead of growing the frames inside the loop
# (avoids repeated copying and concatenation with an empty DataFrame).
train = pd.concat(train_parts)
valid = pd.concat(valid_parts)
test = pd.concat(test_parts)

# CHAR

In [None]:
# Grid search over character n-gram TF-IDF settings and LinearSVC hyper-parameters,
# scored by cross-validated macro-F1 on the training set.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # random_state pins the solver's internal shuffling so the scores are reproducible.
    ('ovr', LinearSVC(loss='squared_hinge', max_iter=100000, multi_class='ovr',
                      random_state=55)),
])
parameters = {
    'tfidf__ngram_range': [(1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (2, 3), (2, 4), (2, 5), (2, 6)],
    'tfidf__analyzer': ['char'],
    'tfidf__use_idf': [True, False],
    'tfidf__sublinear_tf': [True, False],
    'tfidf__max_df': [0.75, 0.85, 1.0],
    'ovr__C': [0.1, 1, 10],
    'ovr__tol': [1e-5, 1e-4, 1e-3],
}

# Labels as a 1-D array (no reshape/ravel round-trip needed).
train_y = train['Variant'].values
gs = GridSearchCV(estimator=pipeline, param_grid=parameters, scoring='f1_macro', verbose=3)
gs.fit(train['lemmas'], train_y)

In [None]:
# One row per parameter combination with its mean cross-validated macro-F1, saved to Excel.
gs_result_char = pd.DataFrame(gs.cv_results_["params"]).assign(
    f1_macro=gs.cv_results_["mean_test_score"]
)
gs_result_char.to_excel('GridSearchCV_LinearSVM_char.xlsx')

In [None]:
# Character n-gram model refit on the full training set with fixed settings.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='char', max_df=0.85, ngram_range=(1, 6))),
    # random_state pins the solver's internal shuffling so the fit is reproducible.
    ('ovr', LinearSVC(loss='squared_hinge', max_iter=100000, multi_class='ovr', C=1,
                      random_state=55)),
])

# Labels as a 1-D array (no reshape/ravel round-trip needed).
train_y = train['Variant'].values
pipeline.fit(train['lemmas'], train_y)

In [None]:
# Per-class precision / recall / F1 of the current pipeline on the test set.
print(
    classification_report(
        y_true=test['Variant'],
        y_pred=pipeline.predict(test['lemmas']),
    )
)

In [None]:
# Confusion matrix of the current pipeline on the test set.
ConfusionMatrixDisplay.from_predictions(
    y_true=test['Variant'],
    y_pred=pipeline.predict(test['lemmas']),
    xticks_rotation='vertical',
)

# WORD

In [None]:
# Grid search over word n-gram TF-IDF settings and LinearSVC hyper-parameters,
# scored by cross-validated macro-F1 on the training set.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # random_state pins the solver's internal shuffling so the scores are reproducible.
    ('ovr', LinearSVC(loss='squared_hinge', max_iter=100000, multi_class='ovr',
                      random_state=55)),
])
parameters = {
    'tfidf__ngram_range': [(1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (2, 3), (2, 4), (2, 5), (2, 6)],
    'tfidf__analyzer': ['word'],
    'tfidf__use_idf': [True, False],
    'tfidf__sublinear_tf': [True, False],
    'tfidf__max_df': [0.75, 0.85, 1.0],
    'ovr__C': [0.1, 1, 10],
    'ovr__tol': [1e-5, 1e-4, 1e-3],
}

# Labels as a 1-D array (no reshape/ravel round-trip needed).
train_y = train['Variant'].values
gs = GridSearchCV(estimator=pipeline, param_grid=parameters, scoring='f1_macro', verbose=3)
gs.fit(train['lemmas'], train_y)

In [None]:
# One row per parameter combination with its mean cross-validated macro-F1, saved to Excel.
# NOTE(review): the variable is still called `gs_result_char` although it now holds
# the word-level results.
gs_result_char = pd.DataFrame(gs.cv_results_["params"]).assign(
    f1_macro=gs.cv_results_["mean_test_score"]
)
gs_result_char.to_excel('GridSearchCV_LinearSVM_word.xlsx')

In [None]:
# Word n-gram model refit on the full training set with fixed settings.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='word', max_df=0.75, ngram_range=(1, 2))),
    # random_state pins the solver's internal shuffling so the fit is reproducible.
    ('ovr', LinearSVC(loss='squared_hinge', max_iter=100000, multi_class='ovr', C=1,
                      random_state=55)),
])

# Labels as a 1-D array (no reshape/ravel round-trip needed).
train_y = train['Variant'].values
pipeline.fit(train['lemmas'], train_y)

In [None]:
# Per-class precision / recall / F1 of the current pipeline on the test set.
print(
    classification_report(
        y_true=test['Variant'],
        y_pred=pipeline.predict(test['lemmas']),
    )
)

In [None]:
# Confusion matrix of the current pipeline on the test set.
ConfusionMatrixDisplay.from_predictions(
    y_true=test['Variant'],
    y_pred=pipeline.predict(test['lemmas']),
    xticks_rotation='vertical',
)

# CHAR + WORD

In [None]:
# Grid search over a union of character and word n-gram TF-IDF features;
# the classifier's C and tol are held fixed.
char_tfidf = TfidfVectorizer(analyzer='char')
word_tfidf = TfidfVectorizer(analyzer='word')
tfidf = FeatureUnion([('char', char_tfidf), ('word', word_tfidf)])
pipeline = Pipeline([
    ('tfidf', tfidf),
    # random_state pins the solver's internal shuffling so the scores are reproducible.
    ('ovr', LinearSVC(loss='squared_hinge', max_iter=100000, multi_class='ovr',
                      random_state=55)),
])

parameters = {
    'tfidf__char__ngram_range': [(2, 3), (2, 4), (2, 5), (2, 6)],
    'tfidf__word__ngram_range': [(1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (2, 3)],
    'tfidf__char__max_df': [0.75, 0.85, 1.0],
    'tfidf__word__max_df': [0.75, 0.85, 1.0],
    'ovr__C': [1],
    'ovr__tol': [1e-5],
}

# Labels as a 1-D array (no reshape/ravel round-trip needed).
train_y = train['Variant'].values
gs = GridSearchCV(estimator=pipeline, param_grid=parameters, scoring='f1_macro', verbose=3)
gs.fit(train['lemmas'], train_y)

In [None]:
# One row per parameter combination with its mean cross-validated macro-F1, saved to Excel.
# NOTE(review): the variable is still called `gs_result_char` although it now holds
# the combined char + word results.
gs_result_char = pd.DataFrame(gs.cv_results_["params"]).assign(
    f1_macro=gs.cv_results_["mean_test_score"]
)
gs_result_char.to_excel('GridSearchCV_LinearSVM_char_word.xlsx')

In [None]:
# Combined character + word n-gram model refit on the full training set with fixed settings.
char_tfidf = TfidfVectorizer(analyzer='char', max_df=0.85, ngram_range=(2, 5))
word_tfidf = TfidfVectorizer(analyzer='word', max_df=0.75, ngram_range=(1, 3))
tfidf = FeatureUnion([('char', char_tfidf), ('word', word_tfidf)])
pipeline = Pipeline([
    ('tfidf', tfidf),
    # random_state pins the solver's internal shuffling, so the fitted weights
    # (exported further below) are reproducible.
    # NOTE(review): the grid search above fixed tol=1e-5; here tol is left at its default.
    ('ovr', LinearSVC(loss='squared_hinge', max_iter=100000, multi_class='ovr',
                      random_state=55)),
])
# Labels as a 1-D array (no reshape/ravel round-trip needed).
train_y = train['Variant'].values
pipeline.fit(train['lemmas'], train_y)

In [None]:
# Per-class precision / recall / F1 of the current pipeline on the test set.
print(
    classification_report(
        y_true=test['Variant'],
        y_pred=pipeline.predict(test['lemmas']),
    )
)

In [None]:
# Russian display labels for the confusion-matrix axes, one per group.
# NOTE(review): the order must match the label order used by the plot — confirm
# it follows the sorted Latin group codes.
labs = [
    'Архангельские',
    'Верхне-Деснинские',
    'Донские',
    'Костромские',
    'Межзональная группа Б',
    'Новгородские',
    'Владимирско-Поволжские',
    'Псковские',
    'Рязанские',
    'Селигеро-Торжковские',
]

In [None]:
import matplotlib.pyplot as plt

# Draw the confusion matrix exactly once, on an explicitly sized figure.
# Passing `ax` makes the display use this figure instead of creating its own,
# and the colour map / value format are set in the same call so no second
# `disp.plot()` is needed.
fig, ax = plt.subplots(figsize=(10, 6), dpi=300)
disp = ConfusionMatrixDisplay.from_predictions(
    test['Variant'],
    pipeline.predict(test['lemmas']),
    display_labels=labs,
    xticks_rotation='vertical',
    cmap=plt.cm.binary,
    values_format='g',
    ax=ax,
)
# Save BEFORE plt.show(): once the figure has been shown, a later
# plt.savefig() can end up writing a blank image.
fig.savefig('heatmap_svm.jpg', bbox_inches='tight')
plt.show()

In [None]:
# Save the figure owned by `disp`. A bare plt.savefig() in a fresh cell targets
# the "current" figure, which with the inline backend is a new, empty one.
disp.figure_.savefig('heatmap_svm.jpg', bbox_inches='tight')

# Get coefficients for each n-gram

In [None]:
# One row per n-gram feature, one column per class holding that feature's LinearSVC weight.
classifier = pipeline['ovr']
feature_coefficients = classifier.coef_

# Column names are taken from `classes_` (the row order of `coef_`) instead of
# hard-coding which row index belongs to which group, so a column can never be
# mislabelled if the set of classes changes.
coeff_list = pd.DataFrame(feature_coefficients.T, columns=list(classifier.classes_))
coeff_list.insert(0, 'feature', pipeline['tfidf'].get_feature_names_out())


In [None]:
# Write the per-class feature weights to an Excel file.
coeff_list.to_excel('LinearSVM_char_word_coeffs.xlsx')

In [None]:
# Preview the TF-IDF matrix the fitted pipeline produces for a few test documents.
# `vectorizer`, `corpus` and `X` were not defined in any visible cell, so they
# are derived here from the fitted pipeline and the test set.
vectorizer = pipeline['tfidf']
# Only a handful of documents: the dense matrix has one column per n-gram feature.
corpus = test['lemmas'].head(5)
X = vectorizer.transform(corpus)

feature_names = vectorizer.get_feature_names_out()
corpus_index = list(corpus)
pd.DataFrame(X.toarray(), index=corpus_index, columns=feature_names)

In [None]:
# Explicit one-vs-rest LinearSVC trained on the TF-IDF features of the fitted pipeline.
# X_train / y_train / X_test / y_test were not defined in any visible cell, so
# they are derived here from the existing train/test frames.
X_train = pipeline['tfidf'].transform(train['lemmas'])
X_test = pipeline['tfidf'].transform(test['lemmas'])
y_train, y_test = train['Variant'], test['Variant']

svm = LinearSVC(random_state=42)
ovr_classifier = OneVsRestClassifier(svm)
ovr_classifier = ovr_classifier.fit(X_train, y_train)
# `plot_confusion_matrix` was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
matrix = ConfusionMatrixDisplay.from_estimator(ovr_classifier, X_test, y_test)