In [2]:
import nltk
import spacy
import bz2
import numpy as np
import matplotlib.pyplot as plt 
import xml.etree.ElementTree as ET
import joblib

from typing import *
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import load_model

import pandas as pd
from sklearn.model_selection import train_test_split
import openpyxl
from pydantic import BaseModel
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

In [5]:
!python3 -m spacy download ru_core_news_sm

Collecting ru-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.8.0/ru_core_news_sm-3.8.0-py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_sm')


In [6]:
# Download the NLTK "punkt" package into the local nltk_data directory.
nltk.download('punkt')
# spaCy `uk_core_news_lg` pipeline; the feature helpers defined further down read it
# through the module-level name `nlp`.
# NOTE(review): this notebook only downloads ru_core_news_sm, so uk_core_news_lg
# presumably has to be installed in the environment beforehand — confirm.
nlp = spacy.load("uk_core_news_lg")
# spaCy `ru_core_news_sm` pipeline (the package downloaded in the previous cell).
nlp_2 = spacy.load("ru_core_news_sm")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/taraskozak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Scaler object whose .transform() standardizes the five numeric features inside
# the GloVe-based preprocess_text().
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load this artifact from a trusted location.
scaler = joblib.load('../server/scaler.pkl')

In [21]:
# Keras classifier queried by classify_sentence().
model_2 = load_model("../server/model_2.h5")

# Pre-trained RuBERT tokenizer and encoder used to embed the lemmatized text.
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
bert_model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")

# Class names; a label's position in this list is its numeric class id.
unique_labels = [
    'Заперечення',
    'Виправдовування',
    'Заклик',
    'Розпалювання ворожнечі та ненависті',
    'Приниження національної честі та гідності',
    'Просто текст',
]

# Two-way lookup between class ids and class names.
id_to_label = dict(enumerate(unique_labels))
label_to_id = {name: class_id for class_id, name in id_to_label.items()}

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
def preprocess_text(text):
    """Lowercase and lemmatize ``text`` and return the lemmas as one string.

    The input is coerced with ``str()``, lowercased and run through the
    ``ru_core_news_sm`` pipeline. Stop words and tokens that are not purely
    alphabetic are dropped; the remaining lemmas are joined with single spaces.

    NOTE(review): a later cell defines another ``preprocess_text`` (GloVe feature
    extractor returning a NumPy array). Whichever cell ran last wins, so this
    definition is shadowed when the notebook is executed top to bottom.

    Args:
        text: Any object; it is converted with ``str()`` before processing.

    Returns:
        str: Space-separated lemmas (empty string when nothing survives filtering).
    """
    # Reuse the pipeline already loaded at module level as `nlp_2` instead of
    # calling spacy.load() here, which would re-read the model on every call.
    doc = nlp_2(str(text).lower())
    return " ".join(
        token.lemma_ for token in doc if token.is_alpha and not token.is_stop
    )

def get_avg_w2v(text, tokenizer, bert_model):
    """Embed ``text`` as the mean of the encoder's last-layer token states.

    Despite the name, no Word2Vec model is involved: the text is tokenized with
    ``tokenizer`` and passed through ``bert_model``, and the token vectors of
    ``last_hidden_state`` are averaged along the sequence axis.

    When the tokenizer returns an ``attention_mask``, padded positions are
    excluded from the average. For a single string the mask is all ones, so the
    result equals a plain mean; for a batch of texts of different lengths this
    keeps padding vectors from diluting the shorter texts' embeddings.

    Args:
        text: A string (or a list of strings) accepted by ``tokenizer``.
        tokenizer: Callable returning a mapping of PyTorch tensors
            (called with ``return_tensors="pt", truncation=True, padding=True``).
        bert_model: Callable taking those tensors as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.

    Returns:
        numpy.ndarray: The pooled embedding with size-1 axes squeezed out
        (a 1-D vector for a single text).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = bert_model(**inputs)

    hidden = outputs.last_hidden_state
    if "attention_mask" in inputs:
        # Weight each token by its mask value so padding contributes nothing.
        weights = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        # clamp() guards against division by zero for an all-padding row.
        pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1.0)
    else:
        pooled = hidden.mean(dim=1)

    return pooled.squeeze().numpy()

In [11]:
def count_named_entities(text):
    """Count location and organization entities found in ``text``.

    The text is parsed with the module-level ``nlp`` pipeline and the entity
    labels are tallied.

    Returns:
        tuple[int, int]: ``(number of "LOC" entities, number of "ORG" entities)``.
    """
    entity_labels = [entity.label_ for entity in nlp(text).ents]
    return entity_labels.count("LOC"), entity_labels.count("ORG")
def avg_noun_verb_ratio(text):
    """Return the mean per-sentence NOUN/VERB ratio of ``text``.

    The text is parsed with the module-level ``nlp`` pipeline. Each sentence
    contributes ``nouns / verbs``; a sentence without verbs contributes 0 so no
    division by zero occurs. A text with no sentences yields 0.
    """
    per_sentence = []
    for sentence in nlp(text).sents:
        tags = [token.pos_ for token in sentence]
        verb_total = tags.count("VERB")
        # Verb-less sentences count as ratio 0.
        per_sentence.append(tags.count("NOUN") / verb_total if verb_total > 0 else 0)

    if not per_sentence:
        return 0
    return sum(per_sentence) / len(per_sentence)
_SUBJECTIVITY_LEXICONS = {}  # lexicon path -> {word form: subjectivity score}


def _load_subjectivity_lexicon(path):
    """Parse the lexicon XML at ``path`` once and return ``{form: subjectivity}``."""
    if path not in _SUBJECTIVITY_LEXICONS:
        root = ET.parse(path).getroot()
        _SUBJECTIVITY_LEXICONS[path] = {
            # A <word> without a "subjectivity" attribute scores 0.
            word.get("form"): float(word.get("subjectivity", 0))
            for word in root.findall("word")
        }
    return _SUBJECTIVITY_LEXICONS[path]


def calculate_subj(text, lexicon_path="../server/translated_output.xml"):
    """Sum the lexicon subjectivity scores of the whitespace-separated words in ``text``.

    The lexicon is read from the ``<word form=... subjectivity=...>`` children of
    the XML root. It is parsed on first use and cached per path, so repeated
    calls do not re-read the file; edits to the file are not picked up until the
    cell defining this function is re-run.

    Args:
        text: The text to score. Non-string values (e.g. NaN for a missing cell)
            score 0.0 and do not touch the lexicon file.
        lexicon_path: Path of the lexicon XML file.

    Returns:
        The summed subjectivity; words missing from the lexicon contribute 0.

    Raises:
        OSError / xml.etree.ElementTree.ParseError: If the lexicon cannot be read
            or parsed (only when ``text`` is a string).
    """
    if not isinstance(text, str):
        return 0.0  # neutral score for missing values

    lexicon = _load_subjectivity_lexicon(lexicon_path)
    # NOTE(review): words are matched verbatim (no lowercasing or punctuation
    # stripping), so "word," will not match the lexicon entry "word".
    return sum(lexicon.get(word, 0) for word in text.split())
_POLARITY_LEXICONS = {}  # lexicon path -> {word form: polarity score}


def _load_polarity_lexicon(path):
    """Parse the lexicon XML at ``path`` once and return ``{form: polarity}``."""
    if path not in _POLARITY_LEXICONS:
        root = ET.parse(path).getroot()
        _POLARITY_LEXICONS[path] = {
            # A <word> without a "polarity" attribute scores 0.
            word.get("form"): float(word.get("polarity", 0))
            for word in root.findall("word")
        }
    return _POLARITY_LEXICONS[path]


def calculate_sentiment(text, lexicon_path="../server/translated_output.xml"):
    """Sum the lexicon polarity scores of the whitespace-separated words in ``text``.

    The lexicon is read from the ``<word form=... polarity=...>`` children of the
    XML root. It is parsed on first use and cached per path, so repeated calls
    do not re-read the file; edits to the file are not picked up until the cell
    defining this function is re-run.

    Args:
        text: The text to score. Non-string values (e.g. NaN for a missing cell)
            score 0.0 and do not touch the lexicon file.
        lexicon_path: Path of the lexicon XML file.

    Returns:
        The summed polarity; words missing from the lexicon contribute 0.

    Raises:
        OSError / xml.etree.ElementTree.ParseError: If the lexicon cannot be read
            or parsed (only when ``text`` is a string).
    """
    if not isinstance(text, str):
        return 0.0  # neutral score for missing values

    lexicon = _load_polarity_lexicon(lexicon_path)
    # NOTE(review): words are matched verbatim (no lowercasing or punctuation
    # stripping), so "word," will not match the lexicon entry "word".
    return sum(lexicon.get(word, 0) for word in text.split())

In [12]:
_GLOVE_VECTORS = {}  # GloVe file path -> {word: float32 vector}


def _load_glove_vectors(path):
    """Read the bz2-compressed GloVe text file at ``path`` once and cache the table."""
    if path not in _GLOVE_VECTORS:
        table = {}
        with bz2.open(path, "rt", encoding="utf-8") as handle:
            for line in handle:
                parts = line.strip().split()
                if not parts:
                    continue  # tolerate blank lines instead of failing on parts[0]
                table[parts[0]] = np.array(parts[1:], dtype=np.float32)
        _GLOVE_VECTORS[path] = table
    return _GLOVE_VECTORS[path]


def preprocess_text(text: str):
    """Turn ``text`` into the numeric feature vector fed to the binary model.

    The vector is the concatenation of:
      1. the mean GloVe vector of the text's lowercased lemmas (a 300-d zero
         vector when no lemma is in the vocabulary),
      2. three binary flags: the text contains ``:``, ``-`` or ``"``,
      3. five numeric features (sentiment, subjectivity, noun/verb ratio,
         location count, organization count) passed through ``scaler.transform``.

    Assuming the GloVe file holds 300-dimensional vectors, the result has
    300 + 3 + 5 = 308 values.

    The spaCy pipeline is the module-level ``nlp`` (``uk_core_news_lg``) and the
    GloVe table is cached after the first call; loading either inside the
    function would repeat a very expensive read for every text.

    NOTE(review): an earlier cell defines a different ``preprocess_text`` that
    returns a lemmatized string. This definition shadows it when the notebook
    runs top to bottom.

    Args:
        text: The raw text to featurize.

    Returns:
        numpy.ndarray: 1-D feature vector.
    """
    word_vectors = _load_glove_vectors("../server//news.lowercased.lemmatized.glove.300d.bz2")

    # --- Step 1: linguistic analysis and hand-crafted features ---
    doc = nlp(text)

    # Punctuation flags
    has_colons = 1 if ':' in text else 0
    has_hyphens = 1 if '-' in text else 0
    has_quotmarks = 1 if '"' in text else 0

    # Lexicon-based sentiment and subjectivity scores
    sentiment = calculate_sentiment(text)
    subjectiveness = calculate_subj(text)

    # Average noun/verb ratio per sentence
    noun_verb_ratio = avg_noun_verb_ratio(text)

    # Named-entity counts
    location_count, organization_count = count_named_entities(text)

    # --- Step 2: mean GloVe vector over the lemmas found in the vocabulary ---
    vectors = [
        word_vectors[lemma]
        for lemma in (token.lemma_.lower() for token in doc)
        if lemma in word_vectors
    ]
    if vectors:
        vector = np.mean(vectors, axis=0)
    else:
        vector = np.zeros(300)  # 300 is the dimension of the GloVe vectors

    # --- Step 3: standardize the numeric features and assemble the final vector ---
    scaled_features = scaler.transform(
        [[sentiment, subjectiveness, noun_verb_ratio, location_count, organization_count]]
    )[0]

    features = np.concatenate([
        vector,  # GloVe vector (300D)
        [has_colons, has_hyphens, has_quotmarks],  # Binary features
        scaled_features  # Standardized numeric features
    ])

    return features

In [23]:
def _lemmatize_for_bert(text):
    """Lowercase ``text`` and return its alphabetic, non-stop-word lemmas joined by spaces."""
    doc = nlp_2(str(text).lower())
    return " ".join(
        token.lemma_ for token in doc if token.is_alpha and not token.is_stop
    )


def classify_sentence(text):
    """Classify ``text`` with ``model_2`` and report per-class probabilities.

    The text is lemmatized with the ``ru_core_news_sm`` pipeline (``nlp_2``),
    embedded with ``get_avg_w2v`` (RuBERT mean pooling) and passed to
    ``model_2``.

    The lemmatization is done by a private helper rather than by calling
    ``preprocess_text``: that name is defined twice in this notebook, and in
    top-to-bottom order it refers to the GloVe feature extractor, which returns
    an array the tokenizer cannot consume.

    Args:
        text: The raw sentence to classify.

    Returns:
        dict: ``text`` (the input), ``processed_text`` (the lemmatized string),
        ``label`` (the most probable class name) and ``probabilities``
        (class name -> probability as ``float``).
    """
    # Preprocess the input sentence
    processed_text = _lemmatize_for_bert(text)

    # Convert the processed text to an embedding vector
    vector = get_avg_w2v(processed_text, tokenizer, bert_model)

    # NOTE(review): only the first 767 components are kept, presumably because
    # model_2 was trained on 767 input features — confirm against the training code.
    vector = vector[:767]

    # Predict class probabilities for a batch containing this single vector
    probabilities = model_2.predict(np.array([vector]))
    scores = probabilities[0]

    # Probability per class name
    probabilities_dict = {
        id_to_label[class_id]: float(score) for class_id, score in enumerate(scores)
    }

    # Most probable class
    predicted_label = id_to_label[int(np.argmax(scores))]

    return {
        "text": text,
        "processed_text": processed_text,
        "label": predicted_label,
        "probabilities": probabilities_dict,
    }

In [20]:
# Keras model that the example cell below calls with the feature vector from
# preprocess_text(); its output is thresholded at 0.5 in the last cell.
model = load_model("../server/model.keras")

  saveable.load_own_variables(weights_store.get(inner_path))


In [15]:
# Sample sentence to score
text = "Хочете допомогу Києву, платіть колоніальні репарації: влада Євросоюзу зробила велику помилку на переговорах з Латинською Америкою."

# Build the feature vector for the sample
features = preprocess_text(text)

# The model expects a batch, so prepend a batch axis of size 1 before predicting
prediction = model.predict(features[np.newaxis, ...])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step




In [16]:
# Threshold the model output at 0.5 to obtain the binary class.
predicted_label = (prediction >= 0.5).astype(int)
print(predicted_label)  # Output: [[1]]

# .item() reads the single prediction as a Python int (it raises if the array
# holds more than one element, as the array-to-list comparison in a bare `if` would).
if predicted_label.item() == 1:
    # TODO: this cell ended at the `if` header with no body, which is a SyntaxError;
    # add the handling for the positive class here.
    pass
    

[[1]]
