In [48]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics import classification_report
# import nltk
# from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer
# from nltk.corpus import stopwords
import spacy
# import pymorphy2
from sklearn.preprocessing import LabelEncoder
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.utils import to_categorical
# from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix
from imblearn.over_sampling import SMOTE


In [49]:
dataset = pd.read_csv('data.csv', encoding='utf-8', sep=',')


In [50]:
dataset.head(4)


Unnamed: 0,Name,age,gender,Date,Month,Year,Type,Location,Description,Type of bear,Hunter,Grizzly,Hikers,Only one killed,Latitude,Longitude
0,Mary Porterfield,3.0,female,19/05/1901,May,1901,Wild,"Job, West Virginia",The children were gathering flowers near their...,Black bear,0,0,0,0,38.864277,-79.556998
1,Wilie Porterfield,5.0,male,19/05/1901,May,1901,Wild,"Job, West Virginia",The children were gathering flowers near their...,Black bear,0,0,0,0,38.864277,-79.556998
2,Henry Porterfield,7.0,male,19/05/1901,May,1901,Wild,"Job, West Virginia",The children were gathering flowers near their...,Black bear,0,0,0,0,38.864277,-79.556998
3,John Dicht,18.0,male,24/11/1906,Nov,1906,Wild,"Elk County, Pennsylvania","Thinking the bear was dead, Dicht began skinni...",Black bear,0,0,0,1,41.437362,-78.626009


In [51]:
dataset = dataset.rename(columns={'Type of bear':'Тип медведя'})

In [52]:
dataset = dataset[['Description','Тип медведя']]

In [53]:
dataset.to_string()




In [54]:
missing_value_count = dataset.isna().sum()
missing_value_count

Description    0
Тип медведя    0
dtype: int64

In [55]:
dataset.drop_duplicates()


Unnamed: 0,Description,Тип медведя
0,The children were gathering flowers near their...,Black bear
3,"Thinking the bear was dead, Dicht began skinni...",Black bear
4,After a bear escaped from a cage at Elysian Gr...,Black bear
5,Welch was killed at a camp near Sylvan Pass wh...,Brown bear
6,Duret was attacked and partially devoured by a...,Brown bear
...,...,...
160,Soltis was backpacking alone along the Eagle R...,Brown bear
161,A polar bear approached a man and his children...,Polar Bear
162,"Uptain, a guide for Martin Outfitters, was cle...",Brown bear
163,Montoya was working at a remote mining site on...,Brown bear


In [56]:
# Загружаем модель spaCy для английского языка
nlp = spacy.load("en_core_web_sm")

# Функция для предобработки текста


def preprocess_text(text):
    # Обработка текста с помощью spaCy
    doc = nlp(text)

    # Лемматизация и удаление стоп-слов
    tokens = [token.lemma_.lower()
              for token in doc if not token.is_stop and not token.is_punct]

    # Объединение токенов в строку
    clean_text = ' '.join(tokens)

    return clean_text


# Применяем очистку текста к данным
dataset['Очищенное описание инцидента'] = dataset['Description'].apply(
    preprocess_text)


In [57]:
dataset


Unnamed: 0,Description,Тип медведя,Очищенное описание инцидента
0,The children were gathering flowers near their...,Black bear,child gather flower near home attack member se...
1,The children were gathering flowers near their...,Black bear,child gather flower near home attack member se...
2,The children were gathering flowers near their...,Black bear,child gather flower near home attack member se...
3,"Thinking the bear was dead, Dicht began skinni...",Black bear,think bear dead dicht begin skin bear immediat...
4,After a bear escaped from a cage at Elysian Gr...,Black bear,bear escape cage elysian grove pleasure park b...
...,...,...,...
161,A polar bear approached a man and his children...,Polar Bear,polar bear approach man child sentry island ma...
162,"Uptain, a guide for Martin Outfitters, was cle...",Brown bear,uptain guide martin outfitters clean elk clien...
163,Montoya was working at a remote mining site on...,Brown bear,montoya work remote mining site admiralty isla...
164,The mother and child were attacked near their ...,Brown bear,mother child attack near cabin trip manage tra...


In [58]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset['Очищенное описание инцидента']).toarray()
# y = dataset['Тип медведя']

In [59]:
#Преобразование меток в числовой формат
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(dataset['Тип медведя'])
# y = to_categorical(y)  # Преобразование в категориальный формат

In [61]:
# Фильтрация тестовых данных (по 5 примеров из каждого класса)
brown_bears = dataset[dataset['Тип медведя'] == 'Brown bear']
black_bears = dataset[dataset['Тип медведя'] == 'Black bear']
polar_bears = dataset[dataset['Тип медведя'] == 'Polar Bear']

brown_bears_test = brown_bears.sample(n=5, random_state=42)
black_bears_test = black_bears.sample(n=5, random_state=42)
polar_bears_test = polar_bears.sample(n=5, random_state=42)

# Объединение тестовых данных
test_data = pd.concat([brown_bears_test, black_bears_test, polar_bears_test])

# Удаление тестовых данных из исходного набора, чтобы оставить только тренировочные
train_data = dataset.drop(test_data.index)


In [62]:

# Векторизация для тренировочных данных
X_train = vectorizer.transform(
    train_data['Очищенное описание инцидента']).toarray()
y_train = label_encoder.transform(train_data['Тип медведя'])

X_test = vectorizer.transform(
    test_data['Очищенное описание инцидента']).toarray()
y_test = label_encoder.transform(test_data['Тип медведя'])


In [63]:
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)




In [64]:
print(pd.Series(y_balanced).value_counts())


0    80
1    80
2    80
Name: count, dtype: int64


In [65]:
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced, test_size=0.3, random_state=42)


In [66]:
# Создание и обучение модели градиентного бустинга
model = GradientBoostingClassifier(
    n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
model.fit(X_train, y_train)

# Предсказание классов на тестовом наборе
y_pred = model.predict(X_test)

# Оценка различных метрик
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average = 'weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
# auc_roc = roc_auc_score(y_test, y_pred, average='weighted')

# Вывод всех метрик
print(f'accuracy: {accuracy:.4f}')
print(f'precision: {recall:.4f}')
print(f'recall: {precision:.4f}')
print(f'F1-мера: {f1:.4f}')
# print(f'AUC-ROC: {auc_roc:.4f}')

# Матрица ошибок
cm = confusion_matrix(y_test, y_pred)
print("Матрица ошибок:")
print(cm)


accuracy: 0.8056
precision: 0.8056
recall: 0.8072
F1-мера: 0.8056
Матрица ошибок:
[[17  6  0]
 [ 8 17  0]
 [ 0  0 24]]


In [73]:
import matplotlib.pyplot as plt
import numpy as np

# Установим количество перемешиваний
n_shuffles = 10  # Можно изменить на любое нужное количество

# Список для хранения точности на каждом шаге
accuracies = []

# Перемешиваем тренировочные данные несколько раз
for i in range(n_shuffles):
    # Перемешиваем тренировочные данные случайным образом
    X_train_shuffled, y_train_shuffled = shuffle(
        X_train, y_train, random_state=i)

    # Балансировка классов с использованием SMOTE
    X_train_balanced, y_train_balanced = smote.fit_resample(
        X_train_shuffled, y_train_shuffled)

    # Создание и обучение модели
    model = GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
    model.fit(X_train_balanced, y_train_balanced)

    # Предсказание на тестовой выборке
    y_pred = model.predict(X_test)

    # Оценка точности
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

# Построение гистограммы точностей
plt.figure(figsize=(8, 6))
plt.hist(accuracies, bins=n_shuffles, edgecolor='black')
plt.title('Распределение точности после перемешивания признаков')
plt.xlabel('Точность')
plt.ylabel('Количество повторений')
plt.grid(True)
plt.show()


ImportError: cannot import name 'mplDeprecation' from 'matplotlib._api.deprecation' (c:\Python\lib\site-packages\matplotlib\_api\deprecation.py)