# Пайплайн для лог-регрессии

In [104]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import spacy
from sklearn.preprocessing import label_binarize
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [105]:
# Fetch the NLTK data used by word_tokenize and the stop-word list, then load
# the small English spaCy model that the preprocessing step lemmatizes with.
for nltk_package in ('punkt', 'stopwords'):
    nltk.download(nltk_package)
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /Users/macbook/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/macbook/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [106]:
# Load the dataset; later cells read its 'question', 'target' and 'file' columns.
df = pd.read_csv(filepath_or_buffer='Processed_data.csv')

In [107]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, built only once."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalize one raw text into a space-separated string of lemmas.

    The text is lower-cased and tokenized with NLTK's ``word_tokenize``;
    tokens that are not purely alphanumeric or that are English stop words
    are discarded; the survivors are re-joined and run through the spaCy
    pipeline ``nlp`` to obtain each token's lemma.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example the
            NaN pandas yields for an empty CSV cell, or ``None``) is treated
            as an empty document.

    Returns:
        The lemmas joined by single spaces; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        # Without this guard ``text.lower()`` raises AttributeError on the
        # first missing value when the function is applied to a column.
        return ''

    # The stop-word set never changes between calls, so it is cached instead
    # of being re-read from the corpus for every row.
    stop_words = _english_stop_words()
    kept_tokens = [
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]
    doc = nlp(' '.join(kept_tokens))
    return ' '.join(token.lemma_ for token in doc)

In [108]:
# Run the text normalization over every question and store the result in a
# new column.
df['qst_processed'] = df['question'].map(preprocess_text)

In [109]:
# Turn the labels in 'target' into integer class ids. The fitted encoder is
# kept in `encoder` so the ids can be decoded again later.
encoder = LabelEncoder()
encoder.fit(df['target'])
y = encoder.transform(df['target'])

In [110]:
# Remove the raw text, the label and the 'file' column in place; what is left
# in df is what gets copied into the feature frame X.
df.drop(labels=['question', 'target', 'file'], axis='columns', inplace=True)

In [111]:
# Extra numeric feature: character length of the processed question.
df['qst_len'] = df['qst_processed'].map(len)

In [112]:
# Work on an independent copy so later changes to X do not touch df.
X = df.copy(deep=True)

In [113]:
# Feature step: TF-IDF on the processed question text; every other column of
# the input frame is passed through to the model unchanged.
feature_step = ColumnTransformer(
    transformers=[
        ('tfidf', TfidfVectorizer(stop_words='english'), 'qst_processed'),
    ],
    remainder='passthrough',
)

# Classifier step: one-vs-rest logistic regression.
model_step = LogisticRegression(C=103, max_iter=10000, multi_class='ovr')

# Chain both steps under the names 'tfidf' and 'model'.
pipeline = Pipeline(steps=[
    ('tfidf', feature_step),
    ('model', model_step),
])

In [114]:
# Keep 75% of the rows for training and the rest for evaluation; the fixed
# random_state makes the split reproducible.
split_parts = train_test_split(X, y, train_size=0.75, random_state=43)
X_train, X_test, y_train, y_test = split_parts

In [115]:
# Fit the vectorizer and the classifier on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [117]:
# Binarize the true labels against the classes the model was fitted on rather
# than the classes that happen to occur in y_test. predict_proba returns one
# column per fitted class, so this keeps y_test_bin and test_proba aligned
# column-for-column; with np.unique(y_test) a class missing from either split
# would give a shape mismatch or a score computed on misaligned columns.
y_test_bin = label_binarize(y_test, classes=pipeline.classes_)
test_pred = pipeline.predict(X_test)
test_proba = pipeline.predict_proba(X_test)

# Per-class precision/recall/F1, followed by the ROC AUC computed from the
# binarized labels and the predicted class probabilities.
print(classification_report(y_test, test_pred))
print('ROC_AUC', roc_auc_score(y_test_bin, test_proba))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       1.00      1.00      1.00         4
           2       1.00      0.75      0.86         8
           3       0.60      0.90      0.72        10
           4       0.74      0.80      0.77        25
           5       1.00      0.82      0.90        11
           6       0.86      0.67      0.75         9

    accuracy                           0.79        68
   macro avg       0.74      0.70      0.71        68
weighted avg       0.81      0.79      0.79        68

ROC_AUC 0.9361094490518383
