# Пайплайн для логистической регрессии

In [104]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import spacy
from sklearn.preprocessing import label_binarize
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [105]:
# Fetch the NLTK resources used by preprocess_text (tokenizer models and
# stop-word lists), then load the small English spaCy pipeline that provides
# the lemmas.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /Users/macbook/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/macbook/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [128]:
# Load the dataset; later cells read its 'question', 'target' and 'file' columns.
df = pd.read_csv('Processed_data.csv')

In [129]:
# Exclude rows whose target is 'unit 2' or '?'.
# .copy() makes the result an independent frame, so the column assignments and
# drops performed in later cells act on this frame itself rather than on a
# slice of the loaded data (avoids pandas' SettingWithCopyWarning ambiguity).
df = df[~df['target'].isin(['unit 2', '?'])].copy()

In [130]:
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def preprocess_text(text):
    """Normalize one question string into a space-joined sequence of lemmas.

    The text is lower-cased and tokenized with NLTK; tokens that are not
    purely alphanumeric or that are English stop words are discarded; the
    remaining tokens are lemmatized with the spaCy pipeline ``nlp``.

    Args:
        text: Raw text. Any non-string value (for example the NaN pandas
            produces for an empty cell) is treated as empty text instead of
            raising on ``.lower()``.

    Returns:
        str: The lemmas joined by single spaces, or ``''`` when no token
        survives filtering.
    """
    if not isinstance(text, str):
        return ''
    # The stop-word list is loaded once and reused across calls; rebuilding
    # it for every row is wasted work when this runs under DataFrame.apply.
    stop_words = _english_stop_words()
    kept_tokens = [
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]
    if not kept_tokens:
        # Nothing to lemmatize; skip the spaCy call, the result is '' anyway.
        return ''
    doc = nlp(' '.join(kept_tokens))
    return ' '.join(token.lemma_ for token in doc)

In [131]:
# Build the cleaned, lemmatized text column that the TF-IDF step of the pipeline reads.
df['qst_processed'] = df['question'].apply(preprocess_text)

In [132]:
# Learn the label -> integer mapping from the target column, then encode it.
# `encoder` is kept so predictions can be decoded back to label names later.
encoder = LabelEncoder().fit(df['target'])
y = encoder.transform(df['target'])

In [133]:
# Remove the raw text, the label and the 'file' column so that only model
# inputs remain in the frame.
df = df.drop(columns=['question', 'target', 'file'])

In [134]:
# Extra numeric feature: character length of the processed question text.
df['qst_len'] = df['qst_processed'].str.len()

In [135]:
# Feature matrix: an independent copy of the remaining columns of df.
X = df.copy()

# Лучшая модель (логистическая регрессия)

In [136]:
# Feature stage: TF-IDF on the processed text column; every other column of
# the input frame is passed through unchanged.
column_features = ColumnTransformer(
    transformers=[
        ('tfidf', TfidfVectorizer(stop_words='english'), 'qst_processed'),
    ],
    remainder='passthrough',
)

# Classifier stage: one-vs-rest logistic regression.
log_reg = LogisticRegression(C=103, max_iter=10000, multi_class='ovr')

# Full model: feature stage followed by the classifier.
pipeline = Pipeline(steps=[
    ('tfidf', column_features),
    ('model', log_reg),
])

In [137]:
# Use 75% of the rows for training and the rest for evaluation; random_state
# fixes the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified, and the per-class supports in the
# report below are uneven - consider stratify=y, but confirm first because it
# changes the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=43)

In [138]:
# Fit the feature stage and the classifier on the training split only.
pipeline.fit(X_train, y_train)

In [139]:
# Evaluate on the held-out split.
test_pred = pipeline.predict(X_test)
test_proba = pipeline.predict_proba(X_test)

# Binarize the true labels against the classes the model was fitted on, not
# against the labels that happen to occur in y_test: predict_proba has one
# column per fitted class, so this keeps the columns of y_test_bin and
# test_proba aligned even when a class is missing from the test split.
y_test_bin = label_binarize(y_test, classes=pipeline.classes_)

print(classification_report(y_test, test_pred))
print('ROC_AUC', roc_auc_score(y_test_bin, test_proba))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89         9
           1       0.62      0.83      0.71         6
           2       0.90      0.75      0.82        12
           3       0.86      0.89      0.87        27
           4       1.00      0.86      0.92         7

    accuracy                           0.85        61
   macro avg       0.85      0.84      0.84        61
weighted avg       0.86      0.85      0.85        61

ROC_AUC 0.9486014732233219


# Тест на новом экзамене

In [185]:
# Load the unseen questions; later cells read its 'question' and 'file' columns.
new_questions = pd.read_excel('./new_questions.xlsx')

In [186]:
# Apply the same text preprocessing that was used for the training data.
new_questions['qst_processed'] = new_questions['question'].apply(preprocess_text)

In [187]:
# Same length feature as in training: character count of the processed text.
new_questions['qst_len'] = new_questions['qst_processed'].str.len()

In [188]:
# Set a constant 'year' column for every new question.
# NOTE(review): presumably this mirrors a 'year' column in the training data
# and 2022 is the year of the new exam - confirm, and consider a named constant.
new_questions['year'] = 2022

In [189]:
# Keep the original question text for the result tables; the 'question'
# column is dropped from new_questions in the next cell.
questions_full = new_questions['question']

In [190]:
# Remove the raw text and the 'file' column so only model inputs remain.
new_questions = new_questions.drop(columns=['question', 'file'])

In [191]:
# Predict on exactly the columns the pipeline was trained on, in the training
# order. new_questions is assembled separately from the training frame, so its
# columns may be ordered differently or include extras; selecting X.columns
# makes the match explicit and raises a clear KeyError if a feature is missing.
new_qst_pred = pipeline.predict(new_questions[X.columns])

In [192]:
# Display the encoded class predictions.
new_qst_pred

array([3, 4, 0, 3, 1, 2, 3, 0, 2, 2])

In [193]:
# Pair each original question with its predicted label, decoded from the
# integer class back to the label name.
qst_to_target = pd.DataFrame({
    'question': questions_full.tolist(),
    'target': list(encoder.inverse_transform(new_qst_pred)),
})

In [194]:
# Display the question -> predicted label table.
qst_to_target

Unnamed: 0,question,target
0,Define the term ‘quantitative data’ . (Extract...,meeting customer needs
1,Construct a supply and demand diagram to show ...,the market
2,Analyse two entrepreneurial characteristics sh...,entrepreneurs and leaders
3,Discuss the benefits for Vaibhav Anant of usin...,meeting customer needs
4,Assess if there is likely to be a trade-off be...,managing people
5,"Define the term ‘social trends’ . (Extract D, ...",marketing mix and strategy
6,"Using the data, calculate the increase in mont...",meeting customer needs
7,Analyse two benefits for Aldi of having emplo...,entrepreneurs and leaders
8,Assess the benefits for Aldi of sponsoring th...,marketing mix and strategy
9,Assess the likely impact of price comparison w...,marketing mix and strategy


In [195]:
# Save the logistic-regression predictions; the DataFrame index is written
# as well because index=False is not passed.
qst_to_target.to_excel('new_questions_pred_log_reg.xlsx')

# Предсказание наивного байесовского классификатора (Naive Bayes)

In [196]:
import pickle

In [204]:
# Load the model object stored in 'naive_bayes.pkl'.
# NOTE(security): pickle.load can execute arbitrary code while unpickling -
# only load this file if it comes from a trusted source.
with open('naive_bayes.pkl', 'rb') as f:
    naive_bayes = pickle.load(f)

In [205]:
# Predict with the unpickled model on the same feature frame that was given
# to the logistic-regression pipeline.
# NOTE(review): assumes the pickled model expects these columns - confirm
# against how it was trained.
new_qst_bayes_pred = naive_bayes.predict(new_questions)

In [206]:
# Display the encoded class predictions of the unpickled model.
new_qst_bayes_pred

array([3, 4, 0, 3, 0, 3, 4, 0, 2, 2])

In [207]:
# Pair each original question with the label predicted by the unpickled model.
# NOTE(review): decoding with `encoder` assumes the pickled model was trained
# on labels encoded the same way as in this notebook - confirm.
qst_to_target_bayes = pd.DataFrame({
    'question': questions_full.tolist(),
    'target': list(encoder.inverse_transform(new_qst_bayes_pred)),
})

In [None]:
# Display the question -> predicted label table for the unpickled model.
qst_to_target_bayes

Unnamed: 0,question,target
0,Define the term ‘quantitative data’ . (Extract...,meeting customer needs
1,Construct a supply and demand diagram to show ...,the market
2,Analyse two entrepreneurial characteristics sh...,entrepreneurs and leaders
3,Discuss the benefits for Vaibhav Anant of usin...,meeting customer needs
4,Assess if there is likely to be a trade-off be...,entrepreneurs and leaders
5,"Define the term ‘social trends’ . (Extract D, ...",meeting customer needs
6,"Using the data, calculate the increase in mont...",the market
7,Analyse two benefits for Aldi of having emplo...,entrepreneurs and leaders
8,Assess the benefits for Aldi of sponsoring th...,marketing mix and strategy
9,Assess the likely impact of price comparison w...,marketing mix and strategy


In [209]:
# Save the unpickled model's predictions; the DataFrame index is written as
# well because index=False is not passed.
qst_to_target_bayes.to_excel('new_questions_pred_bayes.xlsx')