In [1]:
import pandas as pd
import numpy as np
import re
import tensorflow
import csv

from transformers import *
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalAveragePooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


**ENGLISH LANGUAGE CLEANING**

In [2]:
# English stop words from the NLTK corpus, stored as a set so that
# remove_stopwords() can test membership without scanning a list.
STOPWORDS = set(stopwords.words('english'))

def remove_urls(text):
    """Return ``text`` with every ``http(s)://...`` or ``www. ...`` link deleted.

    A link is matched up to the next whitespace character; the surrounding
    whitespace itself is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_html(text):
    """Return ``text`` with every ``<...>`` tag removed.

    The match is non-greedy, so each tag is removed on its own and the text
    between an opening and a closing tag is kept.
    """
    return re.sub(r'<.*?>', '', text)

def lower(text):
    """Return a lower-cased copy of ``text``."""
    return text.lower()

def remove_num(text):
    """Return ``text`` with every run of digits deleted."""
    digits = re.compile(r'\d+')
    return digits.sub('', text)

def punct_remove(text):
    """Return ``text`` keeping only word characters and whitespace.

    Every character that is not matched by ``\\w``, ``\\s`` or ``\\d`` is
    deleted; underscores count as word characters and therefore survive.
    """
    non_word = re.compile(r"[^\w\s\d]")
    return non_word.sub("", text)

def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` found in ``STOPWORDS``.

    ``text`` is converted with ``str()`` first. The comparison is exact
    (case-sensitive), and the surviving tokens are re-joined with single spaces.
    """
    kept = []
    for token in str(text).split():
        if token in STOPWORDS:
            continue
        kept.append(token)
    return " ".join(kept)

def remove_mention(x):
    """Return ``x`` with every ``@handle`` (``@`` followed by word characters) deleted."""
    mention = re.compile(r'@\w+')
    return mention.sub('', x)

def remove_hash(x):
    """Return ``x`` with every ``#tag`` (``#`` followed by word characters) deleted."""
    hashtag = re.compile(r'#\w+')
    return hashtag.sub('', x)

def remove_space(text):
    """Collapse each run of whitespace in ``text`` to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

**Russian Language Cleaning**

In [3]:
# ", ".join(stopwords.words('russian'))
# STOPWORDS = set(stopwords.words('russian'))

# def remove_urls_ru(text):
#     url_remove = re.compile(r'https?://\S+|www\.\S+')
#     return url_remove.sub(r'', text)

# def remove_html_ru(text):
#     html=re.compile(r'<.*?>')
#     return html.sub(r'',text)

# def lower_ru(text):
#     low_text= text.lower()
#     return low_text

# def remove_num_ru(text):
#     remove= re.sub(r'\d+', '', text)
#     return remove

# def punct_remove_ru(text):
#     punct = re.sub(r"[^\w\s\d]","", text)
#     return punct

# def remove_stopwords_ru(text):
#     """custom function to remove the stopwords"""
#     return " ".join([word for word in str(text).split() if word not in STOPWORDS])

# def remove_mention_ru(x):
#     text=re.sub(r'@\w+','',x)
#     return text

# def remove_hash_ru(x):
#     text=re.sub(r'#\w+','',x)
#     return text

# def remove_space_ru(text):
#     space_remove = re.sub(r"\s+"," ",text).strip()
#     return space_remove

**NLP PROCESS**

In [4]:
# def nlp_process_ru(df):
#     df['question']=df['question'].apply(lambda x:remove_urls_ru(x))
#     df['question']=df['question'].apply(lambda x:remove_html_ru(x))
#     df['question']=df['question'].apply(lambda x:lower_ru(x))
#     df['question']=df['question'].apply(lambda x:remove_num_ru(x))
#     df['question']=df['question'].apply(lambda x:punct_remove_ru(x))
#     df['question']=df['question'].apply(lambda x:remove_stopwords_ru(x))
#     df['question']=df['question'].apply(lambda x:remove_mention_ru(x))
#     df['question']=df['question'].apply(lambda x:remove_hash_ru(x))
#     df['question']=df['question'].apply(lambda x:remove_space_ru(x))
#     return df

def nlp_process(df):
    """Clean the ``question`` column of ``df`` and return the same frame.

    Each question is passed through the cleaning helpers in this order:
    URLs, HTML tags, @mentions, #hashtags, lower-casing, digits, punctuation,
    stop words, and finally whitespace normalisation.

    Mentions and hashtags are removed *before* punctuation is stripped.
    ``punct_remove`` deletes the ``@`` and ``#`` characters, so running it
    first would leave ``remove_mention`` / ``remove_hash`` with nothing to
    match and the handle/tag words would stay in the text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``question`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The frame that was passed in; its ``question`` column is overwritten
        with the cleaned text (the input is modified, not copied).
    """
    steps = (
        remove_urls,
        remove_html,
        remove_mention,    # must precede punct_remove, which strips '@'
        remove_hash,       # must precede punct_remove, which strips '#'
        lower,
        remove_num,
        punct_remove,
        remove_stopwords,  # after lower/punct_remove so tokens are bare lower-case words
        remove_space,
    )

    def clean(text):
        # Feed the output of every step into the next one.
        for step in steps:
            text = step(text)
        return text

    df['question'] = df['question'].apply(clean)
    return df

**Attraction Questions: Data Collection and Paraphrase Augmentation**

In [5]:
# def get_paraphrased_sentences(model, tokenizer, sentence, num_return_sequences=10, num_beams=10):
#   inputs = tokenizer([sentence], truncation=True, padding="longest", return_tensors="pt")
  
#   outputs = model.generate(
#     **inputs,
#     num_beams=num_beams,
#     num_return_sequences=num_return_sequences,
#   )
#   return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [6]:
# model = PegasusForConditionalGeneration.from_pretrained("tuner007/pegasus_paraphrase")
# tokenizer = PegasusTokenizerFast.from_pretrained("tuner007/pegasus_paraphrase")

In [7]:
# df = pd.read_csv('../datasets/about/data.csv')

In [8]:
# dataset_q = list(df['question'])
# dataset_a = list(df['answer'])
# dataset_t = list(df['type'])

In [9]:
# result = []

# for i, question in enumerate(dataset_q):
#     sentences = get_paraphrased_sentences(model, tokenizer, question, num_beams=10, num_return_sequences=10)

#     for sentence in sentences:
#         entry = {'question': sentence, 'answer': dataset_a[i], 'type': dataset_t[i]}
#         result.append(entry)


In [10]:
# csv_file_path = "../datasets/about/data-proccess.csv"

# with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
#     writer = csv.writer(file)
#     writer.writerow(['question', 'answer', 'type'])
    

#     for entry in result:
#         writer.writerow([entry['question'], entry['answer'], entry['type']])

In [11]:
# Load the attraction question/answer dataset and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the
# CSV was written together with its index - confirm, and consider index_col=0.
df = pd.read_csv(filepath_or_buffer='../datasets/about/main_dataset.csv')

df.head(n=5)

Unnamed: 0,question,answer,type
0,What is the title of the attraction?,Ethno-memorial complex «Atameken».,0
1,What is the name of the attraction?,Ethno-memorial complex «Atameken».,0
2,The attraction has a title.,Ethno-memorial complex «Atameken».,0
3,What is the attraction's title?,Ethno-memorial complex «Atameken».,0
4,What is the attraction's name?,Ethno-memorial complex «Atameken».,0


In [12]:
# Run the text-cleaning pipeline over the 'question' column, then preview
# the cleaned rows.
df = df.pipe(nlp_process)

df.head(n=5)

Unnamed: 0,question,answer,type
0,title attraction,Ethno-memorial complex «Atameken».,0
1,name attraction,Ethno-memorial complex «Atameken».,0
2,attraction title,Ethno-memorial complex «Atameken».,0
3,attractions title,Ethno-memorial complex «Atameken».,0
4,attractions name,Ethno-memorial complex «Atameken».,0


In [13]:
# Install xgboost through the interpreter that runs this kernel
# (sys.executable), so the package goes into the environment the notebook
# imports from rather than whichever `pip` happens to be first on PATH.
# NOTE(review): no version is pinned here - consider pinning one for reproducibility.
import sys
!{sys.executable} -m pip install xgboost



In [14]:
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from joblib import dump
from xgboost import XGBClassifier

# Estimators with their hyper-parameter grids. The `classifier__` prefix on
# each key matches the 'classifier' step name of the pipeline built below.
# NOTE: this list is deliberately not named `models`. The training loop
# iterates `models`, and only the candidates listed there are fitted; nothing
# in this cell consumes the grids yet.
model_search_space = [
    {
        'name': 'Logistic Regression',
        'model': LogisticRegression(max_iter=1000),
        'params': {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]}
    },
    {
        'name': 'Support Vector Machine',
        'model': SVC(),
        'params': {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100], 'classifier__gamma': ['scale', 'auto']}
    },
    {
        'name': 'Random Forest',
        'model': RandomForestClassifier(),
        'params': {'classifier__n_estimators': [50, 100, 200], 'classifier__max_depth': [None, 10, 20]}
    },
    {
        'name': 'XGBoost',
        'model': XGBClassifier(),
        'params': {'classifier__learning_rate': [0.01, 0.1, 0.3], 'classifier__max_depth': [3, 5, 7], 'classifier__n_estimators': [50, 100, 200]}
    }
]

# (name, estimator) pairs that are fitted and compared for every question type.
models = [('Logistic', LogisticRegression())]

best_models = {}  # question type -> best fitted pipeline for that type

for type_value in range(16):
    filtered_df = df[df['type'] == type_value]

    # Split this type's rows into training and test sets.
    X_train, X_test, y_train, y_test = train_test_split(filtered_df['question'], filtered_df['answer'], test_size=0.2, random_state=42)

    # Track the best candidate seen so far for the current type. Starting from
    # None (rather than 0) guarantees that the first candidate is always
    # stored, even when its accuracy is 0.0, so the dump() below cannot hit a
    # missing key.
    best_accuracy = None
    best_model_name = None

    for name, model in models:
        # Text vectorisation followed by classification.
        pipeline = Pipeline([
            ('vectorizer', CountVectorizer()),
            # clone() hands every type its own unfitted copy of the estimator;
            # without it the pipelines kept in best_models would all share one
            # estimator object that is refitted on each iteration.
            ('classifier', clone(model))
        ])

        # Fit on the training split.
        pipeline.fit(X_train, y_train)

        # Predict on the held-out split.
        predictions = pipeline.predict(X_test)

        # Score the candidate.
        accuracy = accuracy_score(y_test, predictions)

        # Keep the candidate if it beats the best one seen so far.
        if best_accuracy is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model_name = name
            best_models[type_value] = pipeline

    # Persist only the best pipeline for the current type.
    best_model_filename = f"../datasets/about/predict_model_{type_value}.joblib"
    dump(best_models[type_value], best_model_filename)

    # Report the outcome.
    print(f"Best Model for Type {type_value}: {best_model_name}")
    print(f"Accuracy: {best_accuracy}")
    print(f"Model saved to {best_model_filename}")
    print("-------------------------------------")


Best Model for Type 0: Logistic
Accuracy: 0.9473684210526315
Model saved to ../datasets/about/predict_model_0.joblib
-------------------------------------
Best Model for Type 1: Logistic
Accuracy: 0.9024390243902439
Model saved to ../datasets/about/predict_model_1.joblib
-------------------------------------
Best Model for Type 2: Logistic
Accuracy: 0.7692307692307693
Model saved to ../datasets/about/predict_model_2.joblib
-------------------------------------
Best Model for Type 3: Logistic
Accuracy: 0.7619047619047619
Model saved to ../datasets/about/predict_model_3.joblib
-------------------------------------
Best Model for Type 4: Logistic
Accuracy: 0.775
Model saved to ../datasets/about/predict_model_4.joblib
-------------------------------------
Best Model for Type 5: Logistic
Accuracy: 0.7857142857142857
Model saved to ../datasets/about/predict_model_5.joblib
-------------------------------------
Best Model for Type 6: Logistic
Accuracy: 0.7317073170731707
Model saved to ../data

In [15]:
from joblib import load

# Raw questions to send through the saved model.
sentences_to_test = [
   "What is the address?"
]

# Wrap the questions in a frame and clean them with the same pipeline that
# was applied to the training data.
test = nlp_process(pd.DataFrame({"question": sentences_to_test}))

# Question type whose saved pipeline is loaded.
index = 2

loaded_model = load(f'../datasets/about/predict_model_{index}.joblib')

# Predict an answer for each cleaned question, one at a time.
for i, sentence in enumerate(test['question'].tolist()):
    predicted_class = loaded_model.predict([sentence])
    print(f"Question {i + 1}: {sentence}")
    print(f"Predicted Answer: {predicted_class[0]}")
    print("-------------------------------------")


Question 1: address
Predicted Answer: г. Астана, Коргалжинское шоссе, 2.
-------------------------------------


In [16]:


# # Обучение и сохранение моделей
# for name, model in models:
#     # Построение пайплайна для преобразования текста и классификации
#     pipeline = Pipeline([
#         ('vectorizer', CountVectorizer()),
#         ('classifier', model)
#     ])
    
#     # Обучение модели
#     pipeline.fit(X_train, y_train)
    
#     # Сохранение модели в файл
#     model_filename = f"{name.lower().replace(' ', '_')}_model.joblib"
#     dump(pipeline, model_filename)
    
#     # Предсказание на тестовом наборе
#     predictions = pipeline.predict(X_test)
    
#     # Оценка качества модели
#     accuracy = accuracy_score(y_test, predictions)
#     report = classification_report(y_test, predictions)
    
#     # Сохранение отчета
#     reports[name] = {'accuracy': accuracy, 'classification_report': report}
    
#     # Вывод результатов
#     print(f"Model: {name}")
#     print(f"Accuracy: {accuracy}")
#     print("Classification Report:")
#     print(report)
#     print(f"Model saved to {model_filename}")
#     print("-------------------------------------")


In [17]:
# sentences_to_test = [
#     "What is the title of the attraction?",
#     "Where is the attraction located?",
#     "How much does the tour in English language cost?",
#     "Who initiated the opening of the ethno-memorial complex 'Atameken'?",
#     "What discounts are available for disabled people?",
#     "What is phone number?",
#     "what they adress?"
# ]

# model_filename = "logistic_regression_model.joblib"
# loaded_model = load(model_filename)


# test = pd.DataFrame({"question": sentences_to_test})

# test = nlp_process(test)

# for sentence in test['question']:
#     predicted_class = loaded_model.predict([sentence])
#     print(f"Sentence: {sentence}")
#     print(f"Predicted Answer: {predicted_class[0]}")
#     print("-------------------------------------")


In [18]:


# # Ограничение типа вопроса при обучении модели
# X_train, X_test, y_train, y_test = train_test_split(df['question'], df['answer'], test_size=0.2, random_state=42, stratify=df['type'])

# # Преобразование текстов в векторы признаков с помощью TF-IDF
# vectorizer = TfidfVectorizer()
# X_train_tfidf = vectorizer.fit_transform(X_train)
# X_test_tfidf = vectorizer.transform(X_test)

# # Обучение классификатора RandomForest
# classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# classifier.fit(X_train_tfidf, y_train)

# # Предсказание на тестовом наборе данных
# y_pred = classifier.predict(X_test_tfidf)

# # Оценка точности модели
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy:", accuracy)

In [19]:
# question_type = 0
# new_sentences = [
#     "What is the title?",
#     "Where is located?",
#     "When they are closed?",
#     "Is the museum open on weekends?",
#     "Where i can get more information?",
# ]


# test = pd.DataFrame({"question": new_sentences})
# test = nlp_process(test)


# X_new = vectorizer.transform(test['question'])

# # Reset the index of the DataFrame after filtering based on question type
# filtered_indices = df[df['type'] == question_type].index
# filtered_df = df[df['type'] == question_type].reset_index(drop=True)

# # Use the reset DataFrame indices for filtering
# filtered_X_new = X_new[:, filtered_df.index]

# num_features = X_new.shape[1]
# full_filtered_X_new = np.zeros((X_new.shape[0], num_features))
# full_filtered_X_new[:, filtered_df.index] = filtered_X_new.toarray()

# predicted_answer = classifier.predict(full_filtered_X_new)
# print("Predicted Answer:", predicted_answer)
