In [1]:
import pandas as pd

# Read the query/field dataset from the Excel workbook
DATASET_PATH = 'queries_dataset.xlsx'
sports_df = pd.read_excel(DATASET_PATH)

# Preview the leading rows with the notebook's rich renderer
preview_rows = sports_df.head()
display(preview_rows)

# List every column together with its pandas dtype
column_dtypes = sports_df.dtypes
print(column_dtypes)

Unnamed: 0,Query,Field
0,What is the capital of Mexico?,Geography
1,How does quantum computing work?,Science
2,What is the stock price of Samsung?,Finance
3,How to learn Go programming?,Computer Science
4,What is bioinformatics in data science?,Data Science


Query    object
Field    object
dtype: object


In [2]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import classification_report, accuracy_score

# # Vectorizing the queries
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(sports_df['Query'])
# y = sports_df['Field']

# # Splitting the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# # Training a Logistic Regression model
# model = LogisticRegression(max_iter=1000)
# model.fit(X_train, y_train)

# # Predicting and evaluating the model
# y_pred = model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# report = classification_report(y_test, y_pred)

# print('Model trained and evaluated.')
# print('Accuracy Score:', accuracy)
# print('Classification Report:\n', report)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Features are the raw query strings; labels are the field assigned to each query
queries = sports_df['Query']
y = sports_df['Field']

# Split the raw text BEFORE fitting TF-IDF. Fitting the vectorizer on every row
# lets the held-out queries shape the vocabulary and IDF weights, which leaks
# test information into training and makes the reported scores optimistic.
# With the same random_state and number of samples, train_test_split should
# select the same rows as splitting the already-vectorized matrix did.
queries_train, queries_test, y_train, y_test = train_test_split(
    queries, y, test_size=0.2, random_state=0
)

# Learn the vocabulary and IDF weights from the training queries only
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(queries_train)
X_test = vectorizer.transform(queries_test)

# Full-dataset matrix, kept under its original name for any cell that reads `X`
X = vectorizer.transform(queries)

# Identical query text on both sides of the split would also inflate the score,
# so count the overlap and report it next to the metrics
n_shared_queries = int(queries_test.isin(queries_train).sum())

# Training a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predicting on the held-out queries
y_pred = model.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)

# Convert metrics to percentages. Per-label entries are dicts whose 'support'
# value is a sample count and is left untouched; the scalar 'accuracy' entry is
# scaled directly. A new dict is built rather than mutating `report` while
# iterating over it.
accuracy_percent = accuracy * 100
report = {
    label: (
        {
            name: value if name == 'support' else value * 100
            for name, value in metrics.items()
        }
        if isinstance(metrics, dict)
        else metrics * 100
    )
    for label, metrics in report.items()
}

print('Model trained and evaluated.')
print('Accuracy Score:', accuracy_percent, '%')
print('Test queries also present in the training split:', n_shared_queries)
print('Classification Report:\n', report)


Model trained and evaluated.
Accuracy Score: 100.0 %
Classification Report:
 {'Computer Science': {'precision': 100.0, 'recall': 100.0, 'f1-score': 100.0, 'support': 19.0}, 'Data Science': {'precision': 100.0, 'recall': 100.0, 'f1-score': 100.0, 'support': 16.0}, 'Economy': {'precision': 100.0, 'recall': 100.0, 'f1-score': 100.0, 'support': 15.0}, 'Finance': {'precision': 100.0, 'recall': 100.0, 'f1-score': 100.0, 'support': 43.0}, 'Geography': {'precision': 100.0, 'recall': 100.0, 'f1-score': 100.0, 'support': 21.0}, 'Health': {'precision': 100.0, 'recall': 100.0, 'f1-score': 100.0, 'support': 49.0}, 'History': {'precision': 100.0, 'recall': 100.0, 'f1-score': 100.0, 'support': 18.0}, 'Science': {'precision': 100.0, 'recall': 100.0, 'f1-score': 100.0, 'support': 28.0}, 'Sports': {'precision': 100.0, 'recall': 100.0, 'f1-score': 100.0, 'support': 181.0}, 'Trends': {'precision': 100.0, 'recall': 100.0, 'f1-score': 100.0, 'support': 10.0}, 'accuracy': 100.0, 'macro avg': {'precision': 10

In [4]:
# Example queries to test the model
example_queries = [
    'Who won the last NBA championship?',
    'What are the health benefits of yoga?',
    'Benefits of eating good food?',
    'What are the latest trends in AI technology?',
    'Tell me about the history of the Roman Empire'
]

# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# feature columns line up with the ones the model was trained on
example_queries_vectorized = vectorizer.transform(example_queries)

# Predicting the fields of interest for the example queries
predicted_fields = model.predict(example_queries_vectorized)

# Show each query above its predicted field, followed by a blank line.
# Real '\n' escapes are required here: a backslash followed by a physical line
# break inside a string literal is a line continuation and emits no newline,
# which would run the query and its prediction together on one line.
for query, field in zip(example_queries, predicted_fields):
    print(f'Query: {query}\nPredicted Field: {field}\n')

Query: Who won the last NBA championship? Predicted Field: Sports 
Query: What are the health benefits of yoga? Predicted Field: Health 
Query: Benefits of eating good food? Predicted Field: Health 
Query: What are the latest trends in AI technology? Predicted Field: Trends 
Query: Tell me about the history of the Roman Empire Predicted Field: History 


In [5]:
import pickle

# Save the Logistic Regression model to a pickle file
model_filename = 'field_of_interest.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(model, file)

# The model is fed the output of the fitted TF-IDF vectorizer, so it cannot
# score new text without that exact vectorizer. Persist it next to the model so
# both can be restored together; the model file itself is unchanged.
vectorizer_filename = 'field_of_interest_vectorizer.pkl'
with open(vectorizer_filename, 'wb') as file:
    pickle.dump(vectorizer, file)

print('Model has been saved as:', model_filename)
print('Vectorizer has been saved as:', vectorizer_filename)

Model has been saved as: field_of_interest.pkl
