In [1]:
# Step 1: Data Exploration and Preprocessing
import pandas as pd

In [2]:
# Load the training data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
print(train_data.head())

   unique ID                                         query_text  \
0          0                       HTTPS://VIMEO.COM/107297364﻿   
1          1                                    Art and Culture   
2          2  Differentiate between chemical and biological ...   
3          3                                  nth Term of an AP   
4          4                 bunmei kaika: aoiza ibunroku saien   

            category  
0               junk  
1            general  
2  academic_servable  
3  academic_servable  
4               junk  


In [4]:
print(test_data.head())

   unique ID                                         query_text
0          0                                                yes
1          1  If x = a 2 b 3 c 4 and y = a b 4 c 2 find the ...
2          2                                x + 7y=10;3x - 2y=7
3          3  Given tan β = cos θ tan α \[ \begin{array}{l} ...
4          4   Identify the chiral molecule from the following.


In [5]:
print(train_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   unique ID   5000 non-null   int64 
 1   query_text  5000 non-null   object
 2   category    5000 non-null   object
dtypes: int64(1), object(2)
memory usage: 117.3+ KB
None


In [6]:
print(test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   unique ID   1000 non-null   int64 
 1   query_text  1000 non-null   object
dtypes: int64(1), object(1)
memory usage: 15.8+ KB
None


In [7]:
# Check for missing values
print(train_data.isnull().sum())

unique ID     0
query_text    0
category      0
dtype: int64


In [8]:
# Check for missing values
print(test_data.isnull().sum())

unique ID     0
query_text    0
dtype: int64


In [9]:
# Explore the distribution of categories
print(train_data['category'].value_counts())

category
academic_servable        1750
junk                     1000
general                  1000
academic_non_servable     750
conversational            500
Name: count, dtype: int64


In [14]:
#Text Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# Text preprocessing function
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text.lower())
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

In [13]:
# Apply text preprocessing to the query_text column
train_data['clean_text'] = train_data['query_text'].apply(preprocess_text)

In [15]:
# Feature Engineering
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# Convert text data into numerical features using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(train_data['clean_text'])

In [18]:
# Model Building
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [19]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, train_data['category'], test_size=0.2, random_state=42)


In [20]:
# Train a Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

In [21]:
# Make predictions
y_pred = nb_model.predict(X_test)

In [22]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.632
                       precision    recall  f1-score   support

academic_non_servable       0.89      0.52      0.65       141
    academic_servable       0.51      0.95      0.67       332
       conversational       0.92      0.32      0.47       103
              general       0.82      0.62      0.71       229
                 junk       0.72      0.35      0.47       195

             accuracy                           0.63      1000
            macro avg       0.77      0.55      0.60      1000
         weighted avg       0.72      0.63      0.62      1000



In [23]:
# Assuming 'test_data' is the test set with 'unique ID' and 'query_text' columns
test_data['clean_text'] = test_data['query_text'].apply(preprocess_text)
X_submission = tfidf_vectorizer.transform(test_data['clean_text'])
submission_predictions = nb_model.predict(X_submission)

In [24]:
submission = pd.DataFrame({'unique ID': test_data['unique ID'], 'category': submission_predictions})
submission.to_csv('submission.csv', index=False)