In [12]:
import pandas as pd
from nltk.tokenize import word_tokenize

In [4]:
train = pd.read_json("D:\skills_assessment_data/train.json")
test = pd.read_json("D:\skills_assessment_data/test.json")

In [8]:
train.label.value_counts()

label
1    12500
0    12500
Name: count, dtype: int64

In [10]:
train.head(2)

Unnamed: 0,text,label
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1


In [15]:
import nltk
import re

train["text"] = train["text"].str.lower()
test["text"] = test["text"].str.lower()
train["text"].apply(lambda x: re.sub(r"[^a-z\s$!]", "", x))
test["text"].apply(lambda x: re.sub(r"[^a-z\s$!]", "", x))

train["text"] = train["text"].apply(word_tokenize)
test["text"] = test["text"].apply(word_tokenize)

In [17]:
from nltk.corpus import stopwords

# Define a set of English stop words and remove them from the tokens
stop_words = set(stopwords.words("english"))
train["text"] = train["text"].apply(lambda x: [word for word in x if word not in stop_words])
test["text"] = test["text"].apply(lambda x: [word for word in x if word not in stop_words])

In [20]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
train["text"] = train["text"].apply(lambda x: [stemmer.stem(word) for word in x])
test["text"] = test["text"].apply(lambda x: [stemmer.stem(word) for word in x])

In [21]:
train["text"] = train["text"].apply(lambda x: " ".join(x))
test["text"] = test["text"].apply(lambda x: " ".join(x))

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer with bigrams, min_df, and max_df to focus on relevant terms
vectorizer = CountVectorizer(min_df=1, max_df=0.9, ngram_range=(1, 2))

# Fit and transform the message column
X = vectorizer.fit_transform(train["text"])

# Labels (target variable)
y = train["label"]

In [23]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Build the pipeline by combining vectorization and classification
pipeline = Pipeline([
    ("vectorizer", vectorizer),
    ("classifier", MultinomialNB())
])

In [25]:
param_grid = {
    "classifier__alpha": [0.01, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75, 1.0]
}

# Perform the grid search with 5-fold cross-validation and the F1-score as metric
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="f1"
)

# Fit the grid search on the full dataset
grid_search.fit(train["text"], y)

# Extract the best model identified by the grid search
best_model = grid_search.best_estimator_
print("Best model parameters:", grid_search.best_params_)

Best model parameters: {'classifier__alpha': 1.0}


## evaluation

In [27]:
X_new = best_model.named_steps["vectorizer"].transform(test["text"])

In [28]:
predictions = best_model.named_steps["classifier"].predict(X_new)
prediction_probabilities = best_model.named_steps["classifier"].predict_proba(X_new)

In [32]:
from sklearn.metrics import accuracy_score
accuracy_score(test.label, predictions)


0.849

## Saving the model

In [33]:
import joblib

# Save the trained model to a file for future use
model_filename = 'skills_assessment.joblib'
joblib.dump(best_model, model_filename)

print(f"Model saved to {model_filename}")

Model saved to skills_assessment.joblib
