In [26]:
# Cell 1: Import Libraries
import numpy as np
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
# Fetch the NLTK 'stopwords' corpus that the text-cleaning cell reads via stopwords.words('english').
# The call's return value is the last expression of the cell, so it is shown as the cell output.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
# Cell 2: Load Dataset
# Read the requirements dataset from a CSV given by a relative path.
# Later cells read two columns from this frame: 'Requirement' (the text) and 'Type' (the label).
df = pd.read_csv('software_requirements_extended.csv')

In [28]:
# Cell 3: Text Cleaning
# English stopwords, held in a set so each membership test is a hash lookup
stop_words = set(stopwords.words('english'))

# Translation table that deletes every character listed in string.punctuation
_punct_table = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Return `text` with punctuation and English stopwords removed.

    Punctuation characters are deleted first; the result is split on
    whitespace and every word whose lowercase form is in `stop_words` is
    dropped. The surviving words keep their original casing and are joined
    with single spaces.

    Non-string values (for example the float NaN that pandas produces for an
    empty CSV cell) are returned as an empty string instead of raising
    TypeError when iterated.
    """
    if not isinstance(text, str):
        return ''
    words = text.translate(_punct_table).split()
    return ' '.join(word for word in words if word.lower() not in stop_words)


# Store the cleaned text alongside the raw 'Requirement' column
df['cleaned_requirement'] = df['Requirement'].apply(clean_text)


In [29]:

# Cell 4: Hold out 20% of the rows as a test set.
# The fixed random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_requirement'],
    df['Type'],
    test_size=0.2,
    random_state=45,
)
            


In [30]:
# Cell 05: Bag-of-words features
# Learn the vocabulary from the training text only and encode that text in the
# same pass. fit_transform yields the same matrix as fit() followed by
# transform() but tokenizes the training corpus once instead of twice, and
# matches how the TF-IDF cell fits its transformer.
bow_transformer = CountVectorizer()
X_train_bow = bow_transformer.fit_transform(X_train)

# Encode the test text with the vocabulary learned from the training text
X_test_bow = bow_transformer.transform(X_test)


In [31]:
# Cell 06: Convert the bag-of-words counts to a TF-IDF representation
tfidf_transformer = TfidfTransformer()
# Fit the transformer on the training matrix only, and transform that matrix in the same call
X_train_tfidf = tfidf_transformer.fit_transform(X_train_bow)
X_test_tfidf = tfidf_transformer.transform(X_test_bow)  # transform only (no refit); the prediction cell reads X_test_tfidf


In [32]:
# Cell 07: Train a multinomial Naive Bayes classifier on the TF-IDF training features.
# fit() is chained onto the constructor, so `model` is bound to the fitted estimator.
model = MultinomialNB().fit(X_train_tfidf, y_train)


In [33]:
# Cell 08: Evaluate the classifier on the held-out test set
predictions = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, predictions)

# Report the predicted labels, the true labels, and the fraction that match.
# NOTE(review): the recorded output contains both 'F' and 'FR' labels —
# presumably the same class spelled two ways; confirm against the dataset.
print("Predictions: ", predictions)
print("Actual: ", y_test.values)
print(f"Accuracy: {accuracy:.2f}")



Predictions:  ['FR' 'F' 'F' 'FR' 'F' 'FR' 'F' 'F' 'F' 'FR' 'FR' 'F' 'FR' 'F' 'F' 'FR'
 'FR' 'NFR' 'FR' 'F' 'FR' 'FR' 'F' 'FR' 'FR' 'F' 'FR' 'FR' 'F' 'F' 'F' 'F'
 'F' 'F' 'F' 'F' 'FR' 'PE' 'FR' 'FR' 'F' 'FR' 'F' 'FR' 'NFR' 'FR' 'F' 'FR'
 'F' 'F' 'FR' 'FR' 'FR' 'FR' 'F' 'F' 'F' 'FR' 'F' 'FR' 'F' 'FR' 'FR' 'F'
 'FR' 'FR' 'F' 'F' 'FR' 'F' 'FR' 'FR' 'F' 'FR' 'FR' 'FR' 'F' 'F' 'F' 'NFR'
 'FR' 'FR' 'F' 'FR' 'F' 'F' 'FR' 'F' 'F' 'F' 'F' 'FR' 'FR' 'FR' 'F' 'FR'
 'FR' 'F' 'FR' 'F' 'FR' 'F' 'F' 'FR' 'FR' 'F' 'F' 'FR' 'FR' 'FR' 'FR' 'FR'
 'FR' 'FR' 'F' 'FR' 'FR' 'FR' 'FR' 'F' 'F' 'F' 'FR' 'F' 'F' 'F' 'F' 'F'
 'FR' 'FR' 'F' 'F' 'F' 'F' 'FR' 'F' 'F' 'F' 'F' 'F' 'FR' 'FR' 'FR' 'F'
 'FR' 'FR' 'F' 'NFR' 'FR' 'FR' 'FR' 'F' 'FR' 'F' 'US' 'F' 'F' 'FR' 'FR'
 'F' 'F' 'FR' 'F' 'F' 'FR' 'F' 'NFR' 'FR' 'F' 'F' 'F' 'FR' 'FR' 'F' 'F'
 'FR' 'FR' 'FR' 'FR' 'F' 'F' 'FR' 'F' 'FR' 'F' 'FR' 'FR' 'FR' 'F' 'F' 'FR'
 'F' 'NFR' 'FR' 'FR' 'F']
Actual:  ['FR' 'US' 'LF' 'FR' 'SC' 'FR' 'F' 'F' 'F' 'FR' 'F' 'F' 'FR' 'SE' 'LF'
