In [58]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [59]:
# Load the dataset
data = pd.read_csv('/content/oversampleddata.csv')


In [60]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9737 entries, 0 to 9736
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    9737 non-null   int64 
 1   Text Content  9737 non-null   object
 2   Code          9737 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.3+ KB


In [61]:
data.dropna(inplace=True)


In [62]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9737 entries, 0 to 9736
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    9737 non-null   int64 
 1   Text Content  9737 non-null   object
 2   Code          9737 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.3+ KB


In [63]:
import nltk


In [64]:
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [65]:
# Perform stemming
stemmer = PorterStemmer()
data['stemmed_text'] = data['Text Content'].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(str(x))]))

In [66]:
# Define the feature vector and the target variable
X = data['stemmed_text']
y = data['Code']

In [67]:
# Define the pipeline with TF-IDF and Logistic Regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('logreg', LogisticRegression(max_iter=1000, solver='lbfgs'))
])

In [68]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'tfidf__max_df': [0.8, 0.9, 1.0],
    'logreg__C': [0.1, 1, 10]
}

In [69]:
# Perform nested cross-validation with hyperparameter tuning
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline, param_grid, cv=outer_cv, scoring='accuracy')
grid_search.fit(X, y)

In [70]:
# Print the best hyperparameters
print("Best Hyperparameters:")
print(grid_search.best_params_)
print()

Best Hyperparameters:
{'logreg__C': 10, 'tfidf__max_df': 0.8}



In [71]:
# Print the classification report
print("Classification Report:")
cv_results = cross_val_score(grid_search.best_estimator_, X, y, cv=outer_cv, scoring='accuracy')
y_pred = grid_search.best_estimator_.predict(X)
print(classification_report(y, y_pred))
print()

Classification Report:
                                   precision    recall  f1-score   support

                  Action on Issue       1.00      1.00      1.00       749
                 Bug Reproduction       0.98      0.96      0.97       749
      Contribution and Commitment       0.98      1.00      0.99       749
               Expected Behaviour       0.96      0.99      0.98       749
    Investigation and Exploration       0.99      0.97      0.98       749
                       Motivation       1.00      0.99      0.99       749
           Observed Bug Behaviour       0.99      0.97      0.98       749
Potential New Issues and Requests       1.00      1.00      1.00       749
              Social Conversation       0.97      0.98      0.98       749
               Social Convesation       0.99      0.97      0.98       749
                    Task Progress       1.00      0.98      0.99       749
                            Usage       0.94      1.00      0.97       749
 

In [72]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [73]:
# Train the model on the training set
grid_search.best_estimator_.fit(X_train, y_train)

In [74]:
# Evaluate the model on the testing set
test_accuracy = grid_search.best_estimator_.score(X_test, y_test)
print("Test Data Accuracy:", test_accuracy)

Test Data Accuracy: 0.8475359342915811


In [75]:
from sklearn.metrics import classification_report

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model on the training set
grid_search.best_estimator_.fit(X_train, y_train)

# Predict the labels for the test set
y_pred_test = grid_search.best_estimator_.predict(X_test)

# Print the classification report on the test dataset
print("Classification Report (Test Dataset):")
print(classification_report(y_test, y_pred_test))


Classification Report (Test Dataset):
                                   precision    recall  f1-score   support

                  Action on Issue       1.00      1.00      1.00       146
                 Bug Reproduction       0.75      0.66      0.70       145
      Contribution and Commitment       0.92      0.80      0.86       173
               Expected Behaviour       0.88      0.99      0.93       155
    Investigation and Exploration       0.63      0.47      0.54       152
                       Motivation       0.75      0.93      0.83       142
           Observed Bug Behaviour       0.88      0.89      0.88       149
Potential New Issues and Requests       0.86      0.90      0.88       141
              Social Conversation       0.76      0.68      0.72       152
               Social Convesation       0.89      0.93      0.91       147
                    Task Progress       0.88      0.94      0.91       160
                            Usage       0.80      0.92      0