In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('eclipse_jdt.csv')
df=df[['Title','Description','Priority']]
df=df.dropna()
df['text']= df['Title']+''+df['Description']
df=df.drop(columns=['Title','Description'])

In [3]:
import textacy
import textacy.preprocessing as tprep

preproc = tprep.make_pipeline(
    tprep.replace.urls,
    tprep.remove.html_tags,
    tprep.normalize.hyphenated_words,
    tprep.normalize.quotation_marks,
    tprep.normalize.unicode,
    tprep.remove.accents,
    tprep.remove.punctuation,
    tprep.normalize.whitespace,
    tprep.replace.numbers
)

In [4]:
df['text']=df['text'].apply(preproc)
df=df[df['text'].str.len()>50]

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(df['text'],df['Priority'], test_size=0.2,random_state=42,stratify=df['Priority'])

### SVC Models

In [12]:
from sklearn.svm import LinearSVC
model1 = LinearSVC(random_state=0,tol=1e-5)
model1.fit(X_train_tf,Y_train)

In [14]:
Y_pred = model1.predict(X_test_tf)

In [13]:
from sklearn.metrics import accuracy_score
print('Accuracy score', accuracy_score(Y_test, Y_pred))

Accuracy score 0.8778998778998779


In [11]:
from sklearn.metrics import classification_report
print(classification_report(Y_test,Y_pred,zero_division=0.0))

              precision    recall  f1-score   support

          P1       0.57      0.05      0.10       224
          P2       0.35      0.03      0.06       608
          P3       0.88      0.99      0.94      7899
          P4       0.51      0.15      0.24       228
          P5       0.00      0.00      0.00        50

    accuracy                           0.88      9009
   macro avg       0.46      0.25      0.27      9009
weighted avg       0.83      0.88      0.83      9009



In [None]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(Y_test,Y_pred,zero_division=0))

## Pipeline : CountVectorizer+TfidfVectorizer+LinearSVC

In [None]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

clf_text = Pipeline([
    ('vect', CountVectorizer(min_df=10,ngram_range=(1,2),stop_words='english')),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('svc',LinearSVC(random_state=0,tol=1e-5,dual='auto'))
])
clf_text.named_steps

In [None]:
clf_text.fit(X_train,Y_train)

In [None]:
Y_pred=clf_text.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
print('Accuracy score', accuracy_score(Y_test, Y_pred))

In [None]:
clf_text.fit(X_train,Y_train).score(Y_test,Y_pred)

## Pipeline : TfidfVectorizer+LinearSVC

In [None]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

clf_text = Pipeline([
    ('tfidf', TfidfVectorizer(use_idf=True,min_df=5,ngram_range=(1,2),stop_words='english')),
    ('svc',LinearSVC(random_state=0,tol=1e-5,dual='auto'))
])
clf_text

In [None]:
clf_text.fit(X_train,Y_train)
Y_pred=clf_text.predict(X_test)
from sklearn.metrics import accuracy_score
print('Accuracy score', accuracy_score(Y_test, Y_pred))


In [None]:
from sklearn.metrics import classification_report
print(classification_report(Y_test,Y_pred,zero_division=0.0))

In [None]:
clf_text.fit(X_train,Y_train).score(Y_test,Y_pred)

In [None]:
count_vect