In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
df = pd.read_csv("../dataset/processed/tickets_cleaned.csv")


In [4]:

df.shape

(28587, 17)

In [5]:
df[['clean_text', 'type', 'priority']].head()

Unnamed: 0,clean_text,type,priority
0,sehr geehrtes supportteamnnich mchte einen gra...,Incident,high
1,dear customer support teamnni writing report s...,Incident,high
2,dear customer support teamnni hope message rea...,Request,medium
3,dear customer support teamnni hope message fin...,Request,low
4,dear support teamnni hope message reach well r...,Problem,medium


In [6]:
df = df[df['language'] == 'en']

In [7]:
X = df['clean_text']
y_cat = df['type']

X_train, X_test, y_train_cat, y_test_cat = train_test_split(
    X,
    y_cat,
    test_size=0.2,
    random_state=42,
    stratify=y_cat
)


In [8]:
tfidf_cat = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 3),
    min_df=3,
    max_df=0.9,
    sublinear_tf=True
)

X_train_tfidf = tfidf_cat.fit_transform(X_train)
X_test_tfidf = tfidf_cat.transform(X_test)


In [9]:
cat_model = LinearSVC(class_weight='balanced')
cat_model.fit(X_train_tfidf, y_train_cat)


0,1,2
,"penalty  penalty: {'l1', 'l2'}, default='l2' Specifies the norm used in the penalization. The 'l2' penalty is the standard used in SVC. The 'l1' leads to ``coef_`` vectors that are sparse.",'l2'
,"loss  loss: {'hinge', 'squared_hinge'}, default='squared_hinge' Specifies the loss function. 'hinge' is the standard SVM loss (used e.g. by the SVC class) while 'squared_hinge' is the square of the hinge loss. The combination of ``penalty='l1'`` and ``loss='hinge'`` is not supported.",'squared_hinge'
,"dual  dual: ""auto"" or bool, default=""auto"" Select the algorithm to either solve the dual or primal optimization problem. Prefer dual=False when n_samples > n_features. `dual=""auto""` will choose the value of the parameter automatically, based on the values of `n_samples`, `n_features`, `loss`, `multi_class` and `penalty`. If `n_samples` < `n_features` and optimizer supports chosen `loss`, `multi_class` and `penalty`, then dual will be set to True, otherwise it will be set to False. .. versionchanged:: 1.3  The `""auto""` option is added in version 1.3 and will be the default  in version 1.5.",'auto'
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"C  C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.",1.0
,"multi_class  multi_class: {'ovr', 'crammer_singer'}, default='ovr' Determines the multi-class strategy if `y` contains more than two classes. ``""ovr""`` trains n_classes one-vs-rest classifiers, while ``""crammer_singer""`` optimizes a joint objective over all classes. While `crammer_singer` is interesting from a theoretical perspective as it is consistent, it is seldom used in practice as it rarely leads to better accuracy and is more expensive to compute. If ``""crammer_singer""`` is chosen, the options loss, penalty and dual will be ignored.",'ovr'
,"fit_intercept  fit_intercept: bool, default=True Whether or not to fit an intercept. If set to True, the feature vector is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where 1 corresponds to the intercept. If set to False, no intercept will be used in calculations (i.e. data is expected to be already centered).",True
,"intercept_scaling  intercept_scaling: float, default=1.0 When `fit_intercept` is True, the instance vector x becomes ``[x_1, ..., x_n, intercept_scaling]``, i.e. a ""synthetic"" feature with a constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes intercept_scaling * synthetic feature weight. Note that liblinear internally penalizes the intercept, treating it like any other term in the feature vector. To reduce the impact of the regularization on the intercept, the `intercept_scaling` parameter can be set to a value greater than 1; the higher the value of `intercept_scaling`, the lower the impact of regularization on it. Then, the weights become `[w_x_1, ..., w_x_n, w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent the feature weights and the intercept weight is scaled by `intercept_scaling`. This scaling allows the intercept term to have a different regularization behavior compared to the other features.",1
,"class_weight  class_weight: dict or 'balanced', default=None Set the parameter C of class i to ``class_weight[i]*C`` for SVC. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.",'balanced'
,"verbose  verbose: int, default=0 Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in liblinear that, if enabled, may not work properly in a multithreaded context.",0


In [10]:
y_pred_cat = cat_model.predict(X_test_tfidf)
accuracy_score(y_test_cat, y_pred_cat)


0.8647490820073439

In [11]:
cm_cat = confusion_matrix(y_test_cat, y_pred_cat)
labels_cat = cat_model.classes_

cat_accuracy = {
    labels_cat[i]: cm_cat[i, i] / cm_cat[i].sum()
    for i in range(len(labels_cat))
}

pd.DataFrame.from_dict(
    cat_accuracy,
    orient="index",
    columns=["Category Accuracy"]
)


Unnamed: 0,Category Accuracy
Change,0.979472
Incident,0.840183
Problem,0.672059
Request,0.997856


In [12]:
print(classification_report(y_test_cat, y_pred_cat))


              precision    recall  f1-score   support

      Change       0.99      0.98      0.98       341
    Incident       0.83      0.84      0.84      1314
     Problem       0.69      0.67      0.68       680
     Request       0.99      1.00      0.99       933

    accuracy                           0.86      3268
   macro avg       0.87      0.87      0.87      3268
weighted avg       0.86      0.86      0.86      3268



In [13]:
from sklearn.model_selection import train_test_split

X = df['clean_text']
y_pr = df['priority']

X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X,
    y_pr,
    test_size=0.2,
    random_state=42,
    stratify=y_pr
)


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_pr = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1,2),
    min_df=2,
    max_df=0.9,
    sublinear_tf=True
)

X_train_p_tfidf = tfidf_pr.fit_transform(X_train_p)
X_test_p_tfidf = tfidf_pr.transform(X_test_p)


In [15]:
from sklearn.svm import LinearSVC

priority_model = LinearSVC(class_weight='balanced')
priority_model.fit(X_train_p_tfidf, y_train_p)


0,1,2
,"penalty  penalty: {'l1', 'l2'}, default='l2' Specifies the norm used in the penalization. The 'l2' penalty is the standard used in SVC. The 'l1' leads to ``coef_`` vectors that are sparse.",'l2'
,"loss  loss: {'hinge', 'squared_hinge'}, default='squared_hinge' Specifies the loss function. 'hinge' is the standard SVM loss (used e.g. by the SVC class) while 'squared_hinge' is the square of the hinge loss. The combination of ``penalty='l1'`` and ``loss='hinge'`` is not supported.",'squared_hinge'
,"dual  dual: ""auto"" or bool, default=""auto"" Select the algorithm to either solve the dual or primal optimization problem. Prefer dual=False when n_samples > n_features. `dual=""auto""` will choose the value of the parameter automatically, based on the values of `n_samples`, `n_features`, `loss`, `multi_class` and `penalty`. If `n_samples` < `n_features` and optimizer supports chosen `loss`, `multi_class` and `penalty`, then dual will be set to True, otherwise it will be set to False. .. versionchanged:: 1.3  The `""auto""` option is added in version 1.3 and will be the default  in version 1.5.",'auto'
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"C  C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.",1.0
,"multi_class  multi_class: {'ovr', 'crammer_singer'}, default='ovr' Determines the multi-class strategy if `y` contains more than two classes. ``""ovr""`` trains n_classes one-vs-rest classifiers, while ``""crammer_singer""`` optimizes a joint objective over all classes. While `crammer_singer` is interesting from a theoretical perspective as it is consistent, it is seldom used in practice as it rarely leads to better accuracy and is more expensive to compute. If ``""crammer_singer""`` is chosen, the options loss, penalty and dual will be ignored.",'ovr'
,"fit_intercept  fit_intercept: bool, default=True Whether or not to fit an intercept. If set to True, the feature vector is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where 1 corresponds to the intercept. If set to False, no intercept will be used in calculations (i.e. data is expected to be already centered).",True
,"intercept_scaling  intercept_scaling: float, default=1.0 When `fit_intercept` is True, the instance vector x becomes ``[x_1, ..., x_n, intercept_scaling]``, i.e. a ""synthetic"" feature with a constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes intercept_scaling * synthetic feature weight. Note that liblinear internally penalizes the intercept, treating it like any other term in the feature vector. To reduce the impact of the regularization on the intercept, the `intercept_scaling` parameter can be set to a value greater than 1; the higher the value of `intercept_scaling`, the lower the impact of regularization on it. Then, the weights become `[w_x_1, ..., w_x_n, w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent the feature weights and the intercept weight is scaled by `intercept_scaling`. This scaling allows the intercept term to have a different regularization behavior compared to the other features.",1
,"class_weight  class_weight: dict or 'balanced', default=None Set the parameter C of class i to ``class_weight[i]*C`` for SVC. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.",'balanced'
,"verbose  verbose: int, default=0 Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in liblinear that, if enabled, may not work properly in a multithreaded context.",0


In [16]:
from sklearn.metrics import accuracy_score, classification_report

y_pred_pr = priority_model.predict(X_test_p_tfidf)

print("Accuracy:", accuracy_score(y_test_p, y_pred_pr))
print(classification_report(y_test_p, y_pred_pr))


Accuracy: 0.6940024479804161
              precision    recall  f1-score   support

        high       0.72      0.73      0.73      1269
         low       0.63      0.61      0.62       675
      medium       0.70      0.70      0.70      1324

    accuracy                           0.69      3268
   macro avg       0.68      0.68      0.68      3268
weighted avg       0.69      0.69      0.69      3268



In [17]:
df['priority'].value_counts()


priority
medium    6618
high      6346
low       3374
Name: count, dtype: int64

In [18]:
df['priority'].unique()


<StringArray>
['high', 'medium', 'low']
Length: 3, dtype: str

In [19]:
accuracy_score(y_test_p, y_pred_pr)


0.6940024479804161

In [20]:
import joblib

# Save category model
joblib.dump(cat_model, "../models/category_model.pkl")
joblib.dump(tfidf_cat, "../models/tfidf_category.pkl")


joblib.dump(priority_model, "../models/priority_model.pkl")
joblib.dump(tfidf_pr, "../models/tfidf_priority.pkl")



['../models/tfidf_priority.pkl']

In [21]:
import os
os.listdir("../models")


['category_model.pkl',
 'priority_model.pkl',
 'tfidf_category.pkl',
 'tfidf_priority.pkl',
 'tfidf_vectorizer.pkl',
 'ticket_type_model.pkl']

In [22]:
import joblib
import os

# Create models folder inside backend if not exists
os.makedirs("../backend/models", exist_ok=True)

joblib.dump(category_model, "../backend/models/category_model.pkl")
joblib.dump(priority_model, "../backend/models/priority_model.pkl")
joblib.dump(tfidf_cat, "../backend/models/tfidf_category.pkl")
joblib.dump(tfidf_pr, "../backend/models/tfidf_priority.pkl")

print("Models saved successfully!")


NameError: name 'category_model' is not defined