In [1]:
import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sentence_transformers import SentenceTransformer

from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
data=pd.read_csv('assignment_B.csv')

In [23]:
# combine zp, food,to miscellaneous tag
data['tags'] = data['tags'].replace(['zp', 'food'], 'miscellaneous')


In [24]:
#distribution of classes
data["tags"].value_counts()

work             382
emotional        347
money             99
medical           99
family            64
miscellaneous      9
Name: tags, dtype: int64

In [25]:
#pre-process sentences
def pre_process(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    words = nltk.word_tokenize(text)
    words = [word for word in words if word not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

data['proc_text'] = data['text'].map(pre_process)


In [26]:
# Remove any rows with missing data
data = data.dropna()

In [27]:
#Embed text into features
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings_text = model.encode(data['proc_text'])

data['embeddings_text'] = list(embeddings_text)


In [67]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
data['encoded_tags']=label_encoder.fit_transform(data['tags'])


In [69]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X = np.stack(data['embeddings_text'])
y = data['encoded_tags'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [71]:
smote = SMOTE(random_state=50)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train.reshape(-1, X_train.shape[-1]), y_train)

# Scale the input features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)


In [114]:
from sklearn.svm import SVC

# Create an SVM model with a specified kernel (e.g., 'linear', 'poly', 'rbf', or 'sigmoid')
clf = SVC(kernel='rbf',degree=2, class_weight='balanced')

# Fit the model to the training data
clf.fit(X_train_scaled, y_train_balanced)

In [116]:
y_pred = clf.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.72
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.78      0.75        69
           1       0.33      0.31      0.32        13
           2       0.64      0.45      0.53        20
           3       0.00      0.00      0.00         2
           4       0.68      0.65      0.67        20
           5       0.80      0.84      0.82        76

    accuracy                           0.72       200
   macro avg       0.53      0.51      0.51       200
weighted avg       0.71      0.72      0.71       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [131]:
# Assign weights based on the inverse of the class frequencies
class_weights = sum(y_train) / (len(set(y_train)) * np.bincount(y_train))
sample_weights = class_weights[y_train]

In [172]:
from sklearn.svm import LinearSVC

clf = LinearSVC(multi_class='ovr', C= 0.1,loss= 'squared_hinge',class_weight = 'balanced', fit_intercept= False)  # or multi_class='ovo'
clf.fit(X_train, y_train)

In [173]:
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.68
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.67      0.71        69
           1       0.22      0.31      0.26        13
           2       0.52      0.60      0.56        20
           3       0.50      0.50      0.50         2
           4       0.58      0.70      0.64        20
           5       0.81      0.78      0.79        76

    accuracy                           0.68       200
   macro avg       0.57      0.59      0.58       200
weighted avg       0.70      0.68      0.69       200



In [162]:

from sklearn.model_selection import GridSearchCV

# Create a LinearSVC model
svc = LinearSVC(multi_class='ovr')

# Define the hyperparameter search space
param_grid = {
    'C': np.logspace(-3, 3, 7),
    'fit_intercept': [True, False],
    'class_weight': [None, 'balanced'],
    'loss': ['hinge', 'squared_hinge'],
}

# Create a GridSearchCV object
grid_search = GridSearchCV(svc, param_grid, scoring='f1_weighted', cv=5, verbose=1, n_jobs=-1)

# Perform hyperparameter search and fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the model with the best hyperparameters
best_svc = grid_search.best_estimator_
y_pred = best_svc.predict(X_test)
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", acc)
print("Classification Report:\n", report)

Fitting 5 folds for each of 56 candidates, totalling 280 fits
Best Parameters: {'C': 0.1, 'class_weight': 'balanced', 'fit_intercept': False, 'loss': 'squared_hinge'}
Best Score: 0.7803669494568232
Accuracy: 0.68
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.67      0.71        69
           1       0.22      0.31      0.26        13
           2       0.52      0.60      0.56        20
           3       0.50      0.50      0.50         2
           4       0.58      0.70      0.64        20
           5       0.81      0.78      0.79        76

    accuracy                           0.68       200
   macro avg       0.57      0.59      0.58       200
weighted avg       0.70      0.68      0.69       200



In [158]:
from sklearn.metrics import SCORERS

available_scorers = list(SCORERS.keys())
available_scorers

['explained_variance',
 'r2',
 'max_error',
 'matthews_corrcoef',
 'neg_median_absolute_error',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_root_mean_squared_error',
 'neg_mean_poisson_deviance',
 'neg_mean_gamma_deviance',
 'accuracy',
 'top_k_accuracy',
 'roc_auc',
 'roc_auc_ovr',
 'roc_auc_ovo',
 'roc_auc_ovr_weighted',
 'roc_auc_ovo_weighted',
 'balanced_accuracy',
 'average_precision',
 'neg_log_loss',
 'neg_brier_score',
 'adjusted_rand_score',
 'rand_score',
 'homogeneity_score',
 'completeness_score',
 'v_measure_score',
 'mutual_info_score',
 'adjusted_mutual_info_score',
 'normalized_mutual_info_score',
 'fowlkes_mallows_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'jaccard',
 'jacca

In [145]:
import xgboost as xgb

# Create an XGBoost classifier
clf = xgb.XGBClassifier(objective="multi:softmax", num_class=6) 
# Train the classifier
clf.fit(X_train, y_train, sample_weight=sample_weights)

# Predict labels for the test set
y_pred = clf.predict(X_test)

In [146]:
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.675
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.70      0.70        69
           1       0.25      0.23      0.24        13
           2       0.58      0.55      0.56        20
           3       0.00      0.00      0.00         2
           4       0.70      0.70      0.70        20
           5       0.74      0.78      0.76        76

    accuracy                           0.68       200
   macro avg       0.49      0.49      0.49       200
weighted avg       0.66      0.68      0.67       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [149]:
def predict_topic(text):
    preprocessed_text = pre_process(text)
    X_new = model.encode(preprocessed_text)
    X_new_scaled = scaler.transform([X_new])
    return clf.predict([X_new])

input_text = "financial loss, unable to survive"
predicted_label = predict_topic(input_text)
print("Predicted Label:", label_encoder.inverse_transform(predicted_label))

input_text = "hectic work unbearable stress"
predicted_label = predict_topic(input_text)
print("Predicted Label:", label_encoder.inverse_transform(predicted_label))

Predicted Label: ['money']
Predicted Label: ['medical']
