In [1]:
import os
import zipfile
from abc import ABC, abstractmethod
from src.ingest_data import *
import pandas as pd
import logging
from src.filter_text import *
from src.feature_engineering import *
from src.data_splitter import *

In [2]:
file_path = "data/archive.zip"

# Determine the file extension
file_extension = os.path.splitext(file_path)[1]

# Get the appropriate DataIngestor
data_ingestor = DataIngestorFactory.get_data_ingestor(file_extension)

# Ingest the data and load it into a DataFrame
df = data_ingestor.ingest(file_path)

In [3]:
text_filtering_handler = FilterTextHandler(FixDropTextStrategy(length=25))
df_cleaned = text_filtering_handler.handle_text_filtering(df)

2025-02-19 16:38:55,679 - INFO - Executing text filtering handling strategy.
2025-02-19 16:38:55,680 - INFO - Dropping news with word count (length) <= 25
2025-02-19 16:38:56,144 - INFO - News Filtered by length >= 25.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.drop(columns=['content_length'], inplace=True)


In [4]:
basic_preprocess = FeatureEngineer(BasicPreprocessText(feature='content'))
df = basic_preprocess.apply_feature_engineering(df_cleaned)

stopword_remover = FeatureEngineer(RemoveStopwords(feature='content'))
df = stopword_remover.apply_feature_engineering(df)

text_stemming = FeatureEngineer(TextStemming(feature='content'))
df = text_stemming.apply_feature_engineering(df)

2025-02-19 16:38:56,866 - INFO - Applying text preprocessing (basic cleaning) to text:
2025-02-19 16:39:05,695 - INFO - Basic Preprocessing completed.
2025-02-19 16:39:05,700 - INFO - Applying stopword removing to text:
2025-02-19 16:39:14,852 - INFO - Stopword Removing completed.
2025-02-19 16:39:14,905 - INFO - Applying Stemming to text:
2025-02-19 16:39:51,780 - INFO - Text Stemming completed.


In [5]:
data_splitter = DataSplitter(SimpleTrainTestSplitStrategy(test_size=0.2, random_state=42))
X_train, X_test, y_train, y_test = data_splitter.split(df, target_column=df.columns[17:])

2025-02-19 16:39:51,826 - INFO - Splitting data using the selected strategy.
2025-02-19 16:39:51,830 - INFO - Performing simple train-test split.
2025-02-19 16:39:51,868 - INFO - Train-test split completed.


## OneVsRestClassifier

In [None]:
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_curve, auc, hamming_loss
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.calibration import CalibratedClassifierCV

In [45]:
def feature_importance(pipeline):
    '''
    Extract feature importances from pipeline. 
    Since I am using CalibratedClassifierCV I will average the coefficients over calibrated classifiers.
    
    https://www.kaggle.com/code/kobakhit/eda-and-multi-label-classification-for-arxiv#Preprocess-data
    '''
    # average coefficients over all calibrated classifiers
    coef_avg = 0
    classifiers = pipeline[1].estimators_[0].calibrated_classifiers_
    for i in classifiers:
        coef_avg = coef_avg + i.estimator.coef_
    coef_avg  = (coef_avg/len(classifiers)).tolist()[0]
    # get feature names from tf-idf vectorizer
    features = pipeline[0].get_feature_names_out()
    # get 10 most important features
    top_f = pd.DataFrame(list(zip(features,coef_avg)), columns = ['token','coef']) \
        .nlargest(10,'coef').to_dict(orient = 'records')
    return top_f

In [None]:
classifier = CalibratedClassifierCV(LinearSVC()) 
categories = y_train.columns

# for each category train the model and get accuracy, auc
models = {}
features = {}
preds = {}
for category in categories:
    # give pipelines unique names. important!  
    SVC_pipeline = Pipeline([
                (f'tfidf_{category}', TfidfVectorizer()),
                (f'clf_{category}', OneVsRestClassifier(classifier, n_jobs=1)),
            ])
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    SVC_pipeline.fit(X_train['content'], y_train[category])
    models[category] = SVC_pipeline
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(X_test['content'])
    preds[category] = prediction
    accuracy = accuracy_score(y_test[category], prediction)
    # compute auc
    probas_ = SVC_pipeline.predict_proba(X_test['content'])
    fpr, tpr, thresholds = roc_curve(y_test[category], probas_[:, 1])
    roc_auc = auc(fpr, tpr)
    print("Accuracy : {} . Area under the ROC curve : {}".format(round(accuracy,4), round(roc_auc,4)))
    print()
    # get most predictive features
    features[category] = feature_importance(SVC_pipeline)

... Processing crime, law and justice
Accuracy : 0.9756 . Area under the ROC curve : 0.9792

... Processing arts, culture, entertainment and media
Accuracy : 0.9618 . Area under the ROC curve : 0.9825

... Processing economy, business and finance
Accuracy : 0.9586 . Area under the ROC curve : 0.972

... Processing disaster, accident and emergency incident
Accuracy : 0.9847 . Area under the ROC curve : 0.9878

... Processing environment
Accuracy : 0.9866 . Area under the ROC curve : 0.9921

... Processing education
Accuracy : 0.9847 . Area under the ROC curve : 0.9955

... Processing health
Accuracy : 0.9695 . Area under the ROC curve : 0.9898

... Processing human interest
Accuracy : 0.9546 . Area under the ROC curve : 0.9688

... Processing lifestyle and leisure
Accuracy : 0.988 . Area under the ROC curve : 0.9918

... Processing politics
Accuracy : 0.932 . Area under the ROC curve : 0.9659

... Processing labour
Accuracy : 0.9851 . Area under the ROC curve : 0.9929

... Processing re

In [47]:
# 10 most important features by category
features_df = pd.DataFrame(features)
features_df.apply(lambda x: [d['token'] for d in x], axis=0)

Unnamed: 0,"crime, law and justice","arts, culture, entertainment and media","economy, business and finance","disaster, accident and emergency incident",environment,education,health,human interest,lifestyle and leisure,politics,labour,religion and belief,science and technology,society,sport,"conflict, war and peace",weather
0,law,entertain,compani,fire,environment,educ,health,ceremoni,tattoo,polit,retir,religi,scienc,immigr,sport,protest,storm
1,sentenc,cultur,economi,emerg,climat,school,hospit,award,workout,trump,job,church,research,chariti,stadium,war,weather
2,murder,film,market,crash,pollut,student,diseas,plant,exercis,govern,employe,mosqu,math,wed,footbal,militari,cyclon
3,polic,museum,econom,disast,insect,learn,treatment,birthday,garden,amnesti,worker,muslim,clinic,addict,athlet,coup,temperatur
4,court,art,custom,incid,spill,teacher,nhs,dog,game,googl,union,christian,test,fan,bodybuild,terrorist,snow
5,attorney,tradit,product,accid,speci,cours,medic,celebr,fit,elect,labour,belief,space,refuge,championship,syrian,warn
6,investig,movi,busi,evacu,wildlif,parent,care,medal,gym,polici,unemploy,islam,scientist,social,muscl,peac,flood
7,fbi,festiv,invest,road,environ,univers,patient,pet,you,right,employ,pope,use,societi,gym,attack,rain
8,convict,fashion,stock,damag,carbon,teach,healthcar,anniversari,board,minist,wage,religion,scientif,discrimin,player,unrest,typhoon
9,arrest,media,brand,wildfir,forest,colleg,nurs,kitten,classic,ban,work,franci,engin,racism,leagu,syria,met


In [49]:
def predict_tags(X, labels = None):
    '''
    Predict tags for a given abstract.
    
    Args:
      - X (list): an iterable with text.
      - labels (pandas.Dataframe): label indicators for an abstract
    '''
    preds = []
    if type(X) is str: # convert into iterable if string
        X = [X]
    
    # get prediction from each model
    for c in models.keys():
        preds.append(models[c].predict(X))
    
    # print original labels if given
    if labels is not None:
        assert len(X) == 1, 'Only one extract at a time.'
        predicted_tags = [k for k,v in zip(list(models.keys()),preds) if v[0] > 0]
        original_tags = list(labels.index[labels.map(lambda x: x>0)])
        print('Original Tags: {}'.format(str(original_tags)))
        print("Predicted Tags: {}".format(str(predicted_tags)))
        
    return preds

In [53]:
# get all predictions
y_pred = np.array(predict_tags(X_test['content'])).T

# get true labels in the same order
y_true = y_test[list(models.keys())].to_numpy()
y_true

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
"""
Hamming Loss is calculated by taking a fraction of the wrong prediction with the total number of labels. 
Because Hamming Loss is a loss function, the lower the score is, 
the better (0 indicates no wrong prediction and 1 indicates all the prediction is wrong)x
"""

In [54]:
from sklearn.metrics import hamming_loss
print('Hamming Loss: ', round(hamming_loss(y_test, y_pred),2))

Hamming Loss:  0.03


## MultiOutputClassifier

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier  

from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score, average_precision_score

In [None]:
print('\nMultiOutputClassifier with Logistic Regression')
pipeline = Pipeline([
            (f'tfidf', TfidfVectorizer()),
            (f'clf', MultiOutputClassifier(LogisticRegression())),
        ])
pipeline.fit(X_train['content'], y_train)
prediction = pipeline.predict(X_test['content'])


accuracy = accuracy_score(y_test, prediction)
print("Accuracy : {}".format(round(accuracy,4)))

print('AUC score: {}'.format(roc_auc_score(y_test, prediction)))
print('Hamming Loss: ', round(hamming_loss(y_test, prediction),2))


MultiOutputClassifier with Logistic Regression
Accuracy : 0.4736
AUC score: 0.7787947142185324
Hamming Loss:  0.05


In [None]:
# 20 mins
print('\nMultiOutputClassifier with Gradient Boosting')
pipeline = Pipeline([
            (f'tfidf', TfidfVectorizer()),
            (f'clf', MultiOutputClassifier(GradientBoostingClassifier())),
        ])
pipeline.fit(X_train['content'], y_train)
prediction = pipeline.predict(X_test['content'])


accuracy = accuracy_score(y_test, prediction)
print("Accuracy : {}".format(round(accuracy,4)))

print('AUC score: {}'.format(roc_auc_score(y_test, prediction)))



MultiOutputClassifier with Gradient Boosting
Accuracy : 0.4533
AUC score: 0.8113809975400633


In [None]:
print('\nMultiOutputClassifier with LGBM Classifier')
pipeline = Pipeline([
            (f'tfidf', TfidfVectorizer()),
            (f'clf', MultiOutputClassifier(LGBMClassifier(learning_rate = 0.08, 
                                                          num_leaves = 35, 
                                                          n_estimators = 350, 
                                                          verbose=-1))),
        ])
pipeline.fit(X_train['content'], y_train)
prediction = pipeline.predict(X_test['content'])

accuracy = accuracy_score(y_test, prediction)
print("Accuracy : {}".format(round(accuracy,4)))

print('AUC score: {}'.format(roc_auc_score(y_test, prediction)))
print('Hamming Loss: ', round(hamming_loss(y_test, prediction),2))


MultiOutputClassifier with LGBM Classifier
Accuracy : 0.562
AUC score: 0.867721217987693
Hamming Loss:  0.04


In [None]:
"""
MultiOutputClassifier with LGBM Classifier
Accuracy : 0.562
AUC score: 0.8678111608425622
Hamming Loss:  0.04
learning_rate = 0.08, 
num_leaves = 35, 
n_estimators = 350, 
"""

## Deep Learning

https://colab.research.google.com/drive/125Q856ee3fKIQcZe0awHFUmyBIMX3PzQ?usp=sharing