In [1]:
# mlflow ui --backend-store-uri 'file:/Users/kiyas/Library/Application Support/zenml/local_stores/b043a600-6573-4111-9f6d-5ec1cd2a0363/mlruns'
import pandas as pd
import sklearn
import numpy as np
import warnings
from sklearn.exceptions import InconsistentVersionWarning

# Silence scikit-learn's InconsistentVersionWarning for the rest of the session.
# NOTE(review): scikit-learn emits this warning when an estimator is unpickled
# under a different library version than the one that saved it, so hiding it can
# mask a genuine version mismatch with the pickled pipeline loaded below --
# confirm the installed version matches the one used for training.
warnings.filterwarnings("ignore", category=InconsistentVersionWarning)

In [2]:
import pickle

# Location of the pickled pipeline inside the local ZenML store.
MODEL_ARTIFACT_PATH = '/Users/kiyas/Library/Application Support/zenml/local_stores/b043a600-6573-4111-9f6d-5ec1cd2a0363/model_building_step/sklearn_pipeline/060e62c2-9c80-4aab-a673-a85d868a8a15/b6b24df6/artifact.pkl'

# NOTE: pickle.load can execute arbitrary code embedded in the file, so this
# must only ever be pointed at an artifact from a trusted source.
with open(MODEL_ARTIFACT_PATH, 'rb') as artifact_file:
    pipeline = pickle.load(artifact_file)

# Plain-text summary of the pipeline steps and their hyper-parameters.
print(pipeline)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 MultiOutputClassifier(estimator=LGBMClassifier(learning_rate=0.08,
                                                                n_estimators=350,
                                                                num_leaves=35,
                                                                verbose=-1)))])


In [3]:
# Build a table of the N most important TF-IDF terms for each category.
N = 10  # number of top-ranked features kept per category
vectorizer = pipeline.named_steps['tfidf']
feature_names = vectorizer.get_feature_names_out()

# Category labels, paired by position with the per-output estimators below.
# NOTE(review): this assumes the list is in the same order as the target
# columns the classifier was fitted on -- confirm against the training step.
categories = ['crime, law and justice', 'arts, culture, entertainment and media',
       'economy, business and finance',
       'disaster, accident and emergency incident', 'environment',
       'education', 'health', 'human interest', 'lifestyle and leisure',
       'politics', 'labour', 'religion and belief',
       'science and technology', 'society', 'sport',
       'conflict, war and peace', 'weather']

# MultiOutputClassifier keeps one fitted estimator per output column.
estimators = pipeline.named_steps['clf'].estimators_

# Fail loudly on a count mismatch: indexing by position would otherwise either
# raise IndexError part-way through the loop (too many labels) or silently
# ignore the trailing estimators (too few labels).
if len(estimators) != len(categories):
    raise ValueError(
        f"Expected {len(categories)} per-category estimators, "
        f"but the pipeline contains {len(estimators)}"
    )

feature_importance_dict = {}

for category, lgbm_model in zip(categories, estimators):
    importance = lgbm_model.feature_importances_
    # Negating the scores makes argsort return indices in descending order.
    top_n_idx = np.argsort(-importance)[:N]
    feature_importance_dict[category] = [feature_names[idx] for idx in top_n_idx]

# One column per category, one row per rank; rows are labelled "1", "2", ...
df = pd.DataFrame(feature_importance_dict)
df.index = [f"{rank + 1}" for rank in range(len(df))]

In [5]:
# Display the ranking table: one column per category, one row per rank
# (row labels "1", "2", ... where "1" is the most important feature).
df

Unnamed: 0,"crime, law and justice","arts, culture, entertainment and media","economy, business and finance","disaster, accident and emergency incident",environment,education,health,human interest,lifestyle and leisure,politics,labour,religion and belief,science and technology,society,sport,"conflict, war and peace",weather
1,law,entertain,compani,fire,climat,educ,health,plant,game,polit,job,religi,scienc,immigr,sport,protest,weather
2,polic,cultur,economi,emerg,environment,school,hospit,ceremoni,garden,trump,retir,church,research,fan,stadium,war,storm
3,murder,tradit,market,crash,pollut,student,diseas,award,exercis,govern,employe,muslim,clinic,school,gym,coup,flood
4,sentenc,movi,econom,incid,spill,parent,medic,birthday,tattoo,polici,work,mosqu,math,peopl,footbal,terror,rain
5,court,th,custom,disast,environ,univers,treatment,celebr,workout,elect,labour,christian,said,wed,player,terrorist,met
6,justic,news,product,accid,forest,learn,patient,dog,said,religi,union,educ,use,trump,athlet,attack,forecast
7,investig,art,stock,also,wildlif,build,nhs,anniversari,fit,right,wage,religion,space,chariti,leagu,syrian,wind
8,crime,museum,growth,wildfir,insect,cours,virus,day,gym,googl,worker,jewish,scientif,addict,muscl,militari,across
9,offic,film,ceo,damag,speci,teacher,care,pet,you,worker,employ,pope,educ,societi,bodybuild,unrest,temperatur
10,innoc,game,show,road,emiss,said,healthcar,garden,classic,vote,unemploy,islam,test,refuge,championship,peac,warn


In [6]:
# Write the ranking table to a CSV file in the current working directory.
# NOTE(review): index=False omits the row labels ("1", "2", ...) that carry the
# rank, so in the saved file the rank is implied only by row order.
df.to_csv('model_feature_importance.csv', index=False)

In [None]:
import joblib

# Destination for the re-serialised pipeline (relative to the working directory).
MODEL_OUTPUT_PATH = "model.joblib"

# Re-save the loaded pipeline with compression level 3 to get a smaller file.
joblib.dump(pipeline, MODEL_OUTPUT_PATH, compress=3)

['model.joblib']