In [0]:
import re
import os
import nltk
import mlflow
import pandas as pd

from utils.data import retrieve_data
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources this notebook relies on: 'stopwords' backs the
# stopwords.words('english') lookup in preprocess_text; 'punkt_tab' is
# presumably the tokenizer data needed by word_tokenize (confirm against the
# installed NLTK version).
for resource in ('stopwords', 'punkt_tab'):
    nltk.download(resource)

# Path segment appended after the run id when building the model location.
ARTIFACTS_PATH = "artifacts"

In [0]:
# Disabled parameter lookup: this cell is a bare string literal, so none of
# the dbutils.widgets calls inside it execute. The same three names
# (run_id, data_name, data_version) are assigned literal values in the
# following cell instead.
"""
run_id = dbutils.widgets.get("run_id")
data_name = dbutils.widgets.get("data_name")
data_version = dbutils.widgets.get("data_version")
"""

In [0]:
# Dataset coordinates passed to retrieve_data(name=..., version=...).
data_name = "sentiment_analysis"
data_version = "1.0.0"

# Run id used to build the model location and the output file name.
run_id = "7d5de6f35edb47a9985241296d448baf"

In [0]:
# Load the dataset selected by data_name / data_version through the
# project helper imported from utils.data.
df = retrieve_data(
    name=data_name,
    version=data_version,
)

# Lazily-populated cache of the English stop-word set (see _english_stop_words).
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def preprocess_text(text):
    """Normalize one sentence for TF-IDF vectorization.

    Steps: lowercase, strip every character that is not an ASCII letter or
    whitespace, tokenize with ``word_tokenize``, drop English stop words,
    and re-join the surviving tokens with single spaces.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        The cleaned sentence; an empty string if no token survives.
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    # The stop-word set is cached: this function is applied row by row, and
    # rebuilding the set from the corpus on every call is pure overhead.
    stop_words = _english_stop_words()
    return ' '.join(token for token in tokens if token not in stop_words)

# Drop rows with missing values and the 'neutral' class. The explicit .copy()
# makes the filtered result an independent frame, so the column assignments
# below do not write onto a slice (pandas SettingWithCopyWarning).
df = df.dropna()
df = df[df['Sentiment'] != 'neutral'].copy()
df['preprocessed_text'] = df['Sentence'].apply(preprocess_text)

# Integer-encode the sentiment labels.
enc = LabelEncoder()
df['encoded_sentiment'] = enc.fit_transform(df['Sentiment'])

# Rows flagged as holdout are the ones scored later; copy them because a
# prediction column is added to this frame in the inference cell.
holdout = df[df['split'] == 'holdout'].copy()

# NOTE(review): the vectorizer is re-fitted here on every remaining row
# (holdout included). The loaded model presumably expects the exact TF-IDF
# vocabulary it was trained with -- confirm this refit reproduces it, or load
# the fitted vectorizer from the training run instead.
tfidf = TfidfVectorizer(max_features=100).fit(df['preprocessed_text'])

In [0]:
# Build the MLflow model URI explicitly. os.path.join is a filesystem helper
# and only yields the required 'runs:/<run_id>/<path>' form on POSIX systems.
logged_model = f'runs:/{run_id}/{ARTIFACTS_PATH}'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Predict on the TF-IDF features of the holdout sentences. assign() returns a
# new frame, so the prediction column is never written onto a filtered slice
# of df; holdout is rebound so later code still sees the 'prediction' column.
holdout = holdout.assign(
    prediction=loaded_model.predict(
        tfidf.transform(holdout['preprocessed_text'])))

# Persist the scored holdout rows, keyed by run id and dataset name/version.
output_path = f'/Workspace/inference/{run_id}_{data_name}_{data_version}.csv'
holdout.to_csv(output_path, index=False)