In [0]:
%load_ext autoreload
%autoreload 2
# Enables autoreload. Learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run %autoreload 0

In [0]:
import os
import sys

# Put the parent of the current working directory on the import path
# (presumably so the local `utils` package imported below resolves — confirm
# against the repo layout). The membership check keeps the cell idempotent:
# re-running it does not append duplicate entries to sys.path.
_parent_dir = os.path.dirname(os.getcwd())
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)


In [0]:
# import required libraries
import re
import nltk
from ast import literal_eval
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# import required local utils
from utils.models import ModelPipeline
from utils.data import DataPipeline, retrieve_data

# required downloads: NLTK resources fetched before the text preprocessing cell runs.
# 'stopwords' backs stopwords.words('english'); 'punkt_tab' is presumably the
# tokenizer data needed by word_tokenize — TODO confirm for the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt_tab')

In [0]:
# Read the notebook widgets that configure data loading and model construction.
_widget = dbutils.widgets.get

# Keyword arguments later unpacked into retrieve_data().
data_params = {
    'name': _widget('data_name'),
    'version': _widget('data_version'),
}

# Keyword arguments later unpacked into ModelPipeline(). The 'model_params'
# widget value is text and is parsed as a Python literal with literal_eval.
_experiment_name = _widget('model_experiment_name')
_model_type = _widget('model_type')
_raw_model_params = _widget('model_params')
model_params = {
    'model_experiment_name': _experiment_name,
    'model_type': _model_type,
    'model_params': literal_eval(_raw_model_params),
}

In [0]:
# Load the dataset selected by the 'data_name' / 'data_version' widgets.
# NOTE(review): later cells index df with 'Sentence', 'Sentiment' and 'split',
# so retrieve_data presumably returns a pandas DataFrame with those columns — confirm.
df = retrieve_data(**data_params)

In [0]:
# Built once rather than on every call: preprocess_text is applied to each row,
# and the stop-word list does not change between rows.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise a sentence before vectorisation.

    Steps, in order: lower-case the text, strip every character that is not
    an ASCII letter or whitespace, tokenise with NLTK's ``word_tokenize``,
    and drop English stop words.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        The remaining tokens joined by single spaces (an empty string when
        nothing survives the filtering).
    """
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return ' '.join(
        token
        for token in word_tokenize(text)
        if token not in _ENGLISH_STOP_WORDS
    )

# Remove rows with missing values, then discard the 'neutral' class.
# The mask is built from df itself (not from another object) so it is
# index-aligned with the frame being filtered. .copy() yields an independent
# frame, so the column assignments below do not trigger pandas'
# SettingWithCopyWarning.
df = df.dropna()
df = df[df['Sentiment'] != 'neutral'].copy()

# Cleaned text used for feature extraction.
df['preprocessed_text'] = df['Sentence'].apply(preprocess_text)

# Integer-encode the remaining sentiment labels; `enc` is kept so that
# predictions can be mapped back with enc.inverse_transform.
enc = LabelEncoder()
df['encoded_sentiment'] = enc.fit_transform(df['Sentiment'])

df

In [0]:
# Learn the vocabulary and IDF weights from the same column that is passed to
# transform() at training time ('preprocessed_text'); fitting on the raw
# 'Sentence' column would select the top-100 terms from text that still
# contains the stop words removed by preprocessing. Only rows whose 'split'
# is 'train' are used, so no test-set statistics leak into the features.
tfidf_vectorizer = TfidfVectorizer(max_features=100)
tfidf_vectorizer.fit(df.loc[df['split'] == 'train', 'preprocessed_text'])

In [0]:

# Partition the frame into its two subsets using the 'split' column.
_is_train = df['split'].eq('train')
_is_test = df['split'].eq('test')
train = df.loc[_is_train]
test = df.loc[_is_test]

In [0]:
# Build the model wrapper from the widget-supplied configuration; model_params
# carries the keys 'model_experiment_name', 'model_type' and 'model_params'.
model = ModelPipeline(**model_params)

In [0]:
# Vectorise both subsets with the fitted TF-IDF model, then pass features and
# encoded labels to the pipeline: training pair first, test pair second.
X_train = tfidf_vectorizer.transform(train['preprocessed_text'])
y_train = train['encoded_sentiment']
X_test = tfidf_vectorizer.transform(test['preprocessed_text'])
y_test = test['encoded_sentiment']

model.train(X_train, y_train, X_test, y_test)