In [0]:
# Load IPython's autoreload extension and switch it on (mode 2) so that edits
# to imported Python modules are picked up without restarting the session.
%load_ext autoreload
%autoreload 2
# Learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run: %autoreload 0

In [0]:
import os
import sys

# Make the parent of the current working directory importable (presumably
# where the local `utils` package lives - confirm against the repo layout).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
_parent_dir = os.path.dirname(os.getcwd())
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)


In [0]:
# import required libraries
import re
import nltk
from ast import literal_eval
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# import required local utils
from utils.models import ModelPipeline
from utils.data import DataPipeline, generate_data

# Fetch the NLTK resources this notebook relies on: the stop-word corpus read
# through `stopwords.words(...)`, and the 'punkt_tab' data (presumably what
# `word_tokenize` needs - confirm against the installed NLTK version).
nltk.download(info_or_id='stopwords')
nltk.download(info_or_id='punkt_tab')

In [0]:
def _parse_bool_widget(raw):
    """Convert a widget's text value into a real ``bool``.

    ``dbutils.widgets.get`` returns text, and every non-empty string
    (including ``'False'``) is truthy, so a boolean flag has to be parsed
    explicitly rather than passed through as-is.

    Args:
        raw: Text value of the widget, e.g. ``'True'`` or ``'false'``.

    Returns:
        ``True`` or ``False``.

    Raises:
        ValueError: If the text is not a recognised boolean spelling.
    """
    normalized = str(raw).strip().lower()
    if normalized in ('true', '1', 'yes'):
        return True
    if normalized in ('false', '0', 'no'):
        return False
    raise ValueError(f"Cannot interpret widget value {raw!r} as a boolean")


# Collect the data-pipeline parameters from the notebook widgets, converting
# each text value to the type the reference configuration uses:
# bool for 'holdout', float for the two sizes, and a Python literal
# (e.g. a list of column names) for 'stratify'.
data_params = {
    'name': dbutils.widgets.get('name'),
    'version': dbutils.widgets.get('version'),
    'file_path': dbutils.widgets.get('file_path'),
    'holdout': _parse_bool_widget(dbutils.widgets.get('holdout')),
    'test_size': float(dbutils.widgets.get('test_size')),
    'holdout_size': float(dbutils.widgets.get('holdout_size')),
    'stratify': literal_eval(dbutils.widgets.get('stratify'))
}

In [0]:
# Reference example of the `data_params` structure with literal values.
# This is a bare string literal, not an assignment, so it does not override
# the `data_params` dictionary built from the widgets.
"""
data_params = {
    'file_path': '/Workspace/data/data.csv',
    'name': 'sentiment_analysis',
    'version': '1.0.0',
    'holdout': True,
    'test_size': 0.2,
    'holdout_size': 0.5,
    'stratify': ['Sentiment']
}
"""

In [0]:
# Build the data object from the collected parameters: every key of
# `data_params` is passed to `generate_data` as a keyword argument.
# The rest of the notebook uses its `.df` attribute and its
# `get_train_data` / `get_test_data` / `get_holdout_data` methods.
data_obj = generate_data(**data_params)

In [0]:
# Matches every character that is neither an ASCII letter nor whitespace.
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')

# Filled on first use so the stop-word list is loaded once, not once per row.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Normalise one sentence for vectorisation.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is removed, the result is tokenised with ``word_tokenize``,
    English stop words are dropped, and the remaining tokens are joined with
    single spaces.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        The cleaned sentence as a single space-separated string (empty if no
        token survives).
    """
    cleaned = _NON_LETTER_PATTERN.sub('', text.lower())
    stop_words = _english_stop_words()
    return ' '.join(
        token for token in word_tokenize(cleaned) if token not in stop_words
    )

# Discard rows with missing values, then remove the 'neutral' class.
data_obj.df = data_obj.df.dropna()
non_neutral_rows = data_obj.df['Sentiment'].ne('neutral')
data_obj.df = data_obj.df.loc[non_neutral_rows]

# Add the cleaned version of every sentence as its own column.
cleaned_sentences = data_obj.df['Sentence'].apply(preprocess_text)
data_obj.df['preprocessed_text'] = cleaned_sentences

# Integer-encode the remaining sentiment labels; `enc` is kept so the
# mapping stays available after this cell.
enc = LabelEncoder()
sentiment_codes = enc.fit_transform(data_obj.df['Sentiment'])
data_obj.df['encoded_sentiment'] = sentiment_codes

# Show the resulting frame as the cell output.
data_obj.df

In [0]:
# Learn a TF-IDF vocabulary capped at 100 features.
# It is fitted on 'preprocessed_text' because that is the column this notebook
# later passes to `tfidf_vectorizer.transform`; fitting on the raw 'Sentence'
# column would learn the vocabulary from a different text than the one
# being vectorised (stop words and non-letter tokens that the cleaned text
# no longer contains).
# NOTE(review): the fit uses every row of `data_obj.df`, which may include rows
# that end up in the test/holdout partitions - consider fitting on the training
# partition only.
tfidf_vectorizer = TfidfVectorizer(max_features=100)
tfidf_vectorizer.fit(data_obj.df['preprocessed_text'])

In [0]:
# Pull the three partitions out of the data object in one step.
train, test, holdout = (
    data_obj.get_train_data(),
    data_obj.get_test_data(),
    data_obj.get_holdout_data(),
)

In [0]:
# `data_params` is built without a 'model_type' entry, so indexing it directly
# raised KeyError. Use the entry if one is ever added; otherwise read the value
# from a notebook widget, like the other run parameters.
# NOTE(review): assumes a 'model_type' widget is defined for this notebook -
# confirm with the job configuration.
model_type = data_params.get('model_type')
if model_type is None:
    model_type = dbutils.widgets.get('model_type')

# Configure the model pipeline. The `params` dictionary is handed to
# ModelPipeline unchanged (presumably forwarded to the underlying estimator -
# confirm in utils.models).
# NOTE(review): 'class_prior' is hard-coded for two classes; verify that its
# order matches the label encoding used for 'encoded_sentiment'.
model = ModelPipeline(
    model_name=data_params['name'],
    model_type=model_type,
    params={
        'fit_prior': True,
        'alpha': 1.0,
        'class_prior': [0.6, 0.4],
    },
)

In [0]:
# Fetch the training and test partitions used for fitting below.
train = data_obj.get_train_data()
test = data_obj.get_test_data()

In [0]:
# Vectorise the cleaned text of both partitions with the fitted TF-IDF model.
train_features = tfidf_vectorizer.transform(train['preprocessed_text'])
test_features = tfidf_vectorizer.transform(test['preprocessed_text'])

# Train on the training partition; the test features and labels are passed as
# the third and fourth arguments (presumably for evaluation - confirm against
# ModelPipeline.train).
model.train(
    train_features,
    train['encoded_sentiment'],
    test_features,
    test['encoded_sentiment'],
)