In [None]:
import time
from sys import platform

import pandas as pd

from src.data.make_dataset import main as make_dataset
from src.models.sentiment_analysis.training_pipeline import train_models, find_best_model
from src.models.sentiment_analysis.log_reg import LogReg
from src.models.sentiment_analysis.xg_boost import XgBoost
from src.models.sentiment_analysis.xg_boost_svd import XgBoostSvd
from src.models.sentiment_analysis.LSTM import BasicLSTM

# Sentiment Analysis

## Data Preprocessing and Feature Engineering
The `make_dataset` function takes the raw data frame and two output file paths.
It uses our `Preprocessor` and `FeatureEngineer` classes to process the data, performs a train-test split, and returns the respective X/y train and test sets.
The train and test sets are also written to CSV files, so `make_dataset` only needs to be run once; the processed data can be loaded from disk whenever it is needed again.

### Preprocessing

Elaborate on preprocessing steps

### Feature Engineering

Elaborate on new features generated


In [None]:
# Build the processed train/test splits from the raw reviews.
# The two keyword arguments are the CSV paths handed to make_dataset for the
# train split and the test split respectively.
data_wo_sentiment = pd.read_csv("../../data/raw/reviews_test.csv")
data = pd.read_csv("../../data/raw/reviews.csv")
X_train, X_test, y_train, y_test = make_dataset(
    data,
    test_split_output_filepath="../../data/processed/test_final_processed_reviews.csv",
    train_split_output_filepath="../../data/processed/train_final_processed_reviews.csv",
)

In [None]:
# Reload the processed splits from their CSV files; the "Unnamed: 0" column
# of each file is used as the frame index.
train = pd.read_csv(
    "../../data/processed/train_final_processed_reviews.csv",
    index_col="Unnamed: 0",
)
test = pd.read_csv(
    "../../data/processed/test_final_processed_reviews.csv",
    index_col="Unnamed: 0",
)

# Features are every column except the "sentiment" label.
X_train, X_test = (split.drop(columns="sentiment") for split in (train, test))

# Labels are kept as plain Python lists.
y_train, y_test = (split["sentiment"].tolist() for split in (train, test))

## Training Pipeline
- Initialise all models and store in dict
- Train all models on same X_train
- Save (pickle) the trained models
- Evaluate models
    - Load saved models
    - Predict on same X_test
    - Calculate metrics
- Select best model

### Models Used

#### XgBoost
**TODO:** Explain the model.

#### Add more models




In [None]:
# Directory handed to every model, train_models and find_best_model;
# Windows gets the backslash-separated form of the same relative path.
models_path = (
    "..\\..\\models\\sentiment_analysis"
    if platform == "win32"
    else "../../models/sentiment_analysis"
)

# One instance per candidate model, built in the listed order.
models = {
    name: model_cls(models_path)
    for name, model_cls in (
        ("xg_boost", XgBoost),
        ("xg_boost_svd", XgBoostSvd),
        ("log_reg", LogReg),
        ("LSTM", BasicLSTM),
        # Add other (name, model class) pairs here
    )
}

# Train the models and save them, then pick the best one on the test split.
train_models(models, X_train, y_train, models_path)
best_model, best_model_name, best_accuracy = find_best_model(
    models, models_path, X_test, y_test
)
print(f"Best model: {best_model_name}", f"Accuracy: {best_accuracy}", sep="\n")

In [None]:
def run_scoring_pipeline(
    dataset_wo_sentiment,
    output_filepath="reviews_test_predictions_data-dialogue.csv",
):
    """Preprocess reviews, score them with the loaded model, and write the predictions to CSV.

    Parameters
    ----------
    dataset_wo_sentiment
        Raw reviews to score; handed directly to ``Preprocessor``.
        Presumably the frame read from ``reviews_test.csv`` (no sentiment
        column) -- TODO confirm against the caller.
    output_filepath : str, optional
        Where the predictions CSV is written. Defaults to the file name that
        was previously hard-coded, so existing calls behave the same.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns "Text", "Time",
        "predicted_sentiment_probability" and "predicted_sentiment".

    Notes
    -----
    NOTE(review): ``Preprocessor``, ``FeatureEngineer`` and ``model`` are not
    imported or assigned in any visible cell, and ``best_model`` comes from the
    training cell. They must exist in the notebook namespace before this
    function is called -- confirm and add the missing imports/assignment.
    """
    start = time.time()

    # Fix: the original passed the undefined name `input_df` here, so the
    # function raised NameError and silently ignored its own argument.
    preprocessor = Preprocessor(dataset_wo_sentiment)
    preprocessor.clean_test_csv()

    feature_engineer = FeatureEngineer(preprocessor.clean_df)
    feature_engineer.add_features()
    feature_engineered_df = feature_engineer.feature_engineered_df

    total_time_fe = time.time() - start
    print("\n" + "Preprocessing and Feature Engineering finished in " + str(round(total_time_fe)) + "s")

    # "time" is kept aside for the output file and removed from the frame
    # that is passed to the model.
    time_col = feature_engineered_df["time"]
    X_test = feature_engineered_df.drop(["time"], axis=1)

    # NOTE(review): `model` is read from the notebook namespace (see Notes);
    # whether `load` expects the object held in `best_model` or a saved-model
    # name cannot be told from this notebook -- confirm.
    model.load(best_model)
    pred = model.predict(X_test)

    # The total runtime used to be computed and then dropped; report it.
    total_time = time.time() - start
    print("Scoring pipeline finished in " + str(round(total_time)) + "s")

    # The output file should be named "reviews_test_predictions_<your_group_name>.csv",
    # and it should have the columns "Text", "Time",
    # "predicted_sentiment_probability" and "predicted_sentiment".
    output = pd.DataFrame(
        {
            "Text": X_test["text"],
            "Time": time_col,
            "predicted_sentiment_probability": pred["predicted_sentiment_probability"],
            "predicted_sentiment": pred["predicted_sentiment"],
        }
    )

    output.to_csv(output_filepath, index=False)

    return output