# Training a Sentiment Analysis model from [Kaggle data](https://www.kaggle.com/yash612/stockmarket-sentiment-dataset?select=stock_data.csv)

> Download the dataset from the Kaggle link above and save it in this notebook's working directory as `stock_data.csv`.

In [1]:
import os
import wandb

In [2]:
# File name of the raw Kaggle CSV. It is a relative path, so it is resolved
# against the current working directory. A later cell rebinds this name to the
# copy of the file inside the downloaded artifact directory.
raw_csv_fname = "stock_data.csv"

In [3]:
# Authenticate with Weights & Biases, then start a run in the "aws_demo"
# project with job_type "data_upload"; the raw dataset artifact is logged
# to this run in the cells below.
wandb.login()
wandb.init(project="aws_demo", job_type="data_upload")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcapecape[0m (use `wandb login --relogin` to force relogin)


In [4]:
# Empty W&B table with "Sequence" and "Sentiment" columns.
# NOTE(review): no rows are added and `table` is not referenced by any later
# cell visible here — confirm whether this cell is still needed.
table = wandb.Table(columns=["Sequence", "Sentiment"])

In [5]:
# Create an artifact named "raw_dataset" of type "dataset" and attach the raw
# CSV file to it. The trailing `add_file` call is the cell's last expression,
# so its return value (a manifest entry) is displayed as the cell output.
ds_at = wandb.Artifact("raw_dataset", type="dataset")
ds_at.add_file(raw_csv_fname)

<ManifestEntry digest: 1PmYBFHgKJWQRIXSiKAQ9A==>

In [6]:
# Log the raw dataset artifact to the active run, then close the
# "data_upload" run.
wandb.log_artifact(ds_at)
wandb.finish()

VBox(children=(Label(value=' 0.46MB of 0.46MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

## Process data
- Map the `Sentiment` column (-1/1) to a `labels` column (0/1)
- Create the train/test split

In [7]:
# Start a new run in the same "aws_demo" project, this time with
# job_type "preprocess_data", for the split created below.
wandb.init(project="aws_demo", job_type="preprocess_data")

In [8]:
# Fetch the latest version of the "raw_dataset" artifact and download it.
# The returned value is used below as the directory that contains the CSV.
dataset_path = wandb.use_artifact("raw_dataset:latest").download()

In [9]:
# Point `raw_csv_fname` at the CSV inside the downloaded artifact directory.
# `os.path.basename` strips any directory that was already prepended, so
# re-running this cell yields the same path instead of nesting `dataset_path`
# a second time when it is a relative path.
raw_csv_fname = os.path.join(dataset_path, os.path.basename(raw_csv_fname))

In [10]:
import random
import pandas as pd

# Load the raw CSV from the downloaded artifact.
df = pd.read_csv(raw_csv_fname)

# Human-readable class names and the lookup tables between names and ids.
# NOTE(review): these lookups are keyed on the raw -1/1 sentiment values,
# whereas the `labels` column built below is encoded as 0/1 — confirm which
# encoding downstream consumers expect.
labels = ["negative", "positive"]
id2label = dict(zip((-1, 1), labels))
label2id = {label_name: label_id for label_id, label_name in id2label.items()}

# Re-encode the sentiment as a 0/1 `labels` column and drop the original column.
df = df.assign(labels=df["Sentiment"].map({-1: 0, 1: 1})).drop(columns=["Sentiment"])

In [11]:
def get_train_test_idxs(df, pct=0.1, seed=2022):
    """Return reproducible train/test row positions for ``df``.

    Args:
        df: Any sized object (only ``len(df)`` is used), typically a DataFrame.
        pct: Fraction of rows assigned to the test set; the test size is
            ``int(pct * len(df))``.
        seed: Seed for the module-level ``random`` generator.

    Returns:
        ``(train_idxs, test_idxs)``: two lists of row positions in
        ``0..len(df)-1``. ``test_idxs`` is in sampling order; ``train_idxs``
        holds the remaining positions in ascending order.

    Raises:
        ValueError: Propagated from ``random.sample`` when the requested test
            size is negative or larger than ``len(df)``.
    """
    # Reseeds the global RNG (as the original did) so the split is reproducible.
    random.seed(seed)
    n_rows = len(df)
    test_idxs = random.sample(range(n_rows), int(pct * n_rows))
    # Set membership keeps the filter linear; testing against the list made
    # this step quadratic in the number of rows.
    held_out = set(test_idxs)
    train_idxs = [i for i in range(n_rows) if i not in held_out]
    return train_idxs, test_idxs

In [12]:
def save_datasets(df, pct=0.1):
    """Split ``df`` into train/test sets and write them to CSV files.

    Args:
        df: DataFrame to split.
        pct: Fraction of rows that goes to the test set; forwarded to
            ``get_train_test_idxs``.

    Side effects:
        Prints a status message and writes ``train.csv`` and ``test.csv``
        (without the index column) to the current working directory.
    """
    train_idxs, test_idxs = get_train_test_idxs(df, pct)
    # The indices are row positions (0..len(df)-1), so select with .iloc;
    # .loc would treat them as index labels and pick the wrong rows (or raise)
    # whenever `df` does not carry the default RangeIndex.
    train_df, test_df = df.iloc[train_idxs], df.iloc[test_idxs]
    print("Saving splitted dataset")
    train_df.to_csv("train.csv", index=False)
    test_df.to_csv("test.csv", index=False)

In [13]:
# Split with the default test fraction (pct=0.1) and write train.csv and
# test.csv to the current working directory.
save_datasets(df)

Saving splitted dataset


In [14]:
# Bundle both split files into a single "dataset"-type artifact.
split_at = wandb.Artifact("splitted_dataset", type="dataset")
for split_csv in ("train.csv", "test.csv"):
    split_at.add_file(split_csv)

# Log the artifact to the active run, then close the run.
wandb.log_artifact(split_at)
wandb.finish()

VBox(children=(Label(value=' 0.46MB of 0.46MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…