## Data Preprocessing

In [1]:
import pandas as pd
import pickle
import torch

from utils.helpers import Utility

In [2]:
# Load the training split. Unlike the validation/test loads, this call skips
# the file's first line and parses the following two lines as a two-level
# column header.
# NOTE(review): confirm train.csv really has that layout — if it is shaped like
# valid.csv, these options would consume data rows as header rows.
train_df = pd.read_csv(
    filepath_or_buffer="data/train.csv",
    header=[0, 1],
    skiprows=[0],
)

In [3]:
# Load the validation split using pandas' default CSV parsing options.
valid_df = pd.read_csv(filepath_or_buffer="data/valid.csv")

In [4]:
# Load the test split using pandas' default CSV parsing options.
test_df = pd.read_csv(filepath_or_buffer="data/test.csv")

In [5]:
# Replace the parsed header with the two column names used downstream.
# The assignment raises if the frame does not have exactly two columns.
train_df.columns = pd.Index(['review', 'label'])

# train_df.head()  # uncomment to preview the renamed frame

In [6]:
# Replace the parsed header with the two column names used downstream.
# The assignment raises if the frame does not have exactly two columns.
valid_df.columns = pd.Index(['review', 'label'])

# valid_df.head()  # uncomment to preview the renamed frame

In [7]:
# Replace the parsed header with the two column names used downstream.
# The assignment raises if the frame does not have exactly two columns.
test_df.columns = pd.Index(['review', 'label'])

# test_df.head()  # uncomment to preview the renamed frame

In [8]:
# Pull the text column and the target column out of each split, one split
# per line.
train_reviews, train_labels = train_df['review'], train_df['label']
valid_reviews, valid_labels = valid_df['review'], valid_df['label']
test_reviews, test_labels = test_df['review'], test_df['label']

In [9]:
# Run every split's reviews through the same escaping helper, in
# train / valid / test order.
tn_reviews, vd_reviews, tt_reviews = map(
    Utility.escape_special_characters,
    (train_reviews, valid_reviews, test_reviews),
)

In [10]:
# Build the review vocabulary from the escaped training reviews only; the
# same vocabulary is reused for the validation and test splits further down.
review_vocab = Utility.generate_review_vocab(tn_reviews)

# review_vocab

In [11]:
# Derive the label vocabulary from the training labels.
label_vocab = Utility.generate_label_vocab(train_labels)

# Bare last expression so the notebook displays the resulting vocabulary.
label_vocab

[0, 1]

In [12]:
# Encode each split's escaped reviews against the vocabulary built from the
# training reviews. Kept as three separate assignments so a tensor that has
# already been computed stays bound if a later call fails.
# NOTE(review): the recorded progress bars show roughly 44 min for train and
# 12 min each for valid/test — consider caching these tensors to disk.
tn_review_tensor = Utility.generate_review_tensor(tn_reviews, review_vocab)
vd_review_tensor = Utility.generate_review_tensor(vd_reviews, review_vocab)
tt_review_tensor = Utility.generate_review_tensor(tt_reviews, review_vocab)

100%|██████████| 18522/18522 [43:58<00:00,  7.02it/s] 
100%|██████████| 5000/5000 [11:49<00:00,  7.04it/s]
100%|██████████| 5000/5000 [11:58<00:00,  6.96it/s]


In [13]:
# Convert each split's label Series with the shared helper, in
# train / valid / test order.
tn_label_tensor, vd_label_tensor, tt_label_tensor = (
    Utility.pandas_series_to_torch_tensor(labels)
    for labels in (train_labels, valid_labels, test_labels)
)

In [14]:
# Bundle every preprocessing artefact (escaped reviews, encoded reviews,
# label tensors and both vocabularies) into a single mapping.
# NOTE(review): despite the name, this is a plain dict of preprocessing
# outputs, not a model's state_dict.
state_dict = dict(
    tn_reviews=tn_reviews,
    vd_reviews=vd_reviews,
    tt_reviews=tt_reviews,
    tn_review_tensor=tn_review_tensor,
    vd_review_tensor=vd_review_tensor,
    tt_review_tensor=tt_review_tensor,
    tn_label_tensor=tn_label_tensor,
    vd_label_tensor=vd_label_tensor,
    tt_label_tensor=tt_label_tensor,
    review_vocab=review_vocab,
    label_vocab=label_vocab,
)

# Persist the bundle to a file in the current working directory.
torch.save(state_dict, 'state_dict.pt')