## Data Preprocessing

In [1]:
import pandas as pd
import re
import numpy as np
import torch

from collections import Counter
from tqdm import tqdm

In [2]:
train_df = pd.read_csv("data/train.csv", encoding='utf-8')

In [3]:
test_df = pd.read_csv("data/test.csv", encoding='utf-8')

In [4]:
train_tweets = train_df['tweet']
train_labels = train_df['label']

In [5]:
train_tweets[0]

' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run'

In [6]:
reviews = []

In [7]:
for tweet in train_tweets:
    review = tweet.lower()
    review = re.sub('[^A-Za-z]+', ' ', review)
    reviews.append(review)

In [8]:
print(reviews[0])

 user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction run


In [9]:
text = "\n".join(reviews)

# print(text)

In [10]:
len(text.split("\n") ) == len(reviews)

True

In [11]:
total_counts = Counter()

In [12]:
for review in reviews:
    for word in review.split():
        if len(word) > 2:
            total_counts[word] = 1

In [13]:
review_vocab = list(sorted(set(total_counts.keys())))

# review_vocab

In [14]:
label_vocab = list(sorted(set(train_labels)))

# label_vocab

In [15]:
word_to_int = dict()

for index, word in enumerate(review_vocab):
    word_to_int[word] = index

In [16]:
review_arr = np.zeros((len(reviews), len(review_vocab)))

In [17]:
label_to_int = dict()

for index, label in enumerate(label_vocab):
    label_to_int[label] = index

In [18]:
review_arr.shape

(31962, 37105)

In [19]:
for index, review in enumerate(tqdm(reviews)):
    for word in review.split():
        if word in review_vocab:
            review_arr[index][word_to_int[word]] = 1

100%|██████████| 31962/31962 [02:27<00:00, 216.92it/s]


In [20]:
review_tensor = torch.from_numpy(review_arr)

In [21]:
label_tensor = torch.from_numpy(train_labels.to_numpy())

In [22]:
test_reviews = []

for tweet in test_df['tweet']:
    review = tweet.lower()
    review = re.sub('[^A-Za-z]+', ' ', review)
    test_reviews.append(review)

In [23]:
test_review_arr = np.zeros((len(test_reviews), len(review_vocab)))

In [24]:
test_review_arr.shape

(17197, 37105)

In [25]:
for index, review in enumerate(tqdm(test_reviews)):
    for word in review.split():
        if word in review_vocab:
            test_review_arr[index][word_to_int[word]] = 1

100%|██████████| 17197/17197 [01:23<00:00, 207.19it/s]


In [26]:
test_review_tensor = torch.from_numpy(test_review_arr)

In [27]:
test_label_tensor = torch.zeros(len(test_reviews), 1)

In [28]:
test_label_tensor.shape

torch.Size([17197, 1])

In [29]:
state_dict = {
    'review_tensor': review_tensor,
    'label_tensor': label_tensor,
    'test_review_tensor': test_review_tensor,
    'test_label_tensor': test_label_tensor,
    'input_dim': len(review_vocab),
    'output_dim': len(label_vocab),
    'review_vocab': review_vocab,
    'label_vocab': label_vocab,
    'reviews': reviews,
    'labels': train_labels.to_numpy(),
    'test_reviews': test_reviews
}

torch.save(state_dict, 'state_dict.pt')