In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Encoder that maps each distinct sentiment string to an integer class id.
label_encoder = LabelEncoder()

# Load the reviews and attach the encoded sentiment as a new `label` column.
# NOTE(review): the 'Unnamed: 0' column in the preview suggests the CSV was
# written together with its index -- confirm whether it should be dropped.
df = pd.read_csv('movie-review-dataset.csv')
df = df.assign(label=label_encoder.fit_transform(df['sentiment']))
df.head(n=5)

Unnamed: 0,review,sentiment,label
0,"I absolutely loved this movie, the acting was ...",positive,2
1,The plot was weak and predictable.,negative,0
2,"It was an okay film, nothing too memorable.",neutral,1
3,Fantastic cinematography and great soundtrack!,positive,2
4,The movie was way too long and boring.,negative,0


In [2]:
# Toy corpus used by the vectorisation examples in this notebook.
sentences = [
    "I love this movie",
    "This film is terrible",
    "very terrible movie",
]

# Lower-case every sentence and split it on whitespace.
tokens = [sentence.lower().split() for sentence in sentences]

# Distinct words across the corpus, sorted so the id assignment is deterministic.
all_words = sorted({token for sentence_tokens in tokens for token in sentence_tokens})

# Real words start at id 2; ids 0 and 1 are reserved for the special tokens.
vocab = {word: word_id for word_id, word in enumerate(all_words, start=2)}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

def vectorizer(text, vocabulary=None):
    """Convert a sentence into a list of integer token ids.

    The text is lower-cased and split on whitespace; every word that is not a
    key of the mapping is replaced by the id stored under ``'<UNK>'``.

    Args:
        text: Sentence to encode.
        vocabulary: Optional word -> id mapping that contains an ``'<UNK>'``
            entry. When omitted, the notebook-level ``vocab`` is used, which
            keeps existing single-argument calls working unchanged.

    Returns:
        list[int]: One id per whitespace-separated token (empty list for
        empty or whitespace-only text).

    Raises:
        KeyError: If the mapping has no ``'<UNK>'`` entry.
    """
    mapping = vocab if vocabulary is None else vocabulary
    # Resolve the fallback id once instead of once per token.
    unk_id = mapping['<UNK>']
    return [mapping.get(word, unk_id) for word in text.lower().split()]

# Encode every toy sentence into its list of vocabulary ids.
vector_tokens = list(map(vectorizer, sentences))

vector_tokens

[[3, 5, 8, 6], [8, 2, 4, 7], [9, 7, 6]]

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag-of-words: learn the vocabulary from the toy sentences and count how
# often each term occurs in each sentence.
count_vectorizer = CountVectorizer()
matrix = count_vectorizer.fit_transform(sentences)

# Dense copy of the count matrix for printing, plus the learned feature names.
array_matrix = matrix.toarray()
count_vocab = count_vectorizer.get_feature_names_out()

In [4]:
# Show the feature names learned by the CountVectorizer.
print(count_vocab)

['film' 'is' 'love' 'movie' 'terrible' 'this' 'very']


In [6]:
# Print each sentence's count vector as a plain Python list, one row per line.
for row in array_matrix.tolist():
    print(row)

[0, 0, 1, 1, 0, 1, 0]
[1, 1, 0, 0, 1, 1, 0]
[0, 0, 0, 1, 1, 0, 1]


In [7]:
# TF-IDF: same toy sentences as the bag-of-words example, but each term is
# weighted instead of counted.
tdf_vectorizer = TfidfVectorizer()
tdf_matrix = tdf_vectorizer.fit_transform(sentences)

# Dense copy of the TF-IDF matrix for printing, plus the learned feature names.
array_tdf = tdf_matrix.toarray()
tdf_vocab = tdf_vectorizer.get_feature_names_out()

In [8]:
# Show the feature names learned by the TfidfVectorizer.
print(tdf_vocab)

['film' 'is' 'love' 'movie' 'terrible' 'this' 'very']


In [9]:
# Print each sentence's TF-IDF vector as a plain Python list, one row per line.
for row in array_tdf.tolist():
    print(row)

[0.0, 0.0, 0.680918560398684, 0.5178561161676974, 0.0, 0.5178561161676974, 0.0]
[0.5628290964997665, 0.5628290964997665, 0.0, 0.0, 0.4280460350631185, 0.4280460350631185, 0.0]
[0.0, 0.0, 0.0, 0.5178561161676974, 0.5178561161676974, 0.0, 0.680918560398684]


In [20]:
from torch.nn.utils.rnn import pad_sequence
import torch as tt

# One 1-D int64 tensor of token ids per sentence (sentence lengths differ).
tensor_tokens = [tt.tensor(token_ids, dtype=tt.int64) for token_ids in vector_tokens]

# Pad the shorter sequences with the <PAD> id so they stack into one
# batch-first 2-D tensor.
padder = pad_sequence(
    tensor_tokens,
    batch_first=True,
    padding_value=vocab['<PAD>'],
)

In [21]:
import torch.nn as nn

# Size of each embedding vector and number of rows in the lookup table
# (one row per vocabulary entry, special tokens included).
embed_dim = 4
vocab_len = len(vocab)

# Embedding lookup table; the row for <PAD> is designated as the padding row.
# NOTE(review): no RNG seed is set in this notebook, so the embedding values
# presumably differ between runs -- confirm whether reproducibility matters.
embedder_encoder = nn.Embedding(vocab_len, embed_dim, padding_idx=vocab['<PAD>'])

# Look up an embedding vector for every token id in the padded batch.
embedded_vector = embedder_encoder(padder)

In [23]:
# Display the embedded batch together with its shape.
embedded_vector, embedded_vector.shape

(tensor([[[ 0.4236, -0.1169, -0.8502, -0.0110],
          [-0.0871, -0.0061, -1.7055, -1.2549],
          [-0.7980, -0.4085,  1.4400,  0.4815],
          [ 0.0163,  1.0039,  0.1616,  0.6508]],
 
         [[-0.7980, -0.4085,  1.4400,  0.4815],
          [-0.2158,  1.0200,  0.7933,  0.0708],
          [-0.8643,  1.2187, -0.3154, -0.8860],
          [ 1.4429,  0.8388,  0.1403, -0.1048]],
 
         [[-0.1429,  0.1960, -0.3816, -0.2327],
          [ 1.4429,  0.8388,  0.1403, -0.1048],
          [ 0.0163,  1.0039,  0.1616,  0.6508],
          [ 0.0000,  0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>),
 torch.Size([3, 4, 4]))

In [24]:
import spacy

# Load the "en_core_web_sm" spaCy pipeline once at notebook level;
# spacy_tokenizer calls this shared `nlp` object for every text it processes.
nlp = spacy.load("en_core_web_sm")

# Stop words that carry polarity (negations) and therefore must survive
# stop-word removal in sentiment analysis.
_SENTIMENT_STOPWORD_EXCEPTIONS = frozenset({
    "no", "not", "nor", "never", "neither", "none", "nothing", "cannot",
})


def spacy_tokenizer(text, keep_stopwords=_SENTIMENT_STOPWORD_EXCEPTIONS):
    """
    Tokenize, lemmatize, remove stopwords (except critical sentiment words),
    punctuation, numbers, non-alpha tokens.

    Args:
        text: Raw document string; it is passed to the notebook-level ``nlp``
            pipeline.
        keep_stopwords: Collection of lower-case words that are kept even when
            spaCy flags them as stop words. Defaults to a small set of
            negations; pass an empty collection to drop every stop word.

    Returns:
        list[str]: Lower-cased lemmas of the surviving tokens, in document
        order.
    """
    kept = []
    for token in nlp(text):
        # Same structural filter as before: alphabetic tokens only, no
        # number-like tokens, no punctuation, no whitespace.
        if not token.is_alpha or token.like_num or token.is_punct or token.is_space:
            continue
        lemma = token.lemma_.lower()
        # Stop-word removal promised by the docstring; the keep-list is matched
        # against both the lemma and the surface form.
        # NOTE(review): non-alphabetic negation pieces (e.g. a token "n't") are
        # already removed by the is_alpha filter above -- confirm whether those
        # should be preserved as well.
        if (
            token.is_stop
            and lemma not in keep_stopwords
            and token.text.lower() not in keep_stopwords
        ):
            continue
        kept.append(lemma)
    return kept

In [25]:
# Bag-of-words over unigrams and bigrams using scikit-learn's default tokenizer.
custom_bow = CountVectorizer(
    # Uses scikit-learn's built-in English stop-word list.
    # TODO(review): the original note mentioned a custom stop-word file and
    # function -- confirm which one is intended here.
    stop_words='english',
    ngram_range=(1, 2),
    lowercase=True
)

# TF-IDF over unigrams and bigrams, tokenized by spacy_tokenizer.
custom_tdf = TfidfVectorizer(
    # No scikit-learn stop list here: any stop-word filtering is the
    # responsibility of spacy_tokenizer.
    stop_words=None,
    ngram_range=(1, 2),
    tokenizer=spacy_tokenizer,
    # The default regex pattern is ignored once a custom tokenizer is supplied;
    # setting it to None avoids scikit-learn's "token_pattern will not be used"
    # warning.
    token_pattern=None,
    # Hand the original casing to spaCy; spacy_tokenizer lower-cases the lemmas
    # it returns, so the resulting vocabulary is still lower-case.
    lowercase=False,
)