In [None]:
!pip install unidecode

Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/74/65/91eab655041e9e92f948cb7302e54962035762ce7b518272ed9d6b269e93/Unidecode-1.1.2-py2.py3-none-any.whl (239kB)
[K     |█▍                              | 10kB 23.9MB/s eta 0:00:01[K     |██▊                             | 20kB 16.9MB/s eta 0:00:01[K     |████                            | 30kB 14.3MB/s eta 0:00:01[K     |█████▌                          | 40kB 13.4MB/s eta 0:00:01[K     |██████▉                         | 51kB 10.6MB/s eta 0:00:01[K     |████████▏                       | 61kB 12.2MB/s eta 0:00:01[K     |█████████▋                      | 71kB 10.7MB/s eta 0:00:01[K     |███████████                     | 81kB 11.6MB/s eta 0:00:01[K     |████████████▎                   | 92kB 11.1MB/s eta 0:00:01[K     |█████████████▊                  | 102kB 10.1MB/s eta 0:00:01[K     |███████████████                 | 112kB 10.1MB/s eta 0:00:01[K     |████████████████▍               | 12

In [None]:
import sys 
import os
import glob
import re
import pandas as pd
import numpy as np
from unidecode import unidecode
import tensorflow as tf
from tensorflow import keras
from tqdm import tqdm

import matplotlib as mpl
import matplotlib.pyplot as plt

import sklearn
from sklearn.model_selection import train_test_split

In [None]:
# Dataset retrieved from https://github.com/katakonst/sentiment-analysis-tensorflow/datasets
# NOTE(review): absolute path under /content/drive -- presumably a mounted
# Google Drive (Colab); adjust when running in another environment.
PATH = "/content/drive/MyDrive/fb_comments/resources_for_sentiment/romanian_movie_reviews.csv"

In [None]:
def read_reviews(path):
  """Load the review CSV and return a frame with `text` and a binary `label`.

  Args:
    path: location of a CSV file that has at least `text` and `label` columns.

  Returns:
    A new DataFrame holding only `text` and `label`, where `label` is an
    int64 column: 1 for rows whose original label is exactly "pos", 0 for
    everything else.

  Side effects:
    Prints the class counts of the encoded label column.
  """
  raw = pd.read_csv(path)
  # Explicit copy so the column assignment below never writes into a view of `raw`.
  data = raw[['text', 'label']].copy()
  # Vectorised comparison instead of a per-row lambda; the fixed dtype keeps
  # the column numeric even when the file contains no rows.
  data['label'] = (data['label'] == "pos").astype("int64")
  print(data.label.value_counts())
  return data

In [None]:
def preprocess_balancing(data, random_state=42):
  """Clean the review text and down-sample to a 50/50 class distribution.

  The text is transliterated with `unidecode` and newlines are replaced by
  spaces. The larger class is then randomly down-sampled to the size of the
  smaller one and the result is shuffled.

  Args:
    data: DataFrame with a string `text` column and a `label` column that
      uses 1 for positive and 0 for negative reviews. It is not modified.
    random_state: seed used for both the down-sampling and the final shuffle,
      so repeated runs produce the same rows in the same order.

  Returns:
    A new, shuffled DataFrame containing equally many rows of each class.

  Side effects:
    Prints the class counts of the balanced frame.
  """
  #50/50 distribution
  # Work on a copy: the caller's frame must not be rewritten in place.
  data = data.copy()
  data["text"] = data["text"].apply(lambda x: unidecode(x).replace("\n", " "))
  positive_texts = data[data.label==1]
  negative_texts = data[data.label==0]
  # Down-sample whichever class is larger; sampling the positives
  # unconditionally raises when they are the minority class.
  target_size = min(len(positive_texts), len(negative_texts))
  if len(positive_texts) > target_size:
    positive_texts = positive_texts.sample(target_size, random_state=random_state)
  elif len(negative_texts) > target_size:
    negative_texts = negative_texts.sample(target_size, random_state=random_state)
  data_balanced = pd.concat([positive_texts, negative_texts], axis=0)
  data_balanced = data_balanced.sample(frac=1, random_state=random_state)
  print(data_balanced.label.value_counts())
  return data_balanced
  

In [None]:
# Load the raw reviews, then clean the text and balance the two classes.
# NOTE(review): `data` is rebound on the second line, so the unbalanced frame
# is no longer reachable afterwards.
data = read_reviews(PATH)
data = preprocess_balancing(data)

1    15653
0    11654
Name: label, dtype: int64
1    11654
0    11654
Name: label, dtype: int64


In [None]:
# Sanity check of the balanced frame: object type, row count and first rows.
print(type(data))
print(len(data))
data.head()

<class 'pandas.core.frame.DataFrame'>
23308


Unnamed: 0,text,label
5907,foarte multumit super,1
22651,suspendarea enorma a necredintei este necesara...,0
4265,aceasta serie de televiziune cu animatie este ...,1
25296,recenta editie dvd a good humor man eticheteaz...,0
22166,ce tot spui de violenta? daca mie mi-a placut ...,0


In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
# Fit a Keras Tokenizer with default settings on the review texts; this
# populates `tokenizer.word_index`, which the next cell reads.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data.text)

In [None]:
# Size of the tokenizer's word index plus one extra slot
# (Keras word indices start at 1; index 0 is reserved).
vocab_size = 1 + len(tokenizer.word_index)
print("Vocabulary size: {}".format(vocab_size))
# Longest review, measured in whitespace-separated words of the raw text.
max_length = max(len(text.split()) for text in data.text)
print("Maximum token length: {}".format(max_length))

Vocabulary size: 58994
Maximum token length: 373


In [None]:
def preprocess(X_batch):
  """Turn a batch of raw review strings into batches of lowercase word tokens.

  Steps, in order: keep the first 300 units of every string (bytes, the
  default unit of `tf.strings.substr`), lowercase, replace `<br>` / `<br/>`
  tags with a space, replace every non-letter character with a space, then
  split on whitespace.

  Args:
    X_batch: string tensor holding one review per element.

  Returns:
    The result of `tf.strings.split` on the cleaned strings: one variable
    length list of word tokens per input review.
  """
  X_batch = tf.strings.substr(X_batch, 0, 300)
  X_batch = tf.strings.lower(X_batch)
  X_batch = tf.strings.regex_replace(X_batch, b"<br\\s*/?>", b" ")
  # The class must end at `Z`: the range `A-z` also spans the ASCII characters
  # [ \ ] ^ _ ` that sit between `Z` and `a`, so they would survive as "letters".
  X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z]", b" ")
  X_batch = tf.strings.split(X_batch)
  return X_batch


In [None]:
# Display the balanced DataFrame (the rendered output elides the middle rows).
data

Unnamed: 0,text,label
5907,foarte multumit super,1
22651,suspendarea enorma a necredintei este necesara...,0
4265,aceasta serie de televiziune cu animatie este ...,1
25296,recenta editie dvd a good humor man eticheteaz...,0
22166,ce tot spui de violenta? daca mie mi-a placut ...,0
...,...,...
15963,acest lucru ar trebui sa fie unul dintre cele ...,0
25574,"ca un adevarat canadian, evit mereu filmele ca...",0
13190,care a fost distrusa rapid de calitatea proast...,1
11617,o versiune actualizata a unei teme care a fost...,1


In [None]:
# Wrap the text and label columns as two separate tf.data datasets,
# each yielding one element per review.
x_all = tf.data.Dataset.from_tensor_slices(data.text)
y_all = tf.data.Dataset.from_tensor_slices(data.label)

In [None]:
from collections import Counter
# Word-frequency counter, filled below from the preprocessed reviews.
vocabulary = Counter()

# Run `preprocess` over batches of 32 texts and count every token of every review.
for x_batch in x_all.batch(32).map(preprocess):
  for review in x_batch:
    vocabulary.update(list(review.numpy()))

In [None]:
# Ten most frequent words together with their counts.
vocabulary.most_common(10)

[(b'de', 44594),
 (b'si', 28578),
 (b'a', 25533),
 (b'este', 20946),
 (b'nu', 19697),
 (b'in', 19629),
 (b'sa', 18917),
 (b'un', 18905),
 (b'ca', 18514),
 (b'o', 17919)]

In [None]:
# Keep only the `vocab_limit` most frequent words, most frequent first.
vocab_limit = 20000
truncated_vocab = [word for word, _count in vocabulary.most_common(vocab_limit)]

In [None]:
# Number of extra hash buckets that receive words missing from the vocabulary.
num_oov_buckets = 1000

# Map every kept word to its rank in `truncated_vocab` (0 = most frequent).
words = tf.constant(truncated_vocab)
word_ids = tf.range(len(truncated_vocab), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init,
    num_oov_buckets=num_oov_buckets,
)

In [None]:
def encode_words(X_batch, y_batch):
  """Look up the ids of the words in X_batch; labels pass through unchanged.

  Args:
    X_batch: batch of word tokens to translate via the module-level `table`.
    y_batch: labels belonging to the batch.

  Returns:
    A tuple `(ids, y_batch)` where `ids` is `table.lookup(X_batch)`.
  """
  encoded_batch = table.lookup(X_batch)
  return encoded_batch, y_batch

In [None]:
# Text dataset in batches of 32 reviews, each batch tokenised by `preprocess`.
x_all_preprocessed = x_all.batch(32).map(preprocess)
