<a href="https://colab.research.google.com/github/younesabdolmalaky/A-Dual-Channel-Approach-for-Farsi-Text-Classification-using-Transfer-Learning-Techniques/blob/main/notebooks/data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install the Kaggle CLI, register the API token and fetch the dataset.
# kaggle.json must already be present in the current working directory.
! pip install kaggle
# -p: do not fail when ~/.kaggle already exists (e.g. when this cell is re-run).
! mkdir -p ~/.kaggle
! cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download bittlingmayer/amazonreviews
# -o: overwrite previously extracted files instead of prompting on a re-run.
! unzip -o amazonreviews.zip

mkdir: cannot create directory ‘/root/.kaggle’: File exists
Downloading amazonreviews.zip to /content
 99% 487M/493M [00:04<00:00, 155MB/s]
100% 493M/493M [00:04<00:00, 116MB/s]
Archive:  amazonreviews.zip
  inflating: test.ft.txt.bz2         
  inflating: train.ft.txt.bz2        


In [4]:
import numpy as np
import bz2
import re
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
def get_labels_and_texts(file):
    """Read a bz2-compressed, fastText-style file into labels and texts.

    Every non-blank line is expected to look like ``__label__<d> <text>``,
    where ``<d>`` is the single digit at index 9 (directly after the
    9-character ``__label__`` prefix) and the text starts at index 10.

    Parameters
    ----------
    file
        Anything accepted by ``bz2.BZ2File`` (typically a path string).

    Returns
    -------
    labels : numpy.ndarray
        The label digit minus 1, one entry per non-blank line.
    texts : list of str
        The remainder of each line, stripped of surrounding whitespace.

    Raises
    ------
    ValueError
        If the character at index 9 of a line is not a digit.
    IndexError
        If a non-blank line is shorter than 10 characters.
    """
    labels = []
    texts = []
    # Context manager so the compressed stream is closed even if parsing fails.
    with bz2.BZ2File(file) as stream:
        # Iterate in binary mode and decode per line: lines are split on
        # b"\n" only, exactly as before.
        for raw_line in stream:
            x = raw_line.decode("utf-8")
            if not x.strip():
                # Blank lines (e.g. a trailing newline) carry no example.
                continue
            labels.append(int(x[9]) - 1)
            texts.append(x[10:].strip())
    return np.array(labels), texts
# Parse the compressed train and test splits extracted from amazonreviews.zip.
train_labels, train_texts = get_labels_and_texts('train.ft.txt.bz2')
test_labels, test_texts = get_labels_and_texts('test.ft.txt.bz2')


In [6]:
# Any single non-word character (punctuation, symbols, whitespace).
NON_ALPHANUM = re.compile(r'[\W]')
# Anything that is not a lowercase ASCII letter, an ASCII digit or whitespace.
# The digit range must be 0-9: the former 0-1 silently dropped digits 2-9.
NON_ASCII = re.compile(r'[^a-z0-9\s]')


def normalize_texts(texts):
    """Lower-case and clean a collection of strings.

    For every text: convert to lower case, replace each non-word character
    with a single space, then delete every remaining character that is not
    ``a-z``, ``0-9`` or whitespace (this removes underscores and non-ASCII
    letters, which the first pattern leaves untouched).

    Parameters
    ----------
    texts : iterable of str
        Raw texts to normalize.

    Returns
    -------
    list of str
        The normalized texts, in input order. Runs of spaces are not
        collapsed, so punctuation next to a space yields two spaces.
    """
    normalized_texts = []
    for text in texts:
        lower = text.lower()
        no_punctuation = NON_ALPHANUM.sub(r' ', lower)
        no_non_ascii = NON_ASCII.sub(r'', no_punctuation)
        normalized_texts.append(no_non_ascii)
    return normalized_texts

# Normalize both splits. Each name is rebound to the cleaned list, so the raw
# strings are no longer reachable through train_texts / test_texts afterwards.
train_texts = normalize_texts(train_texts)
test_texts = normalize_texts(test_texts)

In [8]:
# Build the TF-IDF representation. The vectorizer is fitted on the training
# texts only; the test texts are transformed with that same fitted vectorizer.
# max_features limits the vocabulary to 10000 terms.
vectorizer = TfidfVectorizer(max_features = 10000)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

In [9]:
# Persist the TF-IDF matrices and the fitted vectorizer as pickle files.
# NOTE(review): the target path presumably requires Google Drive to be mounted
# at /content/drive; no mount cell is visible in this notebook - confirm.
with open('/content/drive/MyDrive/persian-sentiment-analysis/X_train_tfidf.pickle', 'wb') as handle:
    pickle.dump(X_train ,handle)


with open('/content/drive/MyDrive/persian-sentiment-analysis/X_test_tfidf.pickle', 'wb') as handle:
    pickle.dump(X_test, handle)


# The fitted vectorizer is saved as well, alongside the two matrices.
with open('/content/drive/MyDrive/persian-sentiment-analysis/vectorizer.pickle', 'wb') as handle:
    pickle.dump(vectorizer, handle)

In [None]:
# Drop the names of the TF-IDF objects now that they have been pickled, so the
# memory they hold can be reclaimed before the tokenizer stage.
del X_train, X_test, vectorizer

In [None]:
# Vocabulary size limit handed to the Keras Tokenizer through num_words.
MAX_FEATURES = 12000
tokenizer = Tokenizer(num_words=MAX_FEATURES)
# Build the word index from the training texts only.
tokenizer.fit_on_texts(train_texts)

In [None]:
# Convert every text into its sequence of integer word indices using the
# tokenizer fitted on the training texts.
# NOTE(review): train_texts / test_texts are rebound from strings to index
# sequences here, so this cell should not be re-run without first re-running
# the cells that produce the normalized strings.
train_texts = tokenizer.texts_to_sequences(train_texts)
test_texts = tokenizer.texts_to_sequences(test_texts)

In [None]:
# Length of the longest training sequence; both splits are padded/truncated
# to this length so they share the same second dimension.
MAX_LENGTH = max(map(len, train_texts))
train_texts = pad_sequences(train_texts, maxlen=MAX_LENGTH)
test_texts = pad_sequences(test_texts, maxlen=MAX_LENGTH)

In [None]:
# Display the sequence length that both splits were padded to.
MAX_LENGTH

In [None]:
# Persist the padded sequences, the label arrays and the fitted tokenizer as
# pickle files.
# NOTE(review): the target path presumably requires Google Drive to be mounted
# at /content/drive; no mount cell is visible in this notebook - confirm.
with open('/content/drive/MyDrive/persian-sentiment-analysis/train_pad_sequences.pickle', 'wb') as handle:
    pickle.dump(train_texts, handle)

with open('/content/drive/MyDrive/persian-sentiment-analysis/train_labels.pickle', 'wb') as handle:
    pickle.dump(train_labels, handle)

with open('/content/drive/MyDrive/persian-sentiment-analysis/test_pad_sequences.pickle', 'wb') as handle:
    pickle.dump(test_texts, handle)

with open('/content/drive/MyDrive/persian-sentiment-analysis/test_labels.pickle', 'wb') as handle:
    pickle.dump(test_labels, handle)

# The fitted tokenizer is saved as well, alongside the sequences and labels.
with open('/content/drive/MyDrive/persian-sentiment-analysis/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle)



In [None]:
# Drop the names of the padded sequences and label arrays now that they have
# been pickled, so the memory they hold can be reclaimed.
del train_texts, train_labels, test_texts, test_labels