<a href="https://colab.research.google.com/github/younesabdolmalaky/A-Dual-Channel-Approach-for-Farsi-Text-Classification-using-Transfer-Learning-Techniques/blob/main/notebooks/data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download bittlingmayer/amazonreviews
! unzip amazonreviews.zip

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading amazonreviews.zip to /content
 99% 488M/493M [00:03<00:00, 169MB/s]
100% 493M/493M [00:03<00:00, 166MB/s]
Archive:  amazonreviews.zip
  inflating: test.ft.txt.bz2         
  inflating: train.ft.txt.bz2        


In [5]:
import numpy as np
import bz2
import re
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input , Dense
from tensorflow.keras.utils import plot_model
from tensorflow.keras import callbacks
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [22]:
def get_labels_and_texts(file):
    """Read a bz2-compressed, line-oriented file into labels and texts.

    Every line is decoded as UTF-8. The character at index 9 is parsed as an
    integer label and shifted down by one; everything from index 10 onward is
    kept as the text, with surrounding whitespace stripped.
    (Assumes the fastText layout ``__label__<digit> <text>``, whose prefix is
    9 characters long -- the code itself only relies on the index positions.)

    Args:
        file: Path or file object accepted by ``bz2.BZ2File``.

    Returns:
        A tuple ``(labels, texts)``: ``labels`` is a NumPy array with one
        entry per line, ``texts`` is a list of strings in file order.

    Raises:
        ValueError: If the character at index 9 of a line is not a digit.
        IndexError: If a line is shorter than 10 characters.
    """
    labels = []
    texts = []
    # The context manager closes the archive even if a line fails to parse;
    # iterating over a bare BZ2File(...) left the handle open.
    with bz2.BZ2File(file) as archive:
        for raw_line in archive:
            line = raw_line.decode("utf-8")
            labels.append(int(line[9]) - 1)
            texts.append(line[10:].strip())
    return np.array(labels), texts
# Parse both compressed splits from the current working directory into
# (labels, texts) pairs.
train_labels, train_texts = get_labels_and_texts('train.ft.txt.bz2')
test_labels, test_texts = get_labels_and_texts('test.ft.txt.bz2')


In [23]:
# Matches any character that is not a word character (letter, digit or underscore).
NON_ALPHANUM = re.compile(r'[\W]')
# Matches anything other than lowercase ASCII letters, digits and whitespace.
# The class previously read `0-1`, which silently deleted the digits 2-9.
NON_ASCII = re.compile(r'[^a-z0-9\s]')


def normalize_texts(texts):
    """Lowercase each text and reduce it to ASCII letters, digits and spaces.

    Steps applied to every text, in order:
      1. lowercase it;
      2. replace every non-word character with a single space;
      3. delete every remaining character that is not ``a-z``, ``0-9`` or
         whitespace (this drops underscores and non-ASCII letters).

    Args:
        texts: Iterable of strings.

    Returns:
        A new list with one normalized string per input text, in input order.
    """
    return [
        NON_ASCII.sub(r'', NON_ALPHANUM.sub(r' ', text.lower()))
        for text in texts
    ]
        
# Normalize both splits; the raw texts are replaced by their normalized form.
train_texts = normalize_texts(train_texts)
test_texts = normalize_texts(test_texts)

In [24]:
# Fit the TF-IDF vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

In [25]:
# Keep the 1000 TF-IDF features with the highest chi-squared score against the
# training labels; the selector is fitted on the training split and then
# applied unchanged to the test split.
Chi2 = SelectKBest(chi2, k=1000)
X_chi2_train = Chi2.fit_transform(X_train, train_labels)
X_chi2_test = Chi2.transform(X_test)

In [26]:
# Drop the full TF-IDF matrices; only the chi-squared-selected features are used from here on.
del X_train, X_test

In [27]:
# Persist the selected feature matrices and the fitted transformers to the
# current working directory as pickle files.
# NOTE: only load these pickles from a trusted location -- unpickling
# untrusted data can execute arbitrary code.
with open('X_chi2_train.pickle', 'wb') as handle:
    pickle.dump(X_chi2_train, handle)

with open('X_chi2_test.pickle', 'wb') as handle:
    pickle.dump(X_chi2_test, handle)

# Fitted TF-IDF vectorizer.
with open('vectorizer.pickle', 'wb') as handle:
    pickle.dump(vectorizer, handle)

# Fitted chi-squared feature selector.
with open('Chi2.pickle', 'wb') as handle:
    pickle.dump(Chi2, handle)

In [28]:
# Everything below was written to disk in the previous cell; drop the in-memory copies.
del X_chi2_train, X_chi2_test, vectorizer, Chi2

In [29]:
# Vocabulary size cap passed to the Keras Tokenizer as `num_words`.
MAX_FEATURES = 12000
tokenizer = Tokenizer(num_words=MAX_FEATURES)
# Build the word index from the training texts only.
tokenizer.fit_on_texts(train_texts)

In [30]:
# Convert each text into a sequence of integer token ids using the fitted tokenizer.
# NOTE(review): this rebinds train_texts/test_texts from strings to id sequences,
# so the cell is likely not safe to re-run without re-running the normalization cell first.
train_texts = tokenizer.texts_to_sequences(train_texts)
test_texts = tokenizer.texts_to_sequences(test_texts)

In [31]:
# Length of the longest training sequence; both splits are brought to this
# length by pad_sequences (using its default padding/truncation settings).
MAX_LENGTH = max(map(len, train_texts))
train_texts = pad_sequences(train_texts, maxlen=MAX_LENGTH)
test_texts = pad_sequences(test_texts, maxlen=MAX_LENGTH)

In [32]:
# Display the sequence length computed from the training split.
MAX_LENGTH

255

In [33]:
# Persist the padded sequences, the labels and the fitted tokenizer to the
# current working directory as pickle files.
# NOTE: only load these pickles from a trusted location -- unpickling
# untrusted data can execute arbitrary code.
with open('train_pad_sequences.pickle', 'wb') as handle:
    pickle.dump(train_texts, handle)

with open('train_labels.pickle', 'wb') as handle:
    pickle.dump(train_labels, handle)

with open('test_pad_sequences.pickle', 'wb') as handle:
    pickle.dump(test_texts, handle)

with open('test_labels.pickle', 'wb') as handle:
    pickle.dump(test_labels, handle)

# Fitted Keras tokenizer (needed to encode new texts consistently).
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle)


In [34]:
# The padded sequences and labels are on disk now; drop the in-memory copies.
del train_texts, train_labels, test_texts, test_labels

In [35]:
# Colab-only helpers for Google Drive access and user authentication.
from google.colab import drive
# NOTE(review): GoogleAuth is not referenced in the visible part of this notebook -- confirm it is needed.
from pydrive.auth import GoogleAuth
from google.colab import auth
# Mount the user's Google Drive under /content/drive.
drive.mount('/content/drive')
# Authenticate the Colab user (presumably so the upload commands can use these credentials -- TODO confirm).
auth.authenticate_user()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
# Install the gupload command-line tool used by the upload cells that follow.
!pip install gupload

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [37]:
# Upload every generated artifact from /content to the target specified by the
# --to id (the same id is used for all files).
# NOTE(review): the recorded output shows ^C right after the
# train_pad_sequences.pickle upload started -- confirm that file was uploaded completely.
!gupload --to '1ReTDMH4U64tjTaTI31Yc-BRMqs3wD7-K' /content/X_chi2_train.pickle
!gupload --to '1ReTDMH4U64tjTaTI31Yc-BRMqs3wD7-K' /content/X_chi2_test.pickle
!gupload --to '1ReTDMH4U64tjTaTI31Yc-BRMqs3wD7-K' /content/vectorizer.pickle
!gupload --to '1ReTDMH4U64tjTaTI31Yc-BRMqs3wD7-K' /content/Chi2.pickle
!gupload --to '1ReTDMH4U64tjTaTI31Yc-BRMqs3wD7-K' /content/train_pad_sequences.pickle
!gupload --to '1ReTDMH4U64tjTaTI31Yc-BRMqs3wD7-K' /content/test_pad_sequences.pickle
!gupload --to '1ReTDMH4U64tjTaTI31Yc-BRMqs3wD7-K' /content/test_labels.pickle
!gupload --to '1ReTDMH4U64tjTaTI31Yc-BRMqs3wD7-K' /content/tokenizer.pickle
!gupload --to '1ReTDMH4U64tjTaTI31Yc-BRMqs3wD7-K' /content/train_labels.pickle

Uploading file: /content/X_chi2_train.pickle as: X_chi2_train.pickle
Uploading file: /content/X_chi2_test.pickle as: X_chi2_test.pickle
Uploading file: /content/vectorizer.pickle as: vectorizer.pickle
Uploading file: /content/Chi2.pickle as: Chi2.pickle
Uploading file: /content/train_pad_sequences.pickle as: train_pad_sequences.pickle
^C
Uploading file: /content/test_pad_sequences.pickle as: test_pad_sequences.pickle
Uploading file: /content/test_labels.pickle as: test_labels.pickle
Uploading file: /content/tokenizer.pickle as: tokenizer.pickle
Uploading file: /content/train_labels.pickle as: train_labels.pickle


In [45]:
# Compress the padded training sequences into train_pad_sequences.zip before uploading the archive.
!zip -r train_pad_sequences.zip /content/train_pad_sequences.pickle

updating: content/train_pad_sequences.pickle (deflated 87%)


In [47]:
# Upload the compressed training sequences using the same --to id as the other artifacts.
!gupload --to '1ReTDMH4U64tjTaTI31Yc-BRMqs3wD7-K' /content/train_pad_sequences.zip

Uploading file: /content/train_pad_sequences.zip as: train_pad_sequences.zip
