<a href="https://colab.research.google.com/github/younesabdolmalaky/A-Dual-Channel-Approach-for-Farsi-Text-Classification-using-Transfer-Learning-Techniques/blob/main/notebooks/data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download bittlingmayer/amazonreviews
! unzip amazonreviews.zip

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading amazonreviews.zip to /content
100% 492M/493M [00:12<00:00, 44.6MB/s]
100% 493M/493M [00:12<00:00, 40.4MB/s]
Archive:  amazonreviews.zip
  inflating: test.ft.txt.bz2         
  inflating: train.ft.txt.bz2        


In [None]:
import numpy as np
import bz2
import re
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input , Dense
from tensorflow.keras.utils import plot_model
from tensorflow.keras import callbacks
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import  f_classif

In [None]:
def get_labels_and_texts(file):
    """Read a bz2-compressed, line-oriented dataset into labels and texts.

    Every line is decoded as UTF-8. The character at index 9 is parsed as
    the label digit and shifted down by one; everything from index 10 on,
    stripped of surrounding whitespace, is the text.
    Assumes each line starts with a 9-character prefix (presumably
    ``__label__``) followed by a single label digit -- TODO confirm.

    Args:
        file: Path (or file object) accepted by ``bz2.BZ2File``.

    Returns:
        A tuple ``(labels, texts)``: ``labels`` is a NumPy array of ints and
        ``texts`` is a list of strings, in file order.

    Raises:
        IndexError: If a line is shorter than 10 characters.
        ValueError: If the character at index 9 is not a digit.
    """
    labels = []
    texts = []
    # The context manager guarantees the archive is closed, even when a line
    # fails to decode or parse.
    with bz2.BZ2File(file) as archive:
        for raw_line in archive:
            line = raw_line.decode("utf-8")
            labels.append(int(line[9]) - 1)
            texts.append(line[10:].strip())
    return np.array(labels), texts
# Load both splits from the archives extracted into the working directory.
# Labels come back as NumPy arrays, texts as lists of strings.
train_labels, train_texts = get_labels_and_texts('train.ft.txt.bz2')
test_labels, test_texts = get_labels_and_texts('test.ft.txt.bz2')


In [None]:
# Matches any single non-word character (punctuation, whitespace, symbols).
NON_ALPHANUM = re.compile(r'[\W]')
# Matches anything that is not a lowercase ASCII letter, a digit or whitespace.
# The digit range must span 0-9; a narrower range strips digits from the text.
NON_ASCII = re.compile(r'[^a-z0-9\s]')


def normalize_texts(texts):
    """Lowercase each text and reduce it to ASCII letters, digits and whitespace.

    Processing per text:
      1. lowercase;
      2. replace every non-word character with a single space;
      3. delete every remaining character that is not ``a-z``, ``0-9`` or
         whitespace (this removes underscores and non-ASCII letters).

    Args:
        texts: Iterable of strings.

    Returns:
        A new list with one normalized string per input text, in input order.
    """
    return [
        NON_ASCII.sub(r'', NON_ALPHANUM.sub(r' ', text.lower()))
        for text in texts
    ]
        
# Replace the raw texts with their normalized form. The names are rebound one
# after the other, so the raw training list can be released before the test
# list is processed.
train_texts = normalize_texts(train_texts)
test_texts = normalize_texts(test_texts)

In [None]:
# Build TF-IDF features capped at 1000 features. The vectorizer is fitted on
# the training texts only and then reused, unchanged, to transform the test texts.
vectorizer = TfidfVectorizer(max_features = 1000)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

In [None]:
# Persist the TF-IDF matrices and the fitted vectorizer, one pickle file each,
# in the working directory.
for pickle_path, payload in (
    ('X_train_tfidf1.pickle', X_train),
    ('X_test_tfidf.pickle', X_test),
    ('vectorizer.pickle', vectorizer),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(payload, handle)

# Remove the loop variables so no extra reference keeps the last object alive
# once the originals are deleted.
del pickle_path, payload

In [None]:
# The TF-IDF artifacts are on disk now; drop the in-memory copies.
del X_train, X_test, vectorizer

In [None]:
# Vocabulary cap handed to the Keras Tokenizer through `num_words`
# (see the Keras Tokenizer documentation for the exact cut-off semantics).
MAX_FEATURES = 12000
tokenizer = Tokenizer(num_words=MAX_FEATURES)
# Fit on the training texts only, so the test split does not influence the vocabulary.
tokenizer.fit_on_texts(train_texts)

In [None]:
# Convert the normalized strings with the fitted tokenizer.
# NOTE: `train_texts` / `test_texts` are rebound here -- from this cell on they
# hold the tokenizer's sequences, not strings, so the cell must not be run twice.
train_texts = tokenizer.texts_to_sequences(train_texts)
test_texts = tokenizer.texts_to_sequences(test_texts)

In [None]:
# Pad every sequence to the length of the longest *training* sequence.
MAX_LENGTH = max(len(train_ex) for train_ex in train_texts)
train_texts = pad_sequences(train_texts, maxlen=MAX_LENGTH)
# NOTE(review): test sequences longer than MAX_LENGTH are presumably truncated
# by pad_sequences -- confirm against the Keras documentation.
test_texts = pad_sequences(test_texts, maxlen=MAX_LENGTH)

In [None]:
# Display the padding length computed from the training sequences.
MAX_LENGTH

255

In [None]:
# Persist the padded training sequences in two files, split at row 1,800,000.
# NOTE(review): the split is presumably there to keep each pickle at a
# manageable size -- confirm; the index is hard-coded rather than derived
# from len(train_texts).
with open('train_pad_sequences1.pickle', 'wb') as handle:
    pickle.dump(train_texts[0:1800000], handle)

with open('train_pad_sequences2.pickle', 'wb') as handle:
    pickle.dump(train_texts[1800000:], handle)

In [None]:
# Persist the labels, the padded test sequences and the fitted tokenizer,
# one pickle file each, in the working directory.
for pickle_path, payload in (
    ('train_labels.pickle', train_labels),
    ('test_pad_sequences.pickle', test_texts),
    ('test_labels.pickle', test_labels),
    ('tokenizer.pickle', tokenizer),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(payload, handle)

# Remove the loop variables so no extra reference keeps the last object alive.
del pickle_path, payload


In [None]:
# Everything below only uploads files from disk; release the large arrays.
del train_texts, train_labels, test_texts, test_labels

In [None]:
# Colab-only cell: mounts Google Drive and authenticates the current user.
from google.colab import drive
# NOTE(review): GoogleAuth is not used anywhere in the visible notebook --
# confirm whether this import is still needed.
from pydrive.auth import GoogleAuth
from google.colab import auth
# Mount the user's Drive at /content/drive.
drive.mount('/content/drive')
# Interactive authentication of the Colab user.
auth.authenticate_user()

Mounted at /content/drive


In [None]:
!pip install gupload

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gupload
  Downloading gupload-1.1.0-py3-none-any.whl (4.7 kB)
Collecting google-api-python-client==1.7.10
  Downloading google_api_python_client-1.7.10-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 KB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting click==7.0
  Downloading Click-7.0-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 KB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting uritemplate<4dev,>=3.0.0
  Downloading uritemplate-3.0.1-py2.py3-none-any.whl (15 kB)
Installing collected packages: uritemplate, click, google-api-python-client, gupload
  Attempting uninstall: uritemplate
    Found existing installation: uritemplate 4.1.1
    Uninstalling uritemplate-4.1.1:
      Successfully uninstalled uritemplate-4.1.1
  Attempting uninstall: click
    Foun

In [None]:
!gupload --to '1ReTDMH4U64tjTaTI31Yc-BRMqs3wD7-K' /content/X_train_tfidf1.pickle
!gupload --to '1ReTDMH4U64tjTaTI31Yc-BRMqs3wD7-K' /content/X_test_tfidf.pickle
!gupload --to '1ReTDMH4U64tjTaTI31Yc-BRMqs3wD7-K' /content/vectorizer.pickle
!gupload --to '1ReTDMH4U64tjTaTI31Yc-BRMqs3wD7-K' /content/train_pad_sequences1.pickle
!gupload --to '1ReTDMH4U64tjTaTI31Yc-BRMqs3wD7-K' /content/train_pad_sequences2.pickle
!gupload --to '1ReTDMH4U64tjTaTI31Yc-BRMqs3wD7-K' /content/test_pad_sequences.pickle
!gupload --to '1ReTDMH4U64tjTaTI31Yc-BRMqs3wD7-K' /content/test_labels.pickle
!gupload --to '1ReTDMH4U64tjTaTI31Yc-BRMqs3wD7-K' /content/tokenizer.pickle
!gupload --to '1ReTDMH4U64tjTaTI31Yc-BRMqs3wD7-K' /content/train_labels.pickle

Uploading file: /content/X_train_tfidf1.pickle as: X_train_tfidf1.pickle
^C
Uploading file: /content/X_test_tfidf.pickle as: X_test_tfidf.pickle
Uploading file: /content/vectorizer.pickle as: vectorizer.pickle
Uploading file: /content/train_pad_sequences1.pickle as: train_pad_sequences1.pickle
^C
Uploading file: /content/train_pad_sequences2.pickle as: train_pad_sequences2.pickle
^C
Uploading file: /content/test_pad_sequences.pickle as: test_pad_sequences.pickle
Uploading file: /content/test_labels.pickle as: test_labels.pickle
Uploading file: /content/tokenizer.pickle as: tokenizer.pickle
Uploading file: /content/train_labels.pickle as: train_labels.pickle
