In [1]:
"""
    
        Step 1 in the training: we convert the (human-readable) CSV
        with training data into number matrices with the appropriate
        shape, ready for the actual training of the classifier.
"""

import sys
import pickle
import json
import pandas as pd
import numpy as np
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

2024-06-09 12:54:28.668025: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# in
datasetInputPath = 'training/raw_dataset/'
# df
datasetInputFile = 'training/prepared_dataset/aggregated_df.csv'
# out
trainingDumpFile = 'training/prepared_dataset/training_data.pickle'

In [3]:
import pickle
labels = []
texts = []
raw_folds = [fold for fold in  os.listdir(datasetInputPath) if not fold.startswith('.') ]
for raw_folder in raw_folds:
    raw_files = [file for file in os.listdir(os.path.join(datasetInputPath, raw_folder)) if not file.startswith('.')]
    for raw_file in raw_files: 
        with open(os.path.join(*[datasetInputPath, raw_folder, raw_file]), 'r') as f:
            try:
                texts.append(''.join([line.strip() for line in f.readlines()]))
                labels.append(raw_folder)
            except:
                print("ERROR IN FILE READ {}".format(os.path.join(*[datasetInputPath, raw_folder, raw_file])))


In [4]:
raw_df = pd.DataFrame({"text": texts, "label": labels})


In [5]:
from random import randrange
other_texts = []
other_labels = []
for i in range(100):
    other_text = ""
    ind = randrange(100)
    for label in raw_df['label'].unique():
        source_text = list(raw_df[raw_df['label'] == label]['text'])[ind]
        other_text += ' '.join(source_text.split()[:200])
    other_texts.append(other_text)
    other_labels.append('other')


In [6]:
other_df = pd.DataFrame({"text": other_texts, "label": other_labels})

In [7]:
raw_df = pd.concat([raw_df, other_df])

In [8]:
raw_df = raw_df.sample(frac = 1)

In [9]:
raw_df.to_csv(datasetInputFile)

In [10]:
raw_df['label'].value_counts()

label
technologie      100
sport            100
graphics         100
food             100
politics         100
other            100
business         100
entertainment    100
medical          100
historical       100
space            100
Name: count, dtype: int64

In [11]:
raw_df['text_len'] = raw_df['text'].str.len()

In [12]:
raw_df['text_len'].describe(percentiles = [0.25,0.5,0.75,0.9,0.99])

count     1100.000000
mean      3305.712727
std       4227.345263
min        108.000000
25%       1137.500000
50%       1726.500000
75%       3099.500000
90%      10238.600000
99%      17314.870000
max      51925.000000
Name: text_len, dtype: float64

In [13]:
def _reindent(t, n): 
    return '\n'.join('%s%s' % (' ' * n if ix > 0 else '', l) for ix, l in enumerate(t.split('\n')))


In [14]:
verbose = 1
# Reading the input file and preparing legend info
print('    Reading ... ', end ='')
df = pd.read_csv(datasetInputFile)
labels = df['label'].tolist()
texts = df['text'].tolist()
#
labelLegend = {v:k for k, v in enumerate(raw_df['label'].unique())}
labelLegendInverted = {'%i' % v: k for k,v in labelLegend.items()}
labelsAsInt = [labelLegend[x] for x in labels]
print('done')


# Tokenization of texts
print('    Tokenizing ... ', end ='')
MAX_NUM_WORDS = 500
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
print('done')
if verbose:
    print('        tokenizer.word_index             = %s +...' % str(dict(list(tokenizer.word_index.items())[:5])))
    inverseWordIndex = {v: k for k, v in tokenizer.word_index.items()}
    print('        inverseWordIndex                 = %s +...' % str(dict(list(inverseWordIndex.items())[:5])))
    print('        sequences[350]                   = %s' % str(sequences[350]))
    print('        [')
    print('            inverseWordIndex[i]')
    print('            for i in sequences[350]')
    print('        ]                                = %s' % (
        [inverseWordIndex[i] for i in sequences[350]]
    ))
    print('        texts[350]                       = "%s"' % texts[350])

# Padding of sequences
print('    Padding ... ', end ='')
MAX_SEQ_LENGTH = 2000
X = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH)
print('done')
if verbose:
    print('        [len(s) for s in sequences]      = %s + ...' % str([len(s) for s in sequences[:6]]))
    print('        len(sequences)                   = %s' % str(len(sequences)))
    print('        X.shape                          = %s' % str(X.shape))
    print('        type(X)                          = %s' % str(type(X)))
    print('        X[350]                           = ... + %s' % str(X[350][285:]))

# Switch to categorical form for labels
print('    Casting as categorical ... ', end ='')
labelsAsIntArray = np.asarray(labelsAsInt)
y = to_categorical(labelsAsIntArray)
print('done')
if verbose:
    print('        labelsAsIntArray.shape           = %s' % str(labelsAsIntArray.shape))
    print('        y.shape                          = %s' % str(y.shape))
    print('        y[:5]                            = %s' % _reindent(str(y[:5]),43))
    print('        labels[:5]                       = %s' % str(labels[:5]))
    print('        labelLegend                      = %s' % str(labelLegend))

print('    Splitting dataset ... ', end ='')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print('done')
if verbose:
    print('        X_train.shape = %s' % str(X_train.shape))
    print('        X_test.shape  = %s' % str(X_test.shape))
    print('        y_train.shape = %s' % str(y_train.shape))
    print('        y_test.shape  = %s' % str(y_test.shape))
    # Respectively: (5043, 300) (2485, 300) (5043, 2) (2485, 2)

print('    Saving ... ', end ='')

trainingData = {
    'X_train': X_train, 
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'max_words': MAX_NUM_WORDS,
    'max_seq_length': MAX_SEQ_LENGTH,
    'label_legend': labelLegend,
    'label_legend_inverted': labelLegendInverted, 
    'tokenizer': tokenizer,
}
with open(trainingDumpFile, 'wb') as f:
    pickle.dump(trainingData, f)
print('done')
if verbose:
    print('        Saved keys = %s' % '/'.join(sorted(trainingData.keys())))
#
print('FINISHED')

    Reading ... done
    Tokenizing ... done
        tokenizer.word_index             = {'the': 1, 'of': 2, 'to': 3, 'and': 4, 'in': 5} +...
        inverseWordIndex                 = {1: 'the', 2: 'of', 3: 'to', 4: 'and', 5: 'in'} +...
        sequences[350]                   = [22, 1, 88, 3, 136, 6, 5, 239, 137, 131, 263, 9, 6, 489, 37, 263, 49, 454, 37, 15, 47, 3, 238, 3, 15, 13, 306, 263, 223, 1, 2, 7, 263, 76, 9, 353, 18, 84, 66, 6, 131, 25, 13, 68, 95, 353, 5, 1, 197, 270, 141, 10, 4, 6, 269, 4, 8, 7, 3, 96, 4, 312, 75, 8, 3, 454, 75, 263, 30, 49, 46, 1, 255, 43, 1, 255, 149, 4, 13, 6, 10, 96, 75, 8, 84, 66, 1, 10, 2, 201, 9, 34, 79, 4, 1, 455, 129, 83, 8, 50, 98, 104, 1, 8, 142, 7, 4, 70, 11, 3, 6, 7, 3, 1, 149, 7, 242, 2, 372, 13, 79, 9, 2, 86, 7, 378, 6, 97, 8, 84, 6, 7, 91, 263, 3, 99, 10, 1, 4, 55, 8, 38, 327, 3, 14, 10, 33, 11, 6, 155, 149, 5, 75, 4, 75, 59, 55, 5, 3, 116, 416, 75, 391, 93, 14, 4, 75, 5, 47, 110, 16, 6, 131, 8, 3, 6, 4, 110, 16, 416, 7, 6, 130, 4, 38, 96, 7