# A. Data Preprocessing
 - **INPUT**: Texts of two classes, _TECH_ and _NON-TECH_, already tokenized/word-segmented with VnCoreNLP (a Java toolkit)
 - **STEPS**:
   - Remove digits
   - Remove stopwords
   - Remove special/"weird" patterns

In [5]:
# Load the tokenized texts of both classes into two Python lists
# (one entry per file, whatever utilities.read_file returns for that file).
from utilities import read_file
import os


# get file names in TECH folder
# os.listdir returns entries in arbitrary order; sorting keeps the document
# order (and therefore the seeded train/test split later in this notebook)
# reproducible across machines and file systems.
indir = 'data/tokenized/TECH/'
file_names = sorted(os.listdir(indir))

tech_texts = list()
for file_name in file_names:
    file = os.path.join(indir, file_name)
    tech_texts.append(read_file(file))


# get file names in NON-TECH folder (same deterministic ordering as above)
indir = 'data/tokenized/NON-TECH/'
file_names = sorted(os.listdir(indir))

non_tech_texts = list()
for file_name in file_names:
    file = os.path.join(indir, file_name)
    non_tech_texts.append(read_file(file))

### 1. Define stopwords list using "vietnamese-stopwords-dash.txt"

In [2]:
filename = 'vietnamese-stopwords-dash.txt'
## consider punctuation marks and one-character words as stopwords to remove
stopwords = list('!@#$%^&*()_+-=<>?,./:\'''“”;abcdefghijklmnopqrstuvwxyz')
# Read the word list inside a context manager so the handle is always closed,
# and decode explicitly instead of relying on the platform default encoding
# (the Vietnamese word list is assumed to be UTF-8 -- confirm if it is not).
# File entries come first, followed by the single-character stopwords.
with open(filename, encoding='utf-8') as stopword_file:
    stopwords = stopword_file.read().split('\n') + stopwords

Remove digits, unusual ("weird") patterns, and stopwords using the helper function `preprocess_tokenized_text` defined in **utilities.py**.

In [10]:
from utilities import preprocess_tokenized_text

# Run every TECH document through preprocess_tokenized_text together with the
# shared stopword list; the result keeps one entry per input document.
tech_tokens = [
    preprocess_tokenized_text(raw_text, stopwords)
    for raw_text in tech_texts
]


# Same treatment for the NON-TECH documents.
non_tech_tokens = [
    preprocess_tokenized_text(raw_text, stopwords)
    for raw_text in non_tech_texts
]

### 2. Prepare a classification dataframe

In [19]:
import pandas as pd
import numpy as np


# One row per document: the raw text, its preprocessed tokens, and an integer
# class label (1 for the TECH frame, 0 for the NON-TECH frame).
# The label vectors are sized from the row count with an explicit integer
# dtype. np.ones_like / np.zeros_like would instead copy the shape and dtype
# of the token data itself, which is not a 1-D numeric vector.
df_tech = pd.DataFrame({
    "text": tech_texts,
    'token': tech_tokens,
    'target': np.ones(len(tech_tokens), dtype=int),
})

df_non_tech = pd.DataFrame({
    "text": non_tech_texts,
    'token': non_tech_tokens,
    'target': np.zeros(len(non_tech_tokens), dtype=int),
})

# Stack both classes into one frame with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
df = pd.concat([df_tech, df_non_tech], ignore_index=True)


# save file
from utilities import save_pkl
outfile = 'data/clf_data.pkl'
save_pkl(df, outfile)

# B. Train / Test Split

In [20]:
from sklearn.model_selection import train_test_split

# Labels, and the two feature columns (raw text, tokens), as NumPy arrays.
y = np.array(df['target'])
X = np.array(df[['text', 'token']])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

In [None]:
# Training split -> data/data_train.pkl
# Column 0 of X holds the raw text and column 1 the tokens, matching the
# ['text', 'token'] column order used to build X.
df_train = pd.DataFrame(columns=['target', 'text', 'tokens'])
df_train['target'] = y_train
df_train['text'] = X_train[:, 0]
df_train['tokens'] = X_train[:, 1]

outfile = 'data/data_train.pkl'
save_pkl(df_train, outfile)


# Test split -> data/data_test.pkl (same column layout as the training frame)
df_test = pd.DataFrame(columns=['target', 'text', 'tokens'])
df_test['target'] = y_test
df_test['text'] = X_test[:, 0]
df_test['tokens'] = X_test[:, 1]

outfile = 'data/data_test.pkl'
save_pkl(df_test, outfile)