<a href="https://colab.research.google.com/github/thaCripple/Simple-NNs/blob/main/tf%20MulticlassNLP%20with%20sklearn%20preprocessing/Multiclass_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multiclass NLP Classification of Stackoverflow questions

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras

In [None]:
import os

# Download the Data

In [None]:
data_url = "https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz"

In [None]:
data_path = keras.utils.get_file(
    cache_dir="/content/questions",
    origin=data_url,
    extract=True)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz
[1m6053168/6053168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
data_path

'/content/questions/datasets/stack_overflow_16k.tar.gz'

In [None]:
os.listdir(data_path)

['README.md', 'train', 'test']

## Try to read files into a DF

Read csharp training questions

In [None]:
csharp_train = []
for txt in os.listdir(f"{data_path}/train/csharp"):
  txt_path = f"{data_path}/train/csharp/{txt}"
  with open(txt_path, "r") as question:
    csharp_train.append(question.read())

In [None]:
len(csharp_train)

2000

Read java training questions

In [None]:
java_train = []
for txt in os.listdir(f"{data_path}/train/java"):
  txt_path = f"{data_path}/train/java/{txt}"
  with open(txt_path, "r") as question:
    java_train.append(question.read())

In [148]:
len(java_train)

2000

Read javascript training questions

In [None]:
javascript_train = []
for txt in os.listdir(f"{data_path}/train/javascript"):
  txt_path = f"{data_path}/train/javascript/{txt}"
  with open(txt_path, "r") as question:
    javascript_train.append(question.read())

In [None]:
len(javascript_train)

2000

Read python training questions

In [None]:
python_train = []
for txt in os.listdir(f"{data_path}/train/python"):
  txt_path = f"{data_path}/train/python/{txt}"
  with open(txt_path, "r") as question:
    python_train.append(question.read())

In [None]:
len(python_train)

2000

## Read test questions

Read csharp test questions

In [None]:
csharp_test = []
for txt in os.listdir(f"{data_path}/test/csharp"):
  txt_path = f"{data_path}/test/csharp/{txt}"
  with open(txt_path, "r") as question:
    csharp_test.append(question.read())

In [None]:
len(csharp_test)

2000

Read java test questions

In [None]:
java_test = []
for txt in os.listdir(f"{data_path}/test/java"):
  txt_path = f"{data_path}/test/java/{txt}"
  with open(txt_path, "r") as question:
    java_test.append(question.read())

In [None]:
len(java_test)

2000

Read javascript test questions

In [None]:
javascript_test = []
for txt in os.listdir(f"{data_path}/test/javascript"):
  txt_path = f"{data_path}/test/javascript/{txt}"
  with open(txt_path, "r") as question:
    javascript_test.append(question.read())

In [None]:
len(javascript_test)

2000

Read python test questions

In [None]:
python_test = []
for txt in os.listdir(f"{data_path}/test/python"):
  txt_path = f"{data_path}/test/python/{txt}"
  with open(txt_path, "r") as question:
    python_test.append(question.read())

In [None]:
len(python_test)

2000

## Create Dataframes

### Training

In [None]:
csharp_train_df = pd.DataFrame(data=csharp_train, columns=["question"])

In [None]:
csharp_train_df["language"] = "csharp"

In [None]:
java_train_df = pd.DataFrame(data=java_train, columns=["question"])

In [None]:
java_train_df["language"] = "java"

In [None]:
javascript_train_df = pd.DataFrame(data=javascript_train, columns=["question"])

In [None]:
javascript_train_df["language"] = "javascript"

In [None]:
python_train_df = pd.DataFrame(data=python_train, columns=["question"])

In [None]:
python_train_df["language"] = "python"

Training df

In [None]:
train_df = pd.concat([csharp_train_df, java_train_df, javascript_train_df, python_train_df], axis=0)

In [None]:
train_df = train_df.astype({"language": "category"})

In [None]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   question  8000 non-null   object  
 1   language  8000 non-null   category
dtypes: category(1), object(1)
memory usage: 133.0+ KB


### Testing

In [None]:
csharp_test_df = pd.DataFrame(data=csharp_test, columns=["question"])

In [None]:
csharp_test_df["language"] = "csharp"

In [None]:
java_test_df = pd.DataFrame(data=java_test, columns=["question"])

In [None]:
java_test_df["language"] = "java"

In [None]:
javascript_test_df = pd.DataFrame(data=javascript_test, columns=["question"])

In [None]:
javascript_test_df["language"] = "javascript"

In [None]:
python_test_df = pd.DataFrame(data=python_test, columns=["question"])

In [None]:
python_test_df["language"] = "python"

Test df

In [None]:
test_df = pd.concat([csharp_test_df, java_test_df, javascript_test_df, python_test_df], axis=0)

In [None]:
test_df = test_df.astype({"language": "category"})

In [None]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   question  8000 non-null   object  
 1   language  8000 non-null   category
dtypes: category(1), object(1)
memory usage: 133.0+ KB


# Data Analysis

## Calculate the number of samples/number of words per sample ratio

Words per sample

In [None]:
def count_words(question):
  num_words = len(question.split())
  return num_words

In [None]:
train_df["n_words"] = train_df['question'].apply(func=count_words)

In [None]:
print(f"The Ratio equals: {train_df.shape[0]/train_df['n_words'].mean()}")

The Ratio equals: 58.36992854243905


# Data Preprocessing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [195]:
questions_raw = train_df['question'].copy()
labels_raw = train_df['language'].copy()

In [None]:
lbl_encoder = LabelEncoder()

In [None]:
labels_df = lbl_encoder.fit_transform(labels_raw)

In [None]:
X_train_raw, X_val_raw, y_train, y_val = train_test_split(questions_raw, labels_df, test_size=0.2, random_state=1)

In [None]:
vectorizer = TfidfVectorizer(
    ngram_range=(1,2),
    strip_accents='unicode',
    decode_error='strict',
    min_df=2
)

In [None]:
selector = SelectKBest(f_classif, k=20000)

### Create a Feature Preprocessing Pipeline

In [None]:
feat_pipe1 = Pipeline(steps=[
    ("tf_idf", vectorizer),
    ("feature_selector", selector)
])

In [None]:
X_train_raw.head()

Unnamed: 0,question
375,"""how to detect change in hour of system time i..."
255,"""cannot implicitly convert type 'newtonsoft.js..."
1670,"""keep asking for a number until a even number ..."
1882,"""how to store and retrieve 4 sbyte values in a..."
1946,"""blank finding the largest integer in an array..."


Preprocess the features

In [None]:
X_train = feat_pipe1.fit_transform(X=X_train_raw, y=y_train).astype('float32')

In [None]:
X_val = feat_pipe1.transform(X=X_val_raw).astype('float32')

## Create datasets

We got a sparse matrix from our tf-idf vectorizer and now need to convert it into a sparse tensor before creating a dataset

In [154]:
import scipy

In [182]:
def create_sparse_tensor(sprs_feats):
  """
  Convert a tf-idf vectorized sparse array of features into a sparse tensor suitable for
  the tf.Dataset.from_slices constructor
  """

  idx, idy, vals = scipy.sparse.find(sprs_feats) # Row, Col and values from the sparse array
  inds = tf.stack([idx, idy], axis=1) # Find indicies for tf sparse tensor
  inds = tf.cast(inds, tf.int64)
  shape = sprs_feats.shape # Sparse array shape
  sparse_tensor = tf.sparse.SparseTensor(indices=inds, values=vals, dense_shape=shape)

  return sparse_tensor

In [183]:
X_train_tens = create_sparse_tensor(X_train)

In [185]:
X_val_tens = create_sparse_tensor(X_val)

Create the training and validation datasets

In [186]:
train_ds = tf.data.Dataset.from_tensor_slices((X_train_tens, y_train))

In [187]:
val_ds = tf.data.Dataset.from_tensor_slices((X_val_tens, y_val))

In [None]:
# Data was already shuffled by the sklearn.train_test_split

In [188]:
train_ds = train_ds.batch(batch_size=64).cache().prefetch(buffer_size=tf.data.AUTOTUNE)
val_ds = val_ds.batch(batch_size=64).cache().prefetch(buffer_size=tf.data.AUTOTUNE)

# Create a Simple Model

In [207]:
model1 = keras.Sequential(name="Model1")
model1.add(layer=keras.layers.Dropout(rate=.2, name="Dropout1"))
model1.add(layer=keras.layers.Dense(units=64, activation="relu", name="Dense1"))
model1.add(layer=keras.layers.Dense(units=32, activation="relu", name="Dense2"))
model1.add(layer=keras.layers.Dense(units=4, name="Output")) # Deliberately omitting the activation function in the output layer to be able to use logits for loss calculations

model1.compile(
    optimizer=keras.optimizers.Adam(learning_rate=.001),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"])

In [208]:
model1.summary()

In [209]:
model1_hist = model1.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=2
)

Epoch 1/2
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 37ms/step - accuracy: 0.5135 - loss: 1.2709 - val_accuracy: 0.8400 - val_loss: 0.6581
Epoch 2/2
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 43ms/step - accuracy: 0.9063 - loss: 0.4681 - val_accuracy: 0.8612 - val_loss: 0.4016


# Now let's test the data pipeline on the testing subset

### Preprocessing

We start with a simple dataframe

In [193]:
test_df.head()

Unnamed: 0,question,language
0,"""is it possible to move the caret to the first...",csharp
1,"""how to activate different sections of code in...",csharp
2,application crash my application uses twain dl...,csharp
3,"""database controller: login i am very new to b...",csharp
4,"""how to calculate a persons age at a given dat...",csharp


In [194]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   question  8000 non-null   object  
 1   language  8000 non-null   category
dtypes: category(1), object(1)
memory usage: 133.0+ KB


Separate features and labels

In [196]:
x_test_raw = test_df['question'].copy()
y_test_raw = test_df['language'].copy()

Encode the labels

In [198]:
y_test = lbl_encoder.transform(y_test_raw)

In [199]:
y_test

array([0, 0, 0, ..., 3, 3, 3])

Preprocess the features

Hopefully without any data leakage

In [200]:
x_test = feat_pipe1.transform(x_test_raw)

### Dataset creation

In [201]:
x_test_tens = create_sparse_tensor(x_test)

In [202]:
test_ds = tf.data.Dataset.from_tensor_slices((x_test_tens, y_test))

In [203]:
test_ds = test_ds.batch(batch_size=64).cache().prefetch(buffer_size=tf.data.AUTOTUNE)

In [204]:
test_ds

<_PrefetchDataset element_spec=(SparseTensorSpec(TensorShape([None, 20000]), tf.float64), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

# Evaluate the model

Just to test the data honestly

In [210]:
model1.evaluate(x=test_ds)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8605 - loss: 0.3992


[0.3679424226284027, 0.8769999742507935]

# Conclusions

* My apprehension towards using a different library for preprocessing and thus decoupling it from the built-in tensorflow workflow has proven ill placed
* In the future encapsulate the entire data preprocessing stage into a single function
* In the future work on more advanced and complex preprocessing steps without worry
* Reading data from files, web etc. works well in Colab; It's worth from now on to create nicer data reading steps than here