# Load Data

In [1]:
# Importing libraries
import collections
import pathlib
import re
import string

import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras import utils
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import tensorflow_datasets as tfds

## GLOBAL Variables

In [17]:
# Text vectorization settings: passed as `max_tokens` and `output_sequence_length`.
VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 250

# Dataset loading settings: passed as `batch_size` and `seed` when reading the text folders.
BATCH_SIZE = 32
SEED = 25

In [2]:
!pip3 install tensorflow-text



In [3]:
import tensorflow_text as tf_text

In [4]:
# Download the Stack Overflow archive and extract it.
data_url = 'https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz'
dataset = utils.get_file(
    fname='stack_overflow_16k.tar.gz',
    origin=data_url,
    untar=True,
    cache_dir='stack_overflow',
    cache_subdir='',
)
# The extracted train/ and test/ folders sit next to the path returned by get_file.
dataset_dir = pathlib.Path(dataset).parent

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz


In [5]:
# Show the entries of the dataset folder.
[entry for entry in dataset_dir.iterdir()]

[PosixPath('/tmp/.keras/README.md'),
 PosixPath('/tmp/.keras/test'),
 PosixPath('/tmp/.keras/train'),
 PosixPath('/tmp/.keras/stack_overflow_16k.tar.gz.tar.gz')]

In [6]:
# Training files live in the `train` sub-folder; list its entries (one folder per class).
train_dir = dataset_dir.joinpath('train')
[class_dir for class_dir in train_dir.iterdir()]

[PosixPath('/tmp/.keras/train/csharp'),
 PosixPath('/tmp/.keras/train/javascript'),
 PosixPath('/tmp/.keras/train/python'),
 PosixPath('/tmp/.keras/train/java')]

In [7]:
# Print one raw training example to see what the text looks like.
sample = train_dir.joinpath('python', '25.txt')
with sample.open() as f:
    print(f.read())

"error in cv::imshow i am trying to run this blank program but it gives an error..i don't understand what is the error and how to solve it...error:..error: ........opencvmoduleshighguisrcwindow.cpp:261: error: (-215) .    size.width&gt;0 &amp;&amp; size.height&gt;0 .in function cv::imshow...source code:..import cv2.import matplotlib.pyplot as plt..#original img.img = cv2.imread('1.jpeg').#gray img.img1 = cv2.imread('1.jpeg',0).#display img.cv2.imshow('img',img)..cv2.imshow('gray img',img1)..#view image size or shape.print (img.shape).print(img1.size)..#number of pixels.print(img.size).print(img1.size).#graph ."""""".x1=[6,2,4,3].x2=[2,3,4,5].plt.scatter(x1,x2).plt.show().""""""..#write an image.cv2.imwrite('gray_image.jpeg',img1)..#access specific index.print(img[150,150])..#modify the pixel value.img[150,150] = 30.print (img[150,150])..#crop image.crop = img[100:150,100:150]..cv2.imshow('cropped image',crop)..#img will display till press the enter.cv2.waitkey(0)...the error is in line

In [9]:
# Training subset: 80% of the files under train_dir (20% is reserved via validation_split).
raw_train_ds = preprocessing.text_dataset_from_directory(
    directory=train_dir,
    subset='training',
    validation_split=0.2,
    seed=SEED,
    batch_size=BATCH_SIZE,
)

Found 8000 files belonging to 4 classes.
Using 6400 files for training.


In [10]:
# Peek at up to ten examples from a single training batch.
for text_batch, label_batch in raw_train_ds.take(1):
    # Convert each tensor once instead of on every loop iteration.
    questions = text_batch.numpy()
    labels = label_batch.numpy()
    # Slicing (rather than indexing 0..9) stays safe if the batch holds fewer than ten items.
    for question, label in zip(questions[:10], labels[:10]):
        print("Question: ", question[:100], '...')
        print("Label: ", label)

Question:  b'"blank only - sort a bunch of divs .  possible duplicate:.  easiest way to sort dom nodes?  .....i h' ...
Label:  2
Question:  b'blank- document/window has focus? .  possible duplicate:.  how do i find out which blank element has' ...
Label:  2
Question:  b'"making a pyramide trying to make a pyramide here, but i\'m new to blank and a little stuck...the pyr' ...
Label:  1
Question:  b'"is this creating 12 string objects? i\'m trying to understand if this code below creates 12 objects ' ...
Label:  1
Question:  b'"tabs n in list for blank i have simple script in blank, want return per line the values..tabs = # a' ...
Label:  3
Question:  b'"typeerror: \'dict_values\' object does not support indexing while calling layer from combo box- pyqgi' ...
Label:  3
Question:  b'"blank- have to model a circle okay so i don\'t know if any of you like helping homework at all, but ' ...
Label:  1
Question:  b'"how can i pass function\'s return value as parameter? i have lots of functions

In [11]:
# Show which class name each integer label stands for.
for index, class_name in enumerate(raw_train_ds.class_names):
    print("Label: {} \tCorresponds to: {}".format(index, class_name))

Label: 0 	Corresponds to: csharp
Label: 1 	Corresponds to: java
Label: 2 	Corresponds to: javascript
Label: 3 	Corresponds to: python


In [12]:
# Validation subset: same folder, split fraction and seed as the training subset,
# but selecting the 'validation' part.
raw_val_ds = preprocessing.text_dataset_from_directory(
    directory=train_dir,
    subset='validation',
    validation_split=0.2,
    seed=SEED,
    batch_size=BATCH_SIZE,
)

Found 8000 files belonging to 4 classes.
Using 1600 files for validation.


In [13]:
# The test data is read from its own folder, without any validation split.
test_dir = dataset_dir.joinpath('test')
raw_test_ds = preprocessing.text_dataset_from_directory(
    directory=test_dir,
    batch_size=BATCH_SIZE,
)

Found 8000 files belonging to 4 classes.


## Training Segment

In [16]:
# Vectorizer in 'binary' output mode, limited to VOCAB_SIZE tokens.
binary_vectorize_layer = TextVectorization(
    output_mode='binary',
    max_tokens=VOCAB_SIZE,
)

In [18]:
# Vectorizer in 'int' output mode: token ids, with a fixed output length of
# MAX_SEQUENCE_LENGTH and a vocabulary limited to VOCAB_SIZE tokens.
int_vectorize_layer = TextVectorization(
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH,
    max_tokens=VOCAB_SIZE,
)

In [19]:
# Fit both vectorizers on the training text only (the labels are dropped first).
train_text = raw_train_ds.map(lambda text, labels: text)
for vectorize_layer in (binary_vectorize_layer, int_vectorize_layer):
    vectorize_layer.adapt(train_text)

In [20]:
def binary_vectorize_text(text, label):
    """Run `text` through the binary vectorizer and pass `label` through unchanged."""
    # Append a trailing axis before handing the text to the layer.
    expanded = tf.expand_dims(text, axis=-1)
    vectorized = binary_vectorize_layer(expanded)
    return vectorized, label

In [21]:
def int_vectorize_text(text, label):
    """Run `text` through the int vectorizer and pass `label` through unchanged."""
    # Append a trailing axis before handing the text to the layer.
    expanded = tf.expand_dims(text, axis=-1)
    vectorized = int_vectorize_layer(expanded)
    return vectorized, label

In [24]:
# Pull one batch from the training set and keep its first example.
text_batch, label_batch = next(iter(raw_train_ds))
first_question = text_batch[0]
first_label = label_batch[0]

print("Question: {}".format(first_question))
print("\nLabel: {}".format(first_label))

Question: b'"blank comparing parentheses from user input and making sure they have a pair i\'m writing a program to take a user input of parentheses i.e. {} [] () and checking to see if they have a pair (opening and closing). i\'m running into an error when running my code where i always get the return false. i\'ve tried different ways of checking against a pre set ""list"" but it doesn\'t seem to work. i have to use the class from above too. any help is appreciated. ..some example inputs are:..    &gt;&gt;&gt;parenthesesmatch(\'{[]}\').    true.    &gt;&gt;&gt;parenthesesmatch(\'({})\').    true.    &gt;&gt;&gt;parenthesesmatch(\'{[)]}\').    false...my code:..    #george flamburis...class stack():.def __init__(self,que=[]):.    self.lst = que.def __repr__(self):.    return ""stack({})"".format(self.lst).def push(self, add):.    self.lst.append(add).def pop(self):.    return self.lst.pop().def isempty(self):.    return self.lst==[].def first(self, loc=0):            #naming of this me

In [25]:
# Show the first example after 'binary' vectorization (the label is discarded).
binary_encoded = binary_vectorize_text(first_question, first_label)[0]
print("'binary' vectorized question:", binary_encoded)

'binary' vectorized question: tf.Tensor([[1. 1. 1. ... 0. 0. 0.]], shape=(1, 10000), dtype=float32)


In [26]:
# Show the first example after 'int' vectorization (the label is discarded).
int_encoded = int_vectorize_text(first_question, first_label)[0]
print("'int' vectorized question:", int_encoded)

'int' vectorized question: tf.Tensor(
[[  16 1091 3851   32   99   91    8  462  223  195   17    5 1312   51
   408    5   87    4  353    5   99   91    9 3851  421    8  790    4
   184   11  195   17    5 1312 1805    8 2015   51  292   97   31   67
    46  292   23   29  132    3  343   41    2   26  106  193  145  182
   948    9  790 1563    5 6301  103   59   27   10  185  495    4  137
     3   17    4   71    2   30   32  251  433   76  107    6  562   85
   142  762   61    1   89    1   89    1    1   29 3658    1    1    1
     1    1 4989   26    1    1  122    1    1   26    1    1   26    1
     1    1 5068    9   13   64  166   33   26    1 7651   26    1    1
   176  837  206   12  283    7  653   11  283    7    1   53    1   11
     1   12  344    7    1   11    1    8    1   21    7   45   45   26
   106   26   89  287    1   26  106    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0

In [27]:
# Look up two token ids in the learned vocabulary and report its size.
vocabulary = int_vectorize_layer.get_vocabulary()
print("1289 ---> ", vocabulary[1289])
print("313 ---> ", vocabulary[313])
print("Vocabulary size: {}".format(len(vocabulary)))

1289 --->  report
313 --->  put
Vocabulary size: 10000


In [28]:
# Vectorize every split (train / validation / test) with each of the two encoders.
binary_train_ds, binary_val_ds, binary_test_ds = (
    split.map(binary_vectorize_text)
    for split in (raw_train_ds, raw_val_ds, raw_test_ds)
)

int_train_ds, int_val_ds, int_test_ds = (
    split.map(int_vectorize_text)
    for split in (raw_train_ds, raw_val_ds, raw_test_ds)
)

In [29]:
# Sentinel passed to prefetch() as its buffer size.
AUTOTUNE = tf.data.experimental.AUTOTUNE

def configure_dataset(dataset):
  """Return `dataset` with caching enabled, followed by prefetching sized by AUTOTUNE."""
  cached = dataset.cache()
  return cached.prefetch(buffer_size=AUTOTUNE)

In [30]:
# Add caching and prefetching to every vectorized split.
binary_train_ds, binary_val_ds, binary_test_ds = (
    configure_dataset(split)
    for split in (binary_train_ds, binary_val_ds, binary_test_ds)
)

int_train_ds, int_val_ds, int_test_ds = (
    configure_dataset(split)
    for split in (int_train_ds, int_val_ds, int_test_ds)
)

## Generating model

In [31]:
# Linear classifier: a single dense layer with four output units.
binary_model = tf.keras.Sequential(
    [
        layers.Dense(4),
    ]
)

In [32]:
# The loss is configured with from_logits=True because the model has no output activation.
binary_model.compile(
    optimizer='adam',
    metrics=['accuracy'],
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
)

In [34]:
# Train the linear model for 25 epochs, validating after each one.
history = binary_model.fit(
    binary_train_ds,
    epochs=25,
    validation_data=binary_val_ds,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


## Creating a model for the `int` vectorized data

In [37]:
def create_model(vocab_size, num_labels):
    """Build an (uncompiled) embedding + 1D-convolution text classifier.

    Args:
        vocab_size: size of the embedding table (number of distinct token ids).
        num_labels: number of units in the final dense layer.

    Returns:
        A `tf.keras.Sequential` model.
    """
    layer_stack = [
        # Token ids -> 64-dimensional vectors; id 0 is treated as a mask value.
        layers.Embedding(vocab_size, 64, mask_zero=True),
        layers.Conv1D(64, 5, padding='valid', activation='relu', strides=2),
        # Collapse the sequence axis by taking the per-channel maximum.
        layers.GlobalMaxPool1D(),
        layers.Dense(num_labels),
    ]
    return tf.keras.Sequential(layer_stack)

In [38]:
# NOTE(review): the extra +1 on the vocabulary size presumably leaves room for the
# padding index — confirm against the vectorizer's vocabulary.
int_model = create_model(
    num_labels=4,
    vocab_size=VOCAB_SIZE + 1,
)

In [39]:
# Same training configuration as the linear model: the loss is told to expect logits.
int_model.compile(
    optimizer='adam',
    metrics=['accuracy'],
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
)

In [40]:
# Train the convolutional model for 10 epochs, validating after each one.
history = int_model.fit(
    int_train_ds,
    epochs=10,
    validation_data=int_val_ds,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Binary vs. Int model

In [41]:
print("Linear model on binary vectorized data:")
# summary() prints the table itself and returns None, so wrapping it in print()
# would emit a stray "None" line after the table.
binary_model.summary()

Linear model on binary vectorized data:
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 4)                 40004     
Total params: 40,004
Trainable params: 40,004
Non-trainable params: 0
_________________________________________________________________
None


In [42]:
print("ConvNet model on int vectorized data:")
# summary() prints the table itself and returns None, so wrapping it in print()
# would emit a stray "None" line after the table.
int_model.summary()

ConvNet model on int vectorized data:
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          640064    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 64)          20544     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 260       
Total params: 660,868
Trainable params: 660,868
Non-trainable params: 0
_________________________________________________________________
None


In [43]:
# Score both models on the test split and report their accuracies.
binary_loss, binary_accuracy = binary_model.evaluate(binary_test_ds)
int_loss, int_accuracy = int_model.evaluate(int_test_ds)

for report_template, accuracy in (
    ("Binary model accuracy: {:2.2%}", binary_accuracy),
    ("Int model accuracy: {:2.2%}", int_accuracy),
):
    print(report_template.format(accuracy))

Binary model accuracy: 79.91%
Int model accuracy: 81.64%


In [44]:
# End-to-end model: raw strings -> binary vectorizer -> trained classifier -> scores.
# The classifier emits one logit per class for a single-label, multi-class problem,
# so softmax (not sigmoid) is the activation that turns the logits into a
# probability distribution. The arg-max class is the same either way.
export_model = tf.keras.Sequential([
    binary_vectorize_layer,
    binary_model,
    layers.Activation('softmax')
])

In [45]:
# The exported model ends in an activation layer, so the loss is told that its
# inputs are not logits.
export_model.compile(
    optimizer='adam',
    metrics=['accuracy'],
    loss=losses.SparseCategoricalCrossentropy(from_logits=False),
)

In [48]:
# Evaluate the end-to-end model directly on the raw (string) test split.
export_metrics = export_model.evaluate(raw_test_ds)
loss, acc = export_metrics
print("Accuracy: {:2.2%}".format(acc))

Accuracy: 79.91%


In [49]:
# Helper that turns per-class scores into class names.
def get_string_labels(predicted_scores_batch):
    """Return, for each row of scores, the class name with the highest score.

    Class names are looked up in `raw_train_ds.class_names`.
    """
    best_class_ids = tf.argmax(predicted_scores_batch, axis=1)
    return tf.gather(raw_train_ds.class_names, best_class_ids)

In [50]:
# Two hand-written questions; the trailing comment gives the expected language.
inputs = [
    "how do I extract keys from a dict into a list?",  # python
    "debug public static void main(string[] args) {...}",  # java
]
predicted_scores = export_model.predict(inputs)
predicted_labels = get_string_labels(predicted_scores)
# The loop variable is named `question` so the built-in `input()` is not shadowed
# in the kernel namespace after this cell runs.
for question, label in zip(inputs, predicted_labels):
    print("Question: ", question)
    print("Predicted label: ", label.numpy())

Question:  how do I extract keys from a dict into a list?
Predicted label:  b'python'
Question:  debug public static void main(string[] args) {...}
Predicted label:  b'java'


In [62]:
# Three questions copied from Stack Overflow to classify; the comment after each
# sample links to its source page.
input_text = [
    """I am trying to copy byte[] data into a float[] like this:

float[] floatArr = new float[int32.MaxValue];
byte[] byteArr = new byte[bufferSize];
// fill buffer
Buffer.BlockCopy(byteArr, 0, floatArr, 0, bufferSize);
This works fine, because I am copying it to the start of the destination array. But how can I copy the data to an index greater than int32.MaxValue?""", 
    # source: https://stackoverflow.com/questions/65482724/how-to-copy-byte-data-into-a-very-large-float
    """I am trying to rewrite some old vb6 code using C-sharp. The problem is that when I used FloodFill in vb it saves the image with the affect of FloodFill. This is not true using C-sharp. Here is the code segment for VB6:""",
    # source: https://stackoverflow.com/questions/65451723/floodfill-using-c-sharp
    """I am creating a math game and want a score count in the top corner.

I have created the labels:

score = 0

score_addition_easy_label = Label(root7, text="Score count: ")
score_addition_easy_label.place(x=25, y=100)

score_addition_easy_number = Label(root7, text=score)
score_addition_easy_number.place(x=120, y=100)
when the code is run it displays:""", 
    # source: https://stackoverflow.com/questions/65482933/how-to-change-the-text-of-a-label-tkinter
]

# Score the raw strings with the end-to-end model, then map scores to class names.
pred = export_model.predict(input_text)
pred_l = get_string_labels(pred)

print("Predicted label: {}".format(pred_l))

Predicted label: [b'csharp' b'csharp' b'python']


### [Burada Kaldık](https://www.tensorflow.org/tutorials/load_data/text#example_2_predict_the_author_of_illiad_translations)
