In [1]:
import warnings
warnings.filterwarnings('ignore', category=UserWarning, append=True)

import numpy as np
np.random.seed(1988)

import torch
torch.manual_seed(1988)
torch.set_deterministic(True)

import random 
random.seed(1988) 

In [2]:
# Importar los archivos y crear 2 listas.
# reviews, con las lineas de texto
# labels,  con la correspondiente clase

with open('./raw/badqueries.unix', 'r', encoding='utf-8') as dataset_file:
    bads = dataset_file.readlines()
with open('./raw/goodqueries.unix', 'r', encoding='utf-8') as dataset_file:
    goods = dataset_file.readlines()
    
# Concateno las listas    
reviews = bads + goods
# Creo 2 listas con [1|0] de acuerdo al tamanio de cada dataset
labels  = ([1] * len(bads)) + ([0] * len(goods))

In [3]:
print("Cantidad de Registros Malos [{}]".format(len(bads)))
print("Cantidad de Registros Buenos [{}]".format(len(goods)))
print("Cantidad de Registros Registros Totales [{}]".format(len(reviews)))
print("Cantidad de Registros Labels Totales [{}]".format(len(labels)))

Cantidad de Registros Malos [48126]
Cantidad de Registros Buenos [1294531]
Cantidad de Registros Registros Totales [1342657]
Cantidad de Registros Labels Totales [1342657]


In [4]:
# Importamos los simbolos de puntuacion para poder eliminarlos de los registros
from string import punctuation
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [5]:
# Generamos un array con todas las palabras del dataset completo
# Se eliminan todos los posibles simbolos de puntuacion y se dejan 
# lo que son palabras / numeros
all_reviews=list()
for text in reviews:
  text = text.lower()
  for a in punctuation:
      text = text.replace(a, ' ')
  all_reviews.append(text)
all_text = " ".join(all_reviews)
all_words = all_text.split()

In [6]:
from collections import Counter 

# Vamos a contar todas las palabras que hay contando el metodo counter
count_words  = Counter(all_words)      # Array con el distinct de palabras
total_words  = len(all_words)          # Numero total de palabras en el datase
sorted_words = count_words.most_common(total_words) # Lista ordenadas por cantidad de ocurrencias de palabras unicas 

# Al final obtenemos un dict en donde el key es la palabra y el valor es numerador de palabra
# Esta lista la vamos a usar luego para reemplazar las palabras por su numero dentro de las 
# posibles palabras y usar los numeros en el tensor
vocab_to_int={w:i+1 for i,(w,c) in enumerate(sorted_words)}

In [7]:
print("Cantidad de Palabras Diferentes    [{}]".format(len(count_words)))
print("Cantidad de Palabras En el dataset [{}]".format(total_words))
from pprint import pprint

Cantidad de Palabras Diferentes    [780630]
Cantidad de Palabras En el dataset [3238625]


In [8]:
# El objetivo es pasar la lista inicial de palabras del dataset y reemplazar la palabra
# por su valor numerico de palabra del diccionario de transformacion
# Al final encoded_reviews va a tener una lista con las lineas del dataset encoded
# en formato numerico

encoded_reviews=list()
for review in all_reviews:
  encoded_review=list()
  for word in review.split():
    if word not in vocab_to_int.keys():
      encoded_review.append(0) # Si la palabra no existe poner 0 para que no de error
    else:
      encoded_review.append(vocab_to_int[word])
  encoded_reviews.append(encoded_review)

In [9]:
# Ahora vamos a crear un vector, con la cantidad constante de numeros para todas las lineas
# Esto porque este va a ser la entrada a la red y tiene que ser un numero constante

sequence_length=300
features=np.zeros((len(encoded_reviews), sequence_length), dtype=int)
for i, review in enumerate(encoded_reviews):
  review_len=len(review)
  if (review_len<=sequence_length):
    zeros=list(np.zeros(sequence_length-review_len))
    new=zeros+review
  else:
    new=review[:sequence_length]
  features[i,:]=np.array(new)

In [10]:
# Split los dataset, 80% para training y 20% para test
from sklearn.model_selection import train_test_split

train_x, valid_x, train_y, valid_y = train_test_split(features, labels, test_size=0.2, random_state=666)

# La distribucion de clases se tiene que mantener (96% Clase 0)
print("Numero de Elementos [{}] Numero de Clase 0 [{}] Perc [{}]".format(len(train_y), train_y.count(0), train_y.count(0)/len(train_y)))
print("Numero de Elementos [{}] Numero de Clase 0 [{}] Perc [{}]".format(len(valid_y), valid_y.count(0), valid_y.count(0)/len(valid_y)))

Numero de Elementos [1074125] Numero de Clase 0 [1035675] Perc [0.9642034213895031]
Numero de Elementos [268532] Numero de Clase 0 [258856] Perc [0.9639670504818793]


In [11]:
import torch
from torch.utils.data import DataLoader, TensorDataset

#create Tensor Dataset
train_data=TensorDataset(torch.FloatTensor(train_x), torch.FloatTensor(train_y))
valid_data=TensorDataset(torch.FloatTensor(valid_x), torch.FloatTensor(valid_y))

#dataloader
batch_size=32
trainloader=DataLoader(train_data, batch_size=batch_size, shuffle=True)
validloader=DataLoader(valid_data, batch_size=batch_size, shuffle=True)

In [12]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [13]:
class Classifier(torch.nn.Module):  
    def __init__(self, in_features, out_features):
        super(Classifier, self).__init__()
        self.fc = torch.nn.Linear(in_features, out_features)
                
    def forward(self, x):
        logits = self.fc(x)
        probs = F.relu(logits)  
        return probs, logits

In [14]:
classifier = Classifier(in_features = 300, out_features = 2)
print(classifier)

Classifier(
  (fc): Linear(in_features=300, out_features=2, bias=True)
)


In [15]:
import torch.optim as optim
optim = optim.SGD(params = classifier.parameters(), lr = 0.001)

In [16]:
from torch.utils.tensorboard import SummaryWriter

In [45]:
import torch.nn.functional as F
from tqdm import tqdm

writer = SummaryWriter()

iter     = 0
epochs   = 2

acc_list  = list()
loss_list = list()

for epoch in range(epochs):
    for inputs, labels in tqdm(trainloader):
        iter += 1
        
        classifier.train()
        optim.zero_grad()

        # Predict sentiment probabilities
        probs, logits = classifier(inputs)

        target  = torch.tensor(labels, requires_grad=True)
        loss    = (( probs.argmax(dim=1) -  target)**2).sum()
        
        preds   = probs.argmax(dim=1)
        targets = target
        
        accuracy = (preds == targets).sum().float()
        accuracy = 100 * (accuracy / batch_size)
        acc_list.append(accuracy)
        loss_list.append(loss)

        loss.backward()
        optim.step()

        writer.add_scalar('train/loss', loss,     iter )
        writer.add_scalar('train/acc',  accuracy, iter )     
    


100%|██████████| 33567/33567 [00:40<00:00, 830.76it/s]
100%|██████████| 33567/33567 [00:39<00:00, 860.24it/s]


In [46]:
val_iter = 0
acc_val = list()
loss_val = list()
classifier.eval()
for inputs, labels in tqdm(validloader):
    val_iter =+ 1

    probs, logits = classifier(inputs)
    loss = ((probs.argmax(dim=1) -  labels)**2).sum().float()

    preds   = probs.argmax(dim=1)
    targets = labels

    accuracy = preds.eq(targets).sum().float()
    accuracy = 100 * (accuracy / batch_size)

    # Log to tensorboard
    writer.add_scalar('val/loss', loss,     val_iter)
    writer.add_scalar('val/acc',  accuracy, val_iter)
    acc_val.append(accuracy)
    loss_val.append(loss)




100%|██████████| 8392/8392 [00:07<00:00, 1105.08it/s]


In [47]:
writer.close()

In [49]:
print("Accuracy en Validacion [{}]".format((sum(acc_val)/len(acc_val)).float()))

Accuracy en Validacion [94.30521392822266]


In [50]:
aaa = 0
for iter, (vectors, targets) in enumerate(trainloader):
    aaa = iter
print(aaa)

33566


In [51]:
epoch * len(trainloader) + aaa

67133